[AArch64] Handle more SVE predicate constants
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 27a9a5fd28dc0a87fa44699da4a5aeed878d9a0f
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
77 /* This file should be included last. */
78 #include "target-def.h"
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
103 union
105 /* For MOV and MVN. */
106 struct
108 /* The value of each element. */
109 rtx value;
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
117 /* For INDEX. */
118 struct
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
162 u.index.base = base_in;
163 u.index.step = step_in;
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
173 u.pattern = pattern_in;
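/* Illustrative sketch, not part of the original file: the PTRUE form above
   is how an SVE predicate constant can be described for later printing.
   A validation routine (for example aarch64_simd_valid_immediate) might
   record an all-true predicate over byte elements like this; the variable
   name "info" and the exact call site are assumptions.  */
#if 0
  if (info)
    *info = simd_immediate_info (QImode, AARCH64_SV_ALL);
#endif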
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
231 const char* name;
232 unsigned int flag;
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
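/* Expansion sketch (illustration only): assuming aarch64-fusion-pairs.def
   contains an entry such as AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD),
   the #include above contributes { "adrp+add", AARCH64_FUSE_ADRP_ADD } to
   the table, so each fusion name accepted on the command line maps to its
   internal AARCH64_FUSE_* flag.  */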
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 /* Tuning parameters. */
257 static const struct cpu_addrcost_table generic_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
353 static const struct cpu_regmove_cost generic_regmove_cost =
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
499 static const struct cpu_vector_cost tsv110_vector_cost =
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
538 static const struct cpu_vector_cost exynosm1_vector_cost =
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
600 1, /* Predictable. */
601 3 /* Unpredictable. */
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
717 static const struct tune_params generic_tunings =
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
743 static const struct tune_params cortexa35_tunings =
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
770 static const struct tune_params cortexa53_tunings =
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
797 static const struct tune_params cortexa57_tunings =
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
824 static const struct tune_params cortexa72_tunings =
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
851 static const struct tune_params cortexa73_tunings =
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
880 static const struct tune_params exynosm1_tunings =
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
906 static const struct tune_params thunderxt88_tunings =
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
932 static const struct tune_params thunderx_tunings =
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
959 static const struct tune_params tsv110_tunings =
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
986 static const struct tune_params xgene1_tunings =
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1012 static const struct tune_params emag_tunings =
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1038 static const struct tune_params qdf24xx_tunings =
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1094 static const struct tune_params thunderx2t99_tunings =
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1121 static const struct tune_params neoversen1_tunings =
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1167 /* A processor implementing AArch64. */
1168 struct processor
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1232 typedef enum aarch64_cond_code
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 aarch64_cc;
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
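/* Worked example (illustration only): the codes above are ordered so that
   each condition and its inverse differ only in bit 0, which is what the
   XOR in the macro relies on:
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_HI) == AARCH64_LS  */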
1242 struct aarch64_branch_protect_type
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251      * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1255 own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1272 return AARCH64_PARSE_OK;
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1286 return AARCH64_PARSE_OK;
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 /* Return the assembly token for svpattern value VALUE. */
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1356 switch (pattern)
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1364 gcc_unreachable ();
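/* Illustrative sketch, not part of the original file: each CASE expansion
   above has the shape 'case AARCH64_SV_ALL: return "all";', so the
   function maps an svpattern value to its assembly mnemonic, e.g.:  */
#if 0
  const char *tok = svpattern_token (AARCH64_SV_ALL);	/* "all" */
#endif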
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
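/* Illustrative sketch, not part of the original file: an insn output
   template might use this to turn a short-range conditional branch into
   an inverted branch over an unconditional "b" to the far destination.
   The operand position, label prefix and mnemonic below are hypothetical.  */
#if 0
  /* Far variant of "cbz x0, <label>":  */
  return aarch64_gen_far_branch (operands, 1, "Lfb", "cbnz\t%x0, ");
#endif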
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1429 machine_mode mode;
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1486 static bool
1487 aarch64_advsimd_struct_mode_p (machine_mode mode)
1489 return (TARGET_SIMD
1490 && (mode == OImode || mode == CImode || mode == XImode));
1493 /* Return true if MODE is an SVE predicate mode. */
1494 static bool
1495 aarch64_sve_pred_mode_p (machine_mode mode)
1497 return (TARGET_SVE
1498 && (mode == VNx16BImode
1499 || mode == VNx8BImode
1500 || mode == VNx4BImode
1501 || mode == VNx2BImode));
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD = 1;
1506 const unsigned int VEC_SVE_DATA = 2;
1507 const unsigned int VEC_SVE_PRED = 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT = 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1513 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
1517 static unsigned int
1518 aarch64_classify_vector_mode (machine_mode mode)
1520 if (aarch64_advsimd_struct_mode_p (mode))
1521 return VEC_ADVSIMD | VEC_STRUCT;
1523 if (aarch64_sve_pred_mode_p (mode))
1524 return VEC_SVE_PRED;
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1529 switch (mode)
1531 /* Single SVE vectors. */
1532 case E_VNx16QImode:
1533 case E_VNx8HImode:
1534 case E_VNx4SImode:
1535 case E_VNx2DImode:
1536 case E_VNx8HFmode:
1537 case E_VNx4SFmode:
1538 case E_VNx2DFmode:
1539 return TARGET_SVE ? VEC_SVE_DATA : 0;
1541 /* x2 SVE vectors. */
1542 case E_VNx32QImode:
1543 case E_VNx16HImode:
1544 case E_VNx8SImode:
1545 case E_VNx4DImode:
1546 case E_VNx16HFmode:
1547 case E_VNx8SFmode:
1548 case E_VNx4DFmode:
1549 /* x3 SVE vectors. */
1550 case E_VNx48QImode:
1551 case E_VNx24HImode:
1552 case E_VNx12SImode:
1553 case E_VNx6DImode:
1554 case E_VNx24HFmode:
1555 case E_VNx12SFmode:
1556 case E_VNx6DFmode:
1557 /* x4 SVE vectors. */
1558 case E_VNx64QImode:
1559 case E_VNx32HImode:
1560 case E_VNx16SImode:
1561 case E_VNx8DImode:
1562 case E_VNx32HFmode:
1563 case E_VNx16SFmode:
1564 case E_VNx8DFmode:
1565 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1567 /* 64-bit Advanced SIMD vectors. */
1568 case E_V8QImode:
1569 case E_V4HImode:
1570 case E_V2SImode:
1571 /* ...E_V1DImode doesn't exist. */
1572 case E_V4HFmode:
1573 case E_V2SFmode:
1574 case E_V1DFmode:
1575 /* 128-bit Advanced SIMD vectors. */
1576 case E_V16QImode:
1577 case E_V8HImode:
1578 case E_V4SImode:
1579 case E_V2DImode:
1580 case E_V8HFmode:
1581 case E_V4SFmode:
1582 case E_V2DFmode:
1583 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1585 default:
1586 return 0;
1590 /* Return true if MODE is any of the data vector modes, including
1591 structure modes. */
1592 static bool
1593 aarch64_vector_data_mode_p (machine_mode mode)
1595 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1600 static bool
1601 aarch64_sve_data_mode_p (machine_mode mode)
1603 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1606 /* Implement target hook TARGET_ARRAY_MODE. */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1610 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1611 && IN_RANGE (nelems, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode),
1613 GET_MODE_NUNITS (mode) * nelems);
1615 return opt_machine_mode ();
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1619 static bool
1620 aarch64_array_mode_supported_p (machine_mode mode,
1621 unsigned HOST_WIDE_INT nelems)
1623 if (TARGET_SIMD
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1626 && (nelems >= 2 && nelems <= 4))
1627 return true;
1629 return false;
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
1635 opt_machine_mode
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1638 if (TARGET_SVE)
1640 if (elem_nbytes == 1)
1641 return VNx16BImode;
1642 if (elem_nbytes == 2)
1643 return VNx8BImode;
1644 if (elem_nbytes == 4)
1645 return VNx4BImode;
1646 if (elem_nbytes == 8)
1647 return VNx2BImode;
1649 return opt_machine_mode ();
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1657 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1659 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1660 machine_mode pred_mode;
1661 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1662 return pred_mode;
1665 return default_get_mask_mode (nunits, nbytes);
1668 /* Return the integer element mode associated with SVE mode MODE. */
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode)
1673 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1674 GET_MODE_NUNITS (mode));
1675 return int_mode_for_size (elt_bits, 0).require ();
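/* Worked example (illustration only): for VNx4SFmode the element size is
   BITS_PER_SVE_VECTOR / GET_MODE_NUNITS == 32 bits, so the result is
   SImode; likewise VNx8HFmode maps to HImode and VNx2DFmode to DImode.  */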
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1684 natural. */
1686 static tree
1687 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1689 return nops == 3 ? ops[2] : ops[0];
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
1694 static unsigned int
1695 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1702 switch (aarch64_regno_regclass (regno))
1704 case FP_REGS:
1705 case FP_LO_REGS:
1706 case FP_LO8_REGS:
1707 if (aarch64_sve_data_mode_p (mode))
1708 return exact_div (GET_MODE_SIZE (mode),
1709 BYTES_PER_SVE_VECTOR).to_constant ();
1710 return CEIL (lowest_size, UNITS_PER_VREG);
1711 case PR_REGS:
1712 case PR_LO_REGS:
1713 case PR_HI_REGS:
1714 return 1;
1715 default:
1716 return CEIL (lowest_size, UNITS_PER_WORD);
1718 gcc_unreachable ();
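/* Worked examples (illustration only): a single SVE vector such as
   VNx4SImode occupies exactly one FP/SVE register (its size divides
   evenly by BYTES_PER_SVE_VECTOR), and a x3 tuple such as VNx12SImode
   occupies three; the Advanced SIMD structure mode OImode (32 bytes)
   needs CEIL (32, UNITS_PER_VREG) == 2 FP registers; TImode in general
   registers needs CEIL (16, UNITS_PER_WORD) == 2.  */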
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1723 static bool
1724 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1726 if (GET_MODE_CLASS (mode) == MODE_CC)
1727 return regno == CC_REGNUM;
1729 if (regno == VG_REGNUM)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode == DImode;
1733 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1734 if (vec_flags & VEC_SVE_PRED)
1735 return PR_REGNUM_P (regno);
1737 if (PR_REGNUM_P (regno))
1738 return 0;
1740 if (regno == SP_REGNUM)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode == Pmode || mode == ptr_mode;
1746 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1747 return mode == Pmode;
1749 if (GP_REGNUM_P (regno))
1751 if (known_le (GET_MODE_SIZE (mode), 8))
1752 return true;
1753 else if (known_le (GET_MODE_SIZE (mode), 16))
1754 return (regno & 1) == 0;
1756 else if (FP_REGNUM_P (regno))
1758 if (vec_flags & VEC_STRUCT)
1759 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1760 else
1761 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1764 return false;
1767 /* Return true if this is a definition of a vectorized simd function. */
1769 static bool
1770 aarch64_simd_decl_p (tree fndecl)
1772 tree fntype;
1774 if (fndecl == NULL)
1775 return false;
1776 fntype = TREE_TYPE (fndecl);
1777 if (fntype == NULL)
1778 return false;
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1782 return true;
1784 return false;
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
1790 SIMD functions. */
1792 static machine_mode
1793 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1795 return GP_REGNUM_P (regno)
1796 ? E_DImode
1797 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
1802 the function. */
1804 static bool
1805 aarch64_simd_call_p (rtx_insn *insn)
1807 rtx symbol;
1808 rtx call;
1809 tree fndecl;
1811 gcc_assert (CALL_P (insn));
1812 call = get_call_rtx_from (insn);
1813 symbol = XEXP (XEXP (call, 0), 0);
1814 if (GET_CODE (symbol) != SYMBOL_REF)
1815 return false;
1816 fndecl = SYMBOL_REF_DECL (symbol);
1817 if (!fndecl)
1818 return false;
1820 return aarch64_simd_decl_p (fndecl);
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
1827 void
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1829 HARD_REG_SET *return_set)
1831 if (aarch64_simd_call_p (insn))
1833 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno))
1835 CLEAR_HARD_REG_BIT (*return_set, regno);
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
1843 static bool
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1845 machine_mode mode)
1847 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1848 return FP_REGNUM_P (regno)
1849 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
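/* Worked example (illustration only): for a TFmode value (16 bytes) in an
   FP register the hook returns false when INSN calls a known
   aarch64_vector_pcs function, since maybe_gt (16, 16) is false, but true
   for an ordinary call, where only the low 8 bytes are preserved.  */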
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1854 rtx_insn *
1855 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1857 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1859 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1860 return call_1;
1861 else
1862 return call_2;
1865 /* Implement REGMODE_NATURAL_SIZE. */
1866 poly_uint64
1867 aarch64_regmode_natural_size (machine_mode mode)
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg.is_constant ())
1878 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1879 if (vec_flags & VEC_SVE_PRED)
1880 return BYTES_PER_SVE_PRED;
1881 if (vec_flags & VEC_SVE_DATA)
1882 return BYTES_PER_SVE_VECTOR;
1884 return UNITS_PER_WORD;
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1888 machine_mode
1889 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1890 machine_mode mode)
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno))
1897 return mode;
1898 if (known_ge (GET_MODE_SIZE (mode), 4))
1899 return mode;
1900 else
1901 return SImode;
1904 /* Return true if I's bits are consecutive ones from the MSB. */
1905 bool
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1908 return exact_log2 (-i) != HOST_WIDE_INT_M1;
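/* For example, 0xffffffffffff0000 (-0x10000) passes this test because
   -i == 0x10000 is a power of two, so the value is 48 leading ones
   followed by zeros.  0xffff00000000ffff fails: -i is not a power of
   two, so exact_log2 returns -1 and the function returns false.  */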
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1917 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1918 return MAX (align, BITS_PER_WORD);
1919 return align;
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (i.e. called via a register). */
1924 static bool
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1927 return false;
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (i.e. called via a register). */
1932 bool
1933 aarch64_is_long_call_p (rtx sym)
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1938 /* Return true if calls to symbol-ref SYM should not go through
1939 plt stubs. */
1941 bool
1942 aarch64_is_noplt_call_p (rtx sym)
1944 const_tree decl = SYMBOL_REF_DECL (sym);
1946 if (flag_pic
1947 && decl
1948 && (!flag_plt
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1950 && !targetm.binds_local_p (decl))
1951 return true;
1953 return false;
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the parameters from
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1961 bool
1962 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1963 rtx extract_imm)
1965 HOST_WIDE_INT mult_val, extract_val;
1967 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1968 return false;
1970 mult_val = INTVAL (mult_imm);
1971 extract_val = INTVAL (extract_imm);
1973 if (extract_val > 8
1974 && extract_val < GET_MODE_BITSIZE (mode)
1975 && exact_log2 (extract_val & ~7) > 0
1976 && (extract_val & 7) <= 4
1977 && mult_val == (1 << (extract_val & 7)))
1978 return true;
1980 return false;
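/* A worked example: with MODE == DImode, EXTRACT_IMM == 34 and
   MULT_IMM == 4, we have 34 & ~7 == 32 (a power of two), 34 & 7 == 2
   and 4 == 1 << 2, so the function returns true.  The extract then
   describes, roughly, a 32-bit value that has been scaled by 4 and
   extended to 64 bits, i.e. an extend-and-shift operand.  */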
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn *
1986 emit_set_insn (rtx x, rtx y)
1988 return emit_insn (gen_rtx_SET (x, y));
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
1993 rtx
1994 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1996 machine_mode mode = SELECT_CC_MODE (code, x, y);
1997 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1999 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2000 return cc_reg;
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2005 static rtx
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2007 machine_mode y_mode)
2009 if (y_mode == E_QImode || y_mode == E_HImode)
2011 if (CONST_INT_P (y))
2012 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2013 else
2015 rtx t, cc_reg;
2016 machine_mode cc_mode;
2018 t = gen_rtx_ZERO_EXTEND (SImode, y);
2019 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2020 cc_mode = CC_SWPmode;
2021 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2022 emit_set_insn (cc_reg, t);
2023 return cc_reg;
2027 return aarch64_gen_compare_reg (code, x, y);
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
2032 static GTY(()) rtx tls_get_addr_libfunc;
2034 rtx
2035 aarch64_tls_get_addr (void)
2037 if (!tls_get_addr_libfunc)
2038 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc;
2042 /* Return the TLS model to use for ADDR. */
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr)
2047 enum tls_model tls_kind = TLS_MODEL_NONE;
2048 if (GET_CODE (addr) == CONST)
2050 poly_int64 addend;
2051 rtx sym = strip_offset (addr, &addend);
2052 if (GET_CODE (sym) == SYMBOL_REF)
2053 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2055 else if (GET_CODE (addr) == SYMBOL_REF)
2056 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2058 return tls_kind;
2061 /* We'll allow lo_sum's in addresses in our legitimate addresses
2062 so that combine would take care of combining addresses where
2063 necessary, but for generation purposes, we'll generate the address
2064 as :
2065 RTL Absolute
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2070 PIC TLS
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2073 bl __tls_get_addr
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2081 bl __tls_get_addr
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2087 blr tmp
2088 mrs tp, tpidr_el0
2089 add dest, dest, tp
2091 Initial Exec:
2092 mrs tp, tpidr_el0
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2095 add dest, dest, tp
2097 Local Exec:
2098 mrs tp, tpidr_el0
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
2103 static void
2104 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2105 enum aarch64_symbol_type type)
2107 switch (type)
2109 case SYMBOL_SMALL_ABSOLUTE:
2111 /* In ILP32, the mode of dest can be either SImode or DImode. */
2112 rtx tmp_reg = dest;
2113 machine_mode mode = GET_MODE (dest);
2115 gcc_assert (mode == Pmode || mode == ptr_mode);
2117 if (can_create_pseudo_p ())
2118 tmp_reg = gen_reg_rtx (mode);
2120 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2121 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2122 return;
2125 case SYMBOL_TINY_ABSOLUTE:
2126 emit_insn (gen_rtx_SET (dest, imm));
2127 return;
2129 case SYMBOL_SMALL_GOT_28K:
2131 machine_mode mode = GET_MODE (dest);
2132 rtx gp_rtx = pic_offset_table_rtx;
2133 rtx insn;
2134 rtx mem;
2136 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2137 here before RTL expansion: the tree IVOPTs pass generates RTL
2138 patterns to compute rtx costs, in which case pic_offset_table_rtx
2139 is not initialized. There is then no need to generate the first
2140 adrp instruction, as the final cost of a global variable access
2141 is one instruction. */
2142 if (gp_rtx != NULL)
2144 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
2145 use the page base as the GOT base, the first page may be wasted;
2146 in the worst case only 28K of space is left for the GOT).
2148 The instruction sequence generated for accessing a global variable is:
2151 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2153 Only one instruction is needed, but we must initialize
2154 pic_offset_table_rtx properly. We generate the initialization insn
2155 for every global access and rely on CSE to remove the redundant ones.
2157 The final instruction sequence will look like the following
2158 for multiple global variable accesses.
2160 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2162 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2163 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2164 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2165 ... */
2167 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2168 crtl->uses_pic_offset_table = 1;
2169 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2171 if (mode != GET_MODE (gp_rtx))
2172 gp_rtx = gen_lowpart (mode, gp_rtx);
2176 if (mode == ptr_mode)
2178 if (mode == DImode)
2179 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2180 else
2181 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2183 mem = XVECEXP (SET_SRC (insn), 0, 0);
2185 else
2187 gcc_assert (mode == Pmode);
2189 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2190 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2193 /* The operand is expected to be a MEM. Whenever the related insn
2194 pattern changes, the code above that calculates MEM should be
2195 updated. */
2196 gcc_assert (GET_CODE (mem) == MEM);
2197 MEM_READONLY_P (mem) = 1;
2198 MEM_NOTRAP_P (mem) = 1;
2199 emit_insn (insn);
2200 return;
2203 case SYMBOL_SMALL_GOT_4G:
2205 /* In ILP32, the mode of dest can be either SImode or DImode,
2206 while the got entry is always of SImode size. The mode of
2207 dest depends on how dest is used: if dest is assigned to a
2208 pointer (e.g. in the memory), it has SImode; it may have
2209 DImode if dest is dereferenced to access the memory.
2210 This is why we have to handle three different ldr_got_small
2211 patterns here (two patterns for ILP32). */
2213 rtx insn;
2214 rtx mem;
2215 rtx tmp_reg = dest;
2216 machine_mode mode = GET_MODE (dest);
2218 if (can_create_pseudo_p ())
2219 tmp_reg = gen_reg_rtx (mode);
2221 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2222 if (mode == ptr_mode)
2224 if (mode == DImode)
2225 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2226 else
2227 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2229 mem = XVECEXP (SET_SRC (insn), 0, 0);
2231 else
2233 gcc_assert (mode == Pmode);
2235 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2236 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2239 gcc_assert (GET_CODE (mem) == MEM);
2240 MEM_READONLY_P (mem) = 1;
2241 MEM_NOTRAP_P (mem) = 1;
2242 emit_insn (insn);
2243 return;
2246 case SYMBOL_SMALL_TLSGD:
2248 rtx_insn *insns;
2249 machine_mode mode = GET_MODE (dest);
2250 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2252 start_sequence ();
2253 if (TARGET_ILP32)
2254 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2255 else
2256 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2257 insns = get_insns ();
2258 end_sequence ();
2260 RTL_CONST_CALL_P (insns) = 1;
2261 emit_libcall_block (insns, dest, result, imm);
2262 return;
2265 case SYMBOL_SMALL_TLSDESC:
2267 machine_mode mode = GET_MODE (dest);
2268 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2269 rtx tp;
2271 gcc_assert (mode == Pmode || mode == ptr_mode);
2273 /* In ILP32, the got entry is always of SImode size. Unlike
2274 small GOT, the dest is fixed at reg 0. */
2275 if (TARGET_ILP32)
2276 emit_insn (gen_tlsdesc_small_si (imm));
2277 else
2278 emit_insn (gen_tlsdesc_small_di (imm));
2279 tp = aarch64_load_tp (NULL);
2281 if (mode != Pmode)
2282 tp = gen_lowpart (mode, tp);
2284 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2285 if (REG_P (dest))
2286 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2287 return;
2290 case SYMBOL_SMALL_TLSIE:
2292 /* In ILP32, the mode of dest can be either SImode or DImode,
2293 while the got entry is always of SImode size. The mode of
2294 dest depends on how dest is used: if dest is assigned to a
2295 pointer (e.g. in the memory), it has SImode; it may have
2296 DImode if dest is dereferenced to access the memory.
2297 This is why we have to handle three different tlsie_small
2298 patterns here (two patterns for ILP32). */
2299 machine_mode mode = GET_MODE (dest);
2300 rtx tmp_reg = gen_reg_rtx (mode);
2301 rtx tp = aarch64_load_tp (NULL);
2303 if (mode == ptr_mode)
2305 if (mode == DImode)
2306 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2307 else
2309 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2310 tp = gen_lowpart (mode, tp);
2313 else
2315 gcc_assert (mode == Pmode);
2316 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2319 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2320 if (REG_P (dest))
2321 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2322 return;
2325 case SYMBOL_TLSLE12:
2326 case SYMBOL_TLSLE24:
2327 case SYMBOL_TLSLE32:
2328 case SYMBOL_TLSLE48:
2330 machine_mode mode = GET_MODE (dest);
2331 rtx tp = aarch64_load_tp (NULL);
2333 if (mode != Pmode)
2334 tp = gen_lowpart (mode, tp);
2336 switch (type)
2338 case SYMBOL_TLSLE12:
2339 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2340 (dest, tp, imm));
2341 break;
2342 case SYMBOL_TLSLE24:
2343 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2344 (dest, tp, imm));
2345 break;
2346 case SYMBOL_TLSLE32:
2347 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2348 (dest, imm));
2349 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2350 (dest, dest, tp));
2351 break;
2352 case SYMBOL_TLSLE48:
2353 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2354 (dest, imm));
2355 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2356 (dest, dest, tp));
2357 break;
2358 default:
2359 gcc_unreachable ();
2362 if (REG_P (dest))
2363 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2364 return;
2367 case SYMBOL_TINY_GOT:
2368 emit_insn (gen_ldr_got_tiny (dest, imm));
2369 return;
2371 case SYMBOL_TINY_TLSIE:
2373 machine_mode mode = GET_MODE (dest);
2374 rtx tp = aarch64_load_tp (NULL);
2376 if (mode == ptr_mode)
2378 if (mode == DImode)
2379 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2380 else
2382 tp = gen_lowpart (mode, tp);
2383 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2386 else
2388 gcc_assert (mode == Pmode);
2389 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2392 if (REG_P (dest))
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2394 return;
2397 default:
2398 gcc_unreachable ();
2402 /* Emit a move from SRC to DEST. Assume that the move expanders can
2403 handle all moves if !can_create_pseudo_p (). The distinction is
2404 important because, unlike emit_move_insn, the move expanders know
2405 how to force Pmode objects into the constant pool even when the
2406 constant pool address is not itself legitimate. */
2407 static rtx
2408 aarch64_emit_move (rtx dest, rtx src)
2410 return (can_create_pseudo_p ()
2411 ? emit_move_insn (dest, src)
2412 : emit_move_insn_1 (dest, src));
2415 /* Apply UNOPTAB to OP and store the result in DEST. */
2417 static void
2418 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2420 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2421 if (dest != tmp)
2422 emit_move_insn (dest, tmp);
2425 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2427 static void
2428 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2430 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2431 OPTAB_DIRECT);
2432 if (dest != tmp)
2433 emit_move_insn (dest, tmp);
2436 /* Split a 128-bit move operation into two 64-bit move operations,
2437 taking care to handle partial overlap of register to register
2438 copies. Special cases are needed when moving between GP regs and
2439 FP regs. SRC can be a register, constant or memory; DST a register
2440 or memory. If either operand is memory it must not have any side
2441 effects. */
2442 void
2443 aarch64_split_128bit_move (rtx dst, rtx src)
2445 rtx dst_lo, dst_hi;
2446 rtx src_lo, src_hi;
2448 machine_mode mode = GET_MODE (dst);
2450 gcc_assert (mode == TImode || mode == TFmode);
2451 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2452 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2454 if (REG_P (dst) && REG_P (src))
2456 int src_regno = REGNO (src);
2457 int dst_regno = REGNO (dst);
2459 /* Handle FP <-> GP regs. */
2460 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2462 src_lo = gen_lowpart (word_mode, src);
2463 src_hi = gen_highpart (word_mode, src);
2465 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2466 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2467 return;
2469 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2471 dst_lo = gen_lowpart (word_mode, dst);
2472 dst_hi = gen_highpart (word_mode, dst);
2474 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2475 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2476 return;
2480 dst_lo = gen_lowpart (word_mode, dst);
2481 dst_hi = gen_highpart (word_mode, dst);
2482 src_lo = gen_lowpart (word_mode, src);
2483 src_hi = gen_highpart_mode (word_mode, mode, src);
2485 /* At most one pairing may overlap. */
2486 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2488 aarch64_emit_move (dst_hi, src_hi);
2489 aarch64_emit_move (dst_lo, src_lo);
2491 else
2493 aarch64_emit_move (dst_lo, src_lo);
2494 aarch64_emit_move (dst_hi, src_hi);
2498 bool
2499 aarch64_split_128bit_move_p (rtx dst, rtx src)
2501 return (! REG_P (src)
2502 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2505 /* Split a complex SIMD combine. */
2507 void
2508 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2510 machine_mode src_mode = GET_MODE (src1);
2511 machine_mode dst_mode = GET_MODE (dst);
2513 gcc_assert (VECTOR_MODE_P (dst_mode));
2514 gcc_assert (register_operand (dst, dst_mode)
2515 && register_operand (src1, src_mode)
2516 && register_operand (src2, src_mode));
2518 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2519 return;
2522 /* Split a complex SIMD move. */
2524 void
2525 aarch64_split_simd_move (rtx dst, rtx src)
2527 machine_mode src_mode = GET_MODE (src);
2528 machine_mode dst_mode = GET_MODE (dst);
2530 gcc_assert (VECTOR_MODE_P (dst_mode));
2532 if (REG_P (dst) && REG_P (src))
2534 gcc_assert (VECTOR_MODE_P (src_mode));
2535 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2539 bool
2540 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2541 machine_mode ymode, rtx y)
2543 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2544 gcc_assert (r != NULL);
2545 return rtx_equal_p (x, r);
2549 /* Return TARGET if it is nonnull and a register of mode MODE.
2550 Otherwise, return a fresh register of mode MODE if we can,
2551 or TARGET reinterpreted as MODE if we can't. */
2553 static rtx
2554 aarch64_target_reg (rtx target, machine_mode mode)
2556 if (target && REG_P (target) && GET_MODE (target) == mode)
2557 return target;
2558 if (!can_create_pseudo_p ())
2560 gcc_assert (target);
2561 return gen_lowpart (mode, target);
2563 return gen_reg_rtx (mode);
2566 /* Return a register that contains the constant in BUILDER, given that
2567 the constant is a legitimate move operand. Use TARGET as the register
2568 if it is nonnull and convenient. */
2570 static rtx
2571 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2573 rtx src = builder.build ();
2574 target = aarch64_target_reg (target, GET_MODE (src));
2575 emit_insn (gen_rtx_SET (target, src));
2576 return target;
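/* Return a register that contains VALUE.  If we can still create pseudo
   registers, force VALUE into a (possibly new) register of mode MODE.
   Otherwise move VALUE into the existing register X, which must be
   nonnull, and return X.  */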
2579 static rtx
2580 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2582 if (can_create_pseudo_p ())
2583 return force_reg (mode, value);
2584 else
2586 gcc_assert (x);
2587 aarch64_emit_move (x, value);
2588 return x;
2592 /* Return true if predicate value X is a constant in which every element
2593 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2594 value, i.e. as a predicate in which all bits are significant. */
2596 static bool
2597 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2599 if (GET_CODE (x) != CONST_VECTOR)
2600 return false;
2602 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2603 GET_MODE_NUNITS (GET_MODE (x)));
2604 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2605 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2606 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2608 unsigned int nelts = const_vector_encoded_nelts (x);
2609 for (unsigned int i = 0; i < nelts; ++i)
2611 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2612 if (!CONST_INT_P (elt))
2613 return false;
2615 builder.quick_push (elt);
2616 for (unsigned int j = 1; j < factor; ++j)
2617 builder.quick_push (const0_rtx);
2619 builder.finalize ();
2620 return true;
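/* For example, if X has mode VNx4BImode then FACTOR above is 4, so each
   element of X contributes four elements to the VNx16BImode encoding:
   the original element followed by three explicit zeros.  This makes
   every bit of the wider predicate value significant, as required.  */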
2623 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2624 widest predicate element size it can have (that is, the largest size
2625 for which each element would still be 0 or 1). */
2627 unsigned int
2628 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2630 /* Start with the most optimistic assumption: that we only need
2631 one bit per pattern. This is what we will use if only the first
2632 bit in each pattern is ever set. */
2633 unsigned int mask = GET_MODE_SIZE (DImode);
2634 mask |= builder.npatterns ();
2636 /* Look for set bits. */
2637 unsigned int nelts = builder.encoded_nelts ();
2638 for (unsigned int i = 1; i < nelts; ++i)
2639 if (INTVAL (builder.elt (i)) != 0)
2641 if (i & 1)
2642 return 1;
2643 mask |= i;
2645 return mask & -mask;
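/* For example, an all-true VNx8BImode predicate, once rewritten as a
   VNx16BI constant by aarch64_get_sve_pred_bits, is encoded as two
   patterns of one element each: a set bit followed by a clear bit.
   No set bit occurs at an odd index, so MASK becomes 8 | 2 and the
   function returns 2, i.e. a 16-bit predicate element size.  If any
   odd-indexed bit had been set, the result would have been 1.  */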
2648 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2649 that the constant would have with predicate element size ELT_SIZE
2650 (ignoring the upper bits in each element) and return:
2652 * -1 if all bits are set
2653 * N if the predicate has N leading set bits followed by all clear bits
2654 * 0 if the predicate does not have any of these forms. */
2656 int
2657 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2658 unsigned int elt_size)
2660 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2661 followed by set bits. */
2662 if (builder.nelts_per_pattern () == 3)
2663 return 0;
2665 /* Skip over leading set bits. */
2666 unsigned int nelts = builder.encoded_nelts ();
2667 unsigned int i = 0;
2668 for (; i < nelts; i += elt_size)
2669 if (INTVAL (builder.elt (i)) == 0)
2670 break;
2671 unsigned int vl = i / elt_size;
2673 /* Check for the all-true case. */
2674 if (i == nelts)
2675 return -1;
2677 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2678 repeating pattern of set bits followed by clear bits. */
2679 if (builder.nelts_per_pattern () != 2)
2680 return 0;
2682 /* We have a "foreground" value and a duplicated "background" value.
2683 If the background might repeat and the last set bit belongs to it,
2684 we might have set bits followed by clear bits followed by set bits. */
2685 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2686 return 0;
2688 /* Make sure that the rest are all clear. */
2689 for (; i < nelts; i += elt_size)
2690 if (INTVAL (builder.elt (i)) != 0)
2691 return 0;
2693 return vl;
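/* For example, when viewed with ELT_SIZE == 2, a predicate whose first
   three 16-bit elements are set and whose remaining elements are clear
   yields 3; an all-true predicate yields -1; and a predicate in which
   set elements reappear after the leading run yields 0.  */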
2696 /* See if there is an svpattern that encodes an SVE predicate of mode
2697 PRED_MODE in which the first VL bits are set and the rest are clear.
2698 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2699 A VL of -1 indicates an all-true vector. */
2701 aarch64_svpattern
2702 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2704 if (vl < 0)
2705 return AARCH64_SV_ALL;
2707 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2708 return AARCH64_NUM_SVPATTERNS;
2710 if (vl >= 1 && vl <= 8)
2711 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2713 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2714 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2716 int max_vl;
2717 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2719 if (vl == (max_vl / 3) * 3)
2720 return AARCH64_SV_MUL3;
2721 /* These would only trigger for non-power-of-2 lengths. */
2722 if (vl == (max_vl & -4))
2723 return AARCH64_SV_MUL4;
2724 if (vl == (1 << floor_log2 (max_vl)))
2725 return AARCH64_SV_POW2;
2726 if (vl == max_vl)
2727 return AARCH64_SV_ALL;
2729 return AARCH64_NUM_SVPATTERNS;
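/* For example, VL == -1 gives AARCH64_SV_ALL, VL values of 1-8 map to
   AARCH64_SV_VL1 through AARCH64_SV_VL8, and power-of-two VL values of
   16-256 map to AARCH64_SV_VL16 through AARCH64_SV_VL256.  If the number
   of elements in PRED_MODE is a compile-time constant, say 32, then a
   VL of 30 additionally matches AARCH64_SV_MUL3, since 30 == (32 / 3) * 3.
   Anything else yields AARCH64_NUM_SVPATTERNS.  */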
2732 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2733 bits has the lowest bit set and the upper bits clear. This is the
2734 VNx16BImode equivalent of a PTRUE for controlling elements of
2735 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2736 all bits are significant, even the upper zeros. */
2738 rtx
2739 aarch64_ptrue_all (unsigned int elt_size)
2741 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2742 builder.quick_push (const1_rtx);
2743 for (unsigned int i = 1; i < elt_size; ++i)
2744 builder.quick_push (const0_rtx);
2745 return builder.build ();
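/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BImode
   sequence { 1, 0, 0, 0, ... }, which acts as a PTRUE for 32-bit (.S)
   elements with the padding bits of each element explicitly zero.  */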
2748 /* Return an all-true predicate register of mode MODE. */
2750 rtx
2751 aarch64_ptrue_reg (machine_mode mode)
2753 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2754 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2755 return gen_lowpart (mode, reg);
2758 /* Return an all-false predicate register of mode MODE. */
2760 rtx
2761 aarch64_pfalse_reg (machine_mode mode)
2763 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2764 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2765 return gen_lowpart (mode, reg);
2768 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2769 true, or alternatively if we know that the operation predicated by
2770 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2771 aarch64_sve_gp_strictness operand that describes the operation
2772 predicated by PRED1[0]. */
2774 bool
2775 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2777 machine_mode mode = GET_MODE (pred2);
2778 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2779 && mode == GET_MODE (pred1[0])
2780 && aarch64_sve_gp_strictness (pred1[1], SImode));
2781 return (pred1[0] == CONSTM1_RTX (mode)
2782 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2783 || rtx_equal_p (pred1[0], pred2));
2786 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2787 for it. PRED2[0] is the predicate for the instruction whose result
2788 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2789 for it. Return true if we can prove that the two predicates are
2790 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2791 with PRED1[0] without changing behavior. */
2793 bool
2794 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2796 machine_mode mode = GET_MODE (pred1[0]);
2797 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2798 && mode == GET_MODE (pred2[0])
2799 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2800 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2802 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2803 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2804 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2805 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2806 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2809 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2810 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2811 Use TARGET as the target register if nonnull and convenient. */
2813 static rtx
2814 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2815 machine_mode data_mode, rtx op1, rtx op2)
2817 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2818 expand_operand ops[5];
2819 create_output_operand (&ops[0], target, pred_mode);
2820 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2821 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2822 create_input_operand (&ops[3], op1, data_mode);
2823 create_input_operand (&ops[4], op2, data_mode);
2824 expand_insn (icode, 5, ops);
2825 return ops[0].value;
2828 /* Use a comparison to convert integer vector SRC into MODE, which is
2829 the corresponding SVE predicate mode. Use TARGET for the result
2830 if it's nonnull and convenient. */
2832 static rtx
2833 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2835 machine_mode src_mode = GET_MODE (src);
2836 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2837 src, CONST0_RTX (src_mode));
2840 /* Return true if we can move VALUE into a register using a single
2841 CNT[BHWD] instruction. */
2843 static bool
2844 aarch64_sve_cnt_immediate_p (poly_int64 value)
2846 HOST_WIDE_INT factor = value.coeffs[0];
2847 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2848 return (value.coeffs[1] == factor
2849 && IN_RANGE (factor, 2, 16 * 16)
2850 && (factor & 1) == 0
2851 && factor <= 16 * (factor & -factor));
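/* For example, the number of bytes in a full SVE vector is the
   poly_int64 (16, 16), so its FACTOR is 16 and it can be loaded with a
   single CNTB.  A FACTOR of 6 is also fine (CNTD with a multiplier of
   3), but a FACTOR of 34 is not, because 34 exceeds 16 times its lowest
   set bit (2).  */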
2854 /* Likewise for rtx X. */
2856 bool
2857 aarch64_sve_cnt_immediate_p (rtx x)
2859 poly_int64 value;
2860 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2863 /* Return the asm string for an instruction with a CNT-like vector size
2864 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2865 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2866 first part of the operands template (the part that comes before the
2867 vector size itself). FACTOR is the number of quadwords.
2868 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2869 If it is zero, we can use any element size. */
2871 static char *
2872 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2873 unsigned int factor,
2874 unsigned int nelts_per_vq)
2876 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2878 if (nelts_per_vq == 0)
2879 /* There is some overlap in the ranges of the four CNT instructions.
2880 Here we always use the smallest possible element size, so that the
2881 multiplier is 1 wherever possible. */
2882 nelts_per_vq = factor & -factor;
2883 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2884 gcc_assert (IN_RANGE (shift, 1, 4));
2885 char suffix = "dwhb"[shift - 1];
2887 factor >>= shift;
2888 unsigned int written;
2889 if (factor == 1)
2890 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2891 prefix, suffix, operands);
2892 else
2893 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2894 prefix, suffix, operands, factor);
2895 gcc_assert (written < sizeof (buffer));
2896 return buffer;
2899 /* Return the asm string for an instruction with a CNT-like vector size
2900 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2901 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2902 first part of the operands template (the part that comes before the
2903 vector size itself). X is the value of the vector size operand,
2904 as a polynomial integer rtx. */
2906 char *
2907 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2908 rtx x)
2910 poly_int64 value = rtx_to_poly_int64 (x);
2911 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2912 return aarch64_output_sve_cnt_immediate (prefix, operands,
2913 value.coeffs[1], 0);
2916 /* Return true if we can add VALUE to a register using a single ADDVL
2917 or ADDPL instruction. */
2919 static bool
2920 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2922 HOST_WIDE_INT factor = value.coeffs[0];
2923 if (factor == 0 || value.coeffs[1] != factor)
2924 return false;
2925 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2926 and a value of 16 is one vector width. */
2927 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2928 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
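/* For example, a FACTOR of 16 corresponds to ADDVL #1 and a FACTOR of 2
   to ADDPL #1.  ADDVL covers multiples of 16 from -32 * 16 to 31 * 16
   and ADDPL covers even factors from -32 * 2 to 31 * 2, matching the
   6-bit signed immediate field of both instructions.  */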
2931 /* Likewise for rtx X. */
2933 bool
2934 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2936 poly_int64 value;
2937 return (poly_int_rtx_p (x, &value)
2938 && aarch64_sve_addvl_addpl_immediate_p (value));
2941 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2942 and storing the result in operand 0. */
2944 char *
2945 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2947 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2948 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2949 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2951 /* Use INC or DEC if possible. */
2952 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2954 if (aarch64_sve_cnt_immediate_p (offset_value))
2955 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2956 offset_value.coeffs[1], 0);
2957 if (aarch64_sve_cnt_immediate_p (-offset_value))
2958 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2959 -offset_value.coeffs[1], 0);
2962 int factor = offset_value.coeffs[1];
2963 if ((factor & 15) == 0)
2964 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2965 else
2966 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2967 return buffer;
2970 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2971 instruction. If it is, store the number of elements in each vector
2972 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2973 factor in *FACTOR_OUT (if nonnull). */
2975 bool
2976 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2977 unsigned int *nelts_per_vq_out)
2979 rtx elt;
2980 poly_int64 value;
2982 if (!const_vec_duplicate_p (x, &elt)
2983 || !poly_int_rtx_p (elt, &value))
2984 return false;
2986 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2987 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2988 /* There's no vector INCB. */
2989 return false;
2991 HOST_WIDE_INT factor = value.coeffs[0];
2992 if (value.coeffs[1] != factor)
2993 return false;
2995 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2996 if ((factor % nelts_per_vq) != 0
2997 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2998 return false;
3000 if (factor_out)
3001 *factor_out = factor;
3002 if (nelts_per_vq_out)
3003 *nelts_per_vq_out = nelts_per_vq;
3004 return true;
3007 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3008 instruction. */
3010 bool
3011 aarch64_sve_inc_dec_immediate_p (rtx x)
3013 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
3016 /* Return the asm template for an SVE vector INC or DEC instruction.
3017 OPERANDS gives the operands before the vector count and X is the
3018 value of the vector count operand itself. */
3020 char *
3021 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
3023 int factor;
3024 unsigned int nelts_per_vq;
3025 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3026 gcc_unreachable ();
3027 if (factor < 0)
3028 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3029 nelts_per_vq);
3030 else
3031 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3032 nelts_per_vq);
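/* For example, a VNx4SImode constant in which every element is the
   poly_int64 (8, 8) has NELTS_PER_VQ == 4 and FACTOR == 8, so the
   routine above prints "incw\t<operands>, all, mul #2"; the negated
   constant would print "decw" instead.  */

/* Move the CONST_INT IMM, which has mode MODE, into DEST, provided that
   GENERATE is true; in either case return the number of instructions
   that the move requires.  */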
3035 static int
3036 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3037 scalar_int_mode mode)
3039 int i;
3040 unsigned HOST_WIDE_INT val, val2, mask;
3041 int one_match, zero_match;
3042 int num_insns;
3044 val = INTVAL (imm);
3046 if (aarch64_move_imm (val, mode))
3048 if (generate)
3049 emit_insn (gen_rtx_SET (dest, imm));
3050 return 1;
3053 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3054 (with XXXX non-zero). In that case check to see if the move can be done in
3055 a smaller mode. */
3056 val2 = val & 0xffffffff;
3057 if (mode == DImode
3058 && aarch64_move_imm (val2, SImode)
3059 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3061 if (generate)
3062 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3064 /* Check if we have to emit a second instruction by checking to see
3065 if any of the upper 32 bits of the original DI mode value is set. */
3066 if (val == val2)
3067 return 1;
3069 i = (val >> 48) ? 48 : 32;
3071 if (generate)
3072 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3073 GEN_INT ((val >> i) & 0xffff)));
3075 return 2;
3078 if ((val >> 32) == 0 || mode == SImode)
3080 if (generate)
3082 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3083 if (mode == SImode)
3084 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3085 GEN_INT ((val >> 16) & 0xffff)));
3086 else
3087 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3088 GEN_INT ((val >> 16) & 0xffff)));
3090 return 2;
3093 /* Remaining cases are all for DImode. */
3095 mask = 0xffff;
3096 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3097 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3098 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3099 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3101 if (zero_match != 2 && one_match != 2)
3103 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3104 For a 64-bit bitmask try whether changing 16 bits to all ones or
3105 zeroes creates a valid bitmask. To check any repeated bitmask,
3106 try using 16 bits from the other 32-bit half of val. */
3108 for (i = 0; i < 64; i += 16, mask <<= 16)
3110 val2 = val & ~mask;
3111 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3112 break;
3113 val2 = val | mask;
3114 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3115 break;
3116 val2 = val2 & ~mask;
3117 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3118 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3119 break;
3121 if (i != 64)
3123 if (generate)
3125 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3126 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3127 GEN_INT ((val >> i) & 0xffff)));
3129 return 2;
3133 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3134 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3135 otherwise skip zero bits. */
3137 num_insns = 1;
3138 mask = 0xffff;
3139 val2 = one_match > zero_match ? ~val : val;
3140 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3142 if (generate)
3143 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3144 ? (val | ~(mask << i))
3145 : (val & (mask << i)))));
3146 for (i += 16; i < 64; i += 16)
3148 if ((val2 & (mask << i)) == 0)
3149 continue;
3150 if (generate)
3151 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3152 GEN_INT ((val >> i) & 0xffff)));
3153 num_insns ++;
3156 return num_insns;
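/* For example, 0x1234000000005678 takes the DImode path above that
   moves the low 32 bits first: the full value is not a valid move
   immediate, but 0x5678 is, and bits [32, 47] of the value are zero,
   so we emit a MOV of 0x5678 followed by a MOVK of 0x1234 into bits
   [48, 63], for a total of two instructions.  */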
3159 /* Return whether imm is a 128-bit immediate which is simple enough to
3160 expand inline. */
3161 bool
3162 aarch64_mov128_immediate (rtx imm)
3164 if (GET_CODE (imm) == CONST_INT)
3165 return true;
3167 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3169 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3170 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3172 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3173 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3177 /* Return the number of temporary registers that aarch64_add_offset_1
3178 would need to add OFFSET to a register. */
3180 static unsigned int
3181 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3183 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3186 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3187 a non-polynomial OFFSET. MODE is the mode of the addition.
3188 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3189 be set and CFA adjustments added to the generated instructions.
3191 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3192 temporary if register allocation is already complete. This temporary
3193 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3194 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3195 the immediate again.
3197 Since this function may be used to adjust the stack pointer, we must
3198 ensure that it cannot cause transient stack deallocation (for example
3199 by first incrementing SP and then decrementing when adjusting by a
3200 large immediate). */
3202 static void
3203 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3204 rtx src, HOST_WIDE_INT offset, rtx temp1,
3205 bool frame_related_p, bool emit_move_imm)
3207 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3208 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3210 HOST_WIDE_INT moffset = abs_hwi (offset);
3211 rtx_insn *insn;
3213 if (!moffset)
3215 if (!rtx_equal_p (dest, src))
3217 insn = emit_insn (gen_rtx_SET (dest, src));
3218 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3220 return;
3223 /* Single instruction adjustment. */
3224 if (aarch64_uimm12_shift (moffset))
3226 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3227 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3228 return;
3231 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3232 and either:
3234 a) the offset cannot be loaded by a 16-bit move or
3235 b) there is no spare register into which we can move it. */
3236 if (moffset < 0x1000000
3237 && ((!temp1 && !can_create_pseudo_p ())
3238 || !aarch64_move_imm (moffset, mode)))
3240 HOST_WIDE_INT low_off = moffset & 0xfff;
3242 low_off = offset < 0 ? -low_off : low_off;
3243 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3244 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3245 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3246 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3247 return;
3250 /* Emit a move immediate if required and an addition/subtraction. */
3251 if (emit_move_imm)
3253 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3254 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3256 insn = emit_insn (offset < 0
3257 ? gen_sub3_insn (dest, src, temp1)
3258 : gen_add3_insn (dest, src, temp1));
3259 if (frame_related_p)
3261 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3262 rtx adj = plus_constant (mode, src, offset);
3263 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3267 /* Return the number of temporary registers that aarch64_add_offset
3268 would need to move OFFSET into a register or add OFFSET to a register;
3269 ADD_P is true if we want the latter rather than the former. */
3271 static unsigned int
3272 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3274 /* This follows the same structure as aarch64_add_offset. */
3275 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3276 return 0;
3278 unsigned int count = 0;
3279 HOST_WIDE_INT factor = offset.coeffs[1];
3280 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3281 poly_int64 poly_offset (factor, factor);
3282 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3283 /* Need one register for the ADDVL/ADDPL result. */
3284 count += 1;
3285 else if (factor != 0)
3287 factor = abs (factor);
3288 if (factor > 16 * (factor & -factor))
3289 /* Need one register for the CNT result and one for the multiplication
3290 factor. If necessary, the second temporary can be reused for the
3291 constant part of the offset. */
3292 return 2;
3293 /* Need one register for the CNT result (which might then
3294 be shifted). */
3295 count += 1;
3297 return count + aarch64_add_offset_1_temporaries (constant);
3300 /* If X can be represented as a poly_int64, return the number
3301 of temporaries that are required to add it to a register.
3302 Return -1 otherwise. */
3304 int
3305 aarch64_add_offset_temporaries (rtx x)
3307 poly_int64 offset;
3308 if (!poly_int_rtx_p (x, &offset))
3309 return -1;
3310 return aarch64_offset_temporaries (true, offset);
3313 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3314 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3315 be set and CFA adjustments added to the generated instructions.
3317 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3318 temporary if register allocation is already complete. This temporary
3319 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3320 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3321 false to avoid emitting the immediate again.
3323 TEMP2, if nonnull, is a second temporary register that doesn't
3324 overlap either DEST or SRC.
3326 Since this function may be used to adjust the stack pointer, we must
3327 ensure that it cannot cause transient stack deallocation (for example
3328 by first incrementing SP and then decrementing when adjusting by a
3329 large immediate). */
3331 static void
3332 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3333 poly_int64 offset, rtx temp1, rtx temp2,
3334 bool frame_related_p, bool emit_move_imm = true)
3336 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3337 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3338 gcc_assert (temp1 == NULL_RTX
3339 || !frame_related_p
3340 || !reg_overlap_mentioned_p (temp1, dest));
3341 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3343 /* Try using ADDVL or ADDPL to add the whole value. */
3344 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3346 rtx offset_rtx = gen_int_mode (offset, mode);
3347 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3348 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3349 return;
3352 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3353 SVE vector register, over and above the minimum size of 128 bits.
3354 This is equivalent to half the value returned by CNTD with a
3355 vector shape of ALL. */
3356 HOST_WIDE_INT factor = offset.coeffs[1];
3357 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3359 /* Try using ADDVL or ADDPL to add the VG-based part. */
3360 poly_int64 poly_offset (factor, factor);
3361 if (src != const0_rtx
3362 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3364 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3365 if (frame_related_p)
3367 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3368 RTX_FRAME_RELATED_P (insn) = true;
3369 src = dest;
3371 else
3373 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3374 src = aarch64_force_temporary (mode, temp1, addr);
3375 temp1 = temp2;
3376 temp2 = NULL_RTX;
3379 /* Otherwise use a CNT-based sequence. */
3380 else if (factor != 0)
3382 /* Use a subtraction if we have a negative factor. */
3383 rtx_code code = PLUS;
3384 if (factor < 0)
3386 factor = -factor;
3387 code = MINUS;
3390 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3391 into the multiplication. */
3392 rtx val;
3393 int shift = 0;
3394 if (factor & 1)
3395 /* Use a right shift by 1. */
3396 shift = -1;
3397 else
3398 factor /= 2;
3399 HOST_WIDE_INT low_bit = factor & -factor;
3400 if (factor <= 16 * low_bit)
3402 if (factor > 16 * 8)
3404 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3405 the value with the minimum multiplier and shift it into
3406 position. */
3407 int extra_shift = exact_log2 (low_bit);
3408 shift += extra_shift;
3409 factor >>= extra_shift;
3411 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3413 else
3415 /* Use CNTD, then multiply it by FACTOR. */
3416 val = gen_int_mode (poly_int64 (2, 2), mode);
3417 val = aarch64_force_temporary (mode, temp1, val);
3419 /* Go back to using a negative multiplication factor if we have
3420 no register from which to subtract. */
3421 if (code == MINUS && src == const0_rtx)
3423 factor = -factor;
3424 code = PLUS;
3426 rtx coeff1 = gen_int_mode (factor, mode);
3427 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3428 val = gen_rtx_MULT (mode, val, coeff1);
3431 if (shift > 0)
3433 /* Multiply by 1 << SHIFT. */
3434 val = aarch64_force_temporary (mode, temp1, val);
3435 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3437 else if (shift == -1)
3439 /* Divide by 2. */
3440 val = aarch64_force_temporary (mode, temp1, val);
3441 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3444 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3445 if (src != const0_rtx)
3447 val = aarch64_force_temporary (mode, temp1, val);
3448 val = gen_rtx_fmt_ee (code, mode, src, val);
3450 else if (code == MINUS)
3452 val = aarch64_force_temporary (mode, temp1, val);
3453 val = gen_rtx_NEG (mode, val);
3456 if (constant == 0 || frame_related_p)
3458 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3459 if (frame_related_p)
3461 RTX_FRAME_RELATED_P (insn) = true;
3462 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3463 gen_rtx_SET (dest, plus_constant (Pmode, src,
3464 poly_offset)));
3466 src = dest;
3467 if (constant == 0)
3468 return;
3470 else
3472 src = aarch64_force_temporary (mode, temp1, val);
3473 temp1 = temp2;
3474 temp2 = NULL_RTX;
3477 emit_move_imm = true;
3480 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3481 frame_related_p, emit_move_imm);
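/* For example, adding the poly_int64 offset (28, 16), i.e. one full
   vector's worth of bytes plus 12, uses the ADDVL path above for the
   (16, 16) part and aarch64_add_offset_1 for the remaining constant,
   giving an ADDVL #1 followed by an ADD #12 (with the exact choice of
   destination and temporary registers depending on FRAME_RELATED_P).  */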
3484 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3485 than a poly_int64. */
3487 void
3488 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3489 rtx offset_rtx, rtx temp1, rtx temp2)
3491 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3492 temp1, temp2, false);
3495 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3496 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3497 if TEMP1 already contains abs (DELTA). */
3499 static inline void
3500 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3502 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3503 temp1, temp2, true, emit_move_imm);
3506 /* Subtract DELTA from the stack pointer, marking the instructions
3507 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3508 if nonnull. */
3510 static inline void
3511 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3512 bool emit_move_imm = true)
3514 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3515 temp1, temp2, frame_related_p, emit_move_imm);
3518 /* Set DEST to (vec_series BASE STEP). */
3520 static void
3521 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3523 machine_mode mode = GET_MODE (dest);
3524 scalar_mode inner = GET_MODE_INNER (mode);
3526 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3527 if (!aarch64_sve_index_immediate_p (base))
3528 base = force_reg (inner, base);
3529 if (!aarch64_sve_index_immediate_p (step))
3530 step = force_reg (inner, step);
3532 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3535 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3536 register of mode MODE. Use TARGET for the result if it's nonnull
3537 and convenient.
3539 The two vector modes must have the same element mode. The behavior
3540 is to duplicate architectural lane N of SRC into architectural lanes
3541 N + I * STEP of the result. On big-endian targets, architectural
3542 lane 0 of an Advanced SIMD vector is the last element of the vector
3543 in memory layout, so for big-endian targets this operation has the
3544 effect of reversing SRC before duplicating it. Callers need to
3545 account for this. */
3547 rtx
3548 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3550 machine_mode src_mode = GET_MODE (src);
3551 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3552 insn_code icode = (BYTES_BIG_ENDIAN
3553 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3554 : code_for_aarch64_vec_duplicate_vq_le (mode));
3556 unsigned int i = 0;
3557 expand_operand ops[3];
3558 create_output_operand (&ops[i++], target, mode);
3559 create_output_operand (&ops[i++], src, src_mode);
3560 if (BYTES_BIG_ENDIAN)
3562 /* Create a PARALLEL describing the reversal of SRC. */
3563 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3564 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3565 nelts_per_vq - 1, -1);
3566 create_fixed_operand (&ops[i++], sel);
3568 expand_insn (icode, i, ops);
3569 return ops[0].value;
3572 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3573 the memory image into DEST. Return true on success. */
3575 static bool
3576 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3578 src = force_const_mem (GET_MODE (src), src);
3579 if (!src)
3580 return false;
3582 /* Make sure that the address is legitimate. */
3583 if (!aarch64_sve_ld1rq_operand_p (src))
3585 rtx addr = force_reg (Pmode, XEXP (src, 0));
3586 src = replace_equiv_address (src, addr);
3589 machine_mode mode = GET_MODE (dest);
3590 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3591 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3592 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3593 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3594 return true;
3597 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3598 SVE data mode and isn't a legitimate constant. Use TARGET for the
3599 result if convenient.
3601 The returned register can have whatever mode seems most natural
3602 given the contents of SRC. */
3604 static rtx
3605 aarch64_expand_sve_const_vector (rtx target, rtx src)
3607 machine_mode mode = GET_MODE (src);
3608 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3609 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3610 scalar_mode elt_mode = GET_MODE_INNER (mode);
3611 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3612 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3614 if (nelts_per_pattern == 1 && encoded_bits == 128)
3616 /* The constant is a duplicated quadword but can't be narrowed
3617 beyond a quadword. Get the memory image of the first quadword
3618 as a 128-bit vector and try using LD1RQ to load it from memory.
3620 The effect for both endiannesses is to load memory lane N into
3621 architectural lanes N + I * STEP of the result. On big-endian
3622 targets, the layout of the 128-bit vector in an Advanced SIMD
3623 register would be different from its layout in an SVE register,
3624 but this 128-bit vector is a memory value only. */
3625 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3626 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3627 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3628 return target;
3631 if (nelts_per_pattern == 1 && encoded_bits < 128)
3633 /* The vector is a repeating sequence of 64 bits or fewer.
3634 See if we can load them using an Advanced SIMD move and then
3635 duplicate it to fill a vector. This is better than using a GPR
3636 move because it keeps everything in the same register file. */
3637 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3638 rtx_vector_builder builder (vq_mode, npatterns, 1);
3639 for (unsigned int i = 0; i < npatterns; ++i)
3641 /* We want memory lane N to go into architectural lane N,
3642 so reverse for big-endian targets. The DUP .Q pattern
3643 has a compensating reverse built-in. */
3644 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3645 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3647 rtx vq_src = builder.build ();
3648 if (aarch64_simd_valid_immediate (vq_src, NULL))
3650 vq_src = force_reg (vq_mode, vq_src);
3651 return aarch64_expand_sve_dupq (target, mode, vq_src);
3654 /* Get an integer representation of the repeating part of Advanced
3655 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3656 which for big-endian targets is lane-swapped wrt a normal
3657 Advanced SIMD vector. This means that for both endiannesses,
3658 memory lane N of SVE vector SRC corresponds to architectural
3659 lane N of a register holding VQ_SRC. This in turn means that
3660 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3661 as a single 128-bit value) and thus that memory lane 0 of SRC is
3662 in the lsb of the integer. Duplicating the integer therefore
3663 ensures that memory lane N of SRC goes into architectural lane
3664 N + I * INDEX of the SVE register. */
3665 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3666 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3667 if (elt_value)
3669 /* Pretend that we had a vector of INT_MODE to start with. */
3670 elt_mode = int_mode;
3671 mode = aarch64_full_sve_mode (int_mode).require ();
3673 /* If the integer can be moved into a general register by a
3674 single instruction, do that and duplicate the result. */
3675 if (CONST_INT_P (elt_value)
3676 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3678 elt_value = force_reg (elt_mode, elt_value);
3679 return expand_vector_broadcast (mode, elt_value);
3682 else if (npatterns == 1)
3683 /* We're duplicating a single value, but can't do better than
3684 force it to memory and load from there. This handles things
3685 like symbolic constants. */
3686 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3688 if (elt_value)
3690 /* Load the element from memory if we can, otherwise move it into
3691 a register and use a DUP. */
3692 rtx op = force_const_mem (elt_mode, elt_value);
3693 if (!op)
3694 op = force_reg (elt_mode, elt_value);
3695 return expand_vector_broadcast (mode, op);
3699 /* Try using INDEX. */
3700 rtx base, step;
3701 if (const_vec_series_p (src, &base, &step))
3703 aarch64_expand_vec_series (target, base, step);
3704 return target;
3707 /* From here on, it's better to force the whole constant to memory
3708 if we can. */
3709 if (GET_MODE_NUNITS (mode).is_constant ())
3710 return NULL_RTX;
3712 /* Expand each pattern individually. */
3713 gcc_assert (npatterns > 1);
3714 rtx_vector_builder builder;
3715 auto_vec<rtx, 16> vectors (npatterns);
3716 for (unsigned int i = 0; i < npatterns; ++i)
3718 builder.new_vector (mode, 1, nelts_per_pattern);
3719 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3720 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3721 vectors.quick_push (force_reg (mode, builder.build ()));
3724 /* Use permutes to interleave the separate vectors. */
3725 while (npatterns > 1)
3727 npatterns /= 2;
3728 for (unsigned int i = 0; i < npatterns; ++i)
3730 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3731 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3732 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3733 vectors[i] = tmp;
3736 gcc_assert (vectors[0] == target);
3737 return target;
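/* A rough standalone illustration of the ZIP1 reduction above, modelled
   on plain integer arrays rather than RTL (sketch only; sketch_zip1 and
   sketch_interleave are hypothetical names, not GCC interfaces).  It
   shows why halving NPATTERNS and zipping vector I with vector
   I + NPATTERNS reconstructs the original element order; four patterns
   end up combined as ZIP1 (ZIP1 (v0, v2), ZIP1 (v1, v3)).  */

static void
sketch_zip1 (const int *a, const int *b, int *out, unsigned int n)
{
  /* ZIP1 interleaves the low halves of its inputs:
     out = { a[0], b[0], a[1], b[1], ... }.  */
  for (unsigned int i = 0; i < n / 2; ++i)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

static void
sketch_interleave (int **vectors, unsigned int npatterns,
                   unsigned int n, int *scratch)
{
  /* Mirror of the permute loop above: vectors[i] initially holds the
     elements of pattern I repeated to fill a vector of N elements, and
     after the loop vectors[0] holds the original constant.  */
  while (npatterns > 1)
    {
      npatterns /= 2;
      for (unsigned int i = 0; i < npatterns; ++i)
        {
          sketch_zip1 (vectors[i], vectors[i + npatterns], scratch, n);
          for (unsigned int j = 0; j < n; ++j)
            vectors[i][j] = scratch[j];
        }
    }
}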
3740 /* Use WHILE to set a predicate register of mode MODE in which the first
3741 VL bits are set and the rest are clear. Use TARGET for the register
3742 if it's nonnull and convenient. */
3744 static rtx
3745 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3746 unsigned int vl)
3748 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3749 target = aarch64_target_reg (target, mode);
3750 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3751 return target;
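/* A rough scalar model of the WHILELO-based move above (sketch only;
   sketch_while_ult is a hypothetical name, not a GCC interface): lane I
   of the resulting predicate is set iff I < VL, which is exactly the
   "first VL bits set" shape described in the comment.  */

static void
sketch_while_ult (unsigned char *pred, unsigned int nelts, unsigned int vl)
{
  for (unsigned int i = 0; i < nelts; ++i)
    pred[i] = (i < vl);
}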
3754 static rtx
3755 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3757 /* BUILDER is a constant predicate in which the index of every set bit
3758 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3759 by inverting every element at a multiple of ELT_SIZE and EORing the
3760 result with an ELT_SIZE PTRUE.
3762 Return a register that contains the constant on success, otherwise
3763 return null. Use TARGET as the register if it is nonnull and
3764 convenient. */
3766 static rtx
3767 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3768 unsigned int elt_size)
3770 /* Invert every element at a multiple of ELT_SIZE, keeping the
3771 other bits zero. */
3772 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3773 builder.nelts_per_pattern ());
3774 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3775 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3776 inv_builder.quick_push (const1_rtx);
3777 else
3778 inv_builder.quick_push (const0_rtx);
3779 inv_builder.finalize ();
3781 /* See if we can load the constant cheaply. */
3782 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3783 if (!inv)
3784 return NULL_RTX;
3786 /* EOR the result with an ELT_SIZE PTRUE. */
3787 rtx mask = aarch64_ptrue_all (elt_size);
3788 mask = force_reg (VNx16BImode, mask);
3789 target = aarch64_target_reg (target, VNx16BImode);
3790 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3791 return target;
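/* A rough array model of the EOR trick above (sketch only;
   sketch_pred_eor is a hypothetical name, not a GCC interface).
   PRED[i] is the predicate bit for byte lane I and, as in the comment
   above, set bits only occur at multiples of ELT_SIZE.  Inverting the
   significant lanes and EORing with an ELT_SIZE PTRUE recovers the
   original predicate.  */

static void
sketch_pred_eor (const unsigned char *pred, unsigned char *out,
                 unsigned int nelts, unsigned int elt_size)
{
  for (unsigned int i = 0; i < nelts; ++i)
    {
      /* PTRUE with ELT_SIZE elements: a bit at every multiple of
         ELT_SIZE.  */
      unsigned char ptrue = (i % elt_size == 0);
      /* The inverted constant: flip the significant lanes, keep the
         rest zero.  */
      unsigned char inv = (i % elt_size == 0) ? !pred[i] : 0;
      /* EOR recovers the original predicate.  */
      out[i] = ptrue ^ inv;
    }
}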
3794 /* BUILDER is a constant predicate in which the index of every set bit
3795 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3796 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3797 register on success, otherwise return null. Use TARGET as the register
3798 if nonnull and convenient. */
3800 static rtx
3801 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3802 unsigned int elt_size,
3803 unsigned int permute_size)
3805 /* We're going to split the constant into two new constants A and B,
3806 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3807 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3809 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3810 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3812 where _ indicates elements that will be discarded by the permute.
3814 First calculate the ELT_SIZEs for A and B. */
3815 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3816 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3817 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3818 if (INTVAL (builder.elt (i)) != 0)
3820 if (i & permute_size)
3821 b_elt_size |= i - permute_size;
3822 else
3823 a_elt_size |= i;
3825 a_elt_size &= -a_elt_size;
3826 b_elt_size &= -b_elt_size;
3828 /* Now construct the vectors themselves. */
3829 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3830 builder.nelts_per_pattern ());
3831 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3832 builder.nelts_per_pattern ());
3833 unsigned int nelts = builder.encoded_nelts ();
3834 for (unsigned int i = 0; i < nelts; ++i)
3835 if (i & (elt_size - 1))
3837 a_builder.quick_push (const0_rtx);
3838 b_builder.quick_push (const0_rtx);
3840 else if ((i & permute_size) == 0)
3842 /* The A and B elements are significant. */
3843 a_builder.quick_push (builder.elt (i));
3844 b_builder.quick_push (builder.elt (i + permute_size));
3846 else
3848 /* The A and B elements are going to be discarded, so pick whatever
3849 is likely to give a nice constant. We are targeting element
3850 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3851 with the aim of each being a sequence of ones followed by
3852 a sequence of zeros. So:
3854 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3855 duplicate the last X_ELT_SIZE element, to extend the
3856 current sequence of ones or zeros.
3858 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3859 zero, so that the constant really does have X_ELT_SIZE and
3860 not a smaller size. */
3861 if (a_elt_size > permute_size)
3862 a_builder.quick_push (const0_rtx);
3863 else
3864 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3865 if (b_elt_size > permute_size)
3866 b_builder.quick_push (const0_rtx);
3867 else
3868 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3870 a_builder.finalize ();
3871 b_builder.finalize ();
3873 /* Try loading A into a register. */
3874 rtx_insn *last = get_last_insn ();
3875 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3876 if (!a)
3877 return NULL_RTX;
3879 /* Try loading B into a register. */
3880 rtx b = a;
3881 if (a_builder != b_builder)
3883 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3884 if (!b)
3886 delete_insns_since (last);
3887 return NULL_RTX;
3891 /* Emit the TRN1 itself. */
3892 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3893 target = aarch64_target_reg (target, mode);
3894 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3895 gen_lowpart (mode, a),
3896 gen_lowpart (mode, b)));
3897 return target;
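/* A rough byte-level model of the TRN1 permute used above (sketch only;
   sketch_trn1 is a hypothetical name, not a GCC interface).  TRN1 takes
   the even-numbered PERMUTE_SIZE elements of A and B and interleaves
   them, which is why the routine above only fills in those element
   positions and treats the odd ones as don't-cares.  */

static void
sketch_trn1 (const unsigned char *a, const unsigned char *b,
             unsigned char *out, unsigned int nbytes,
             unsigned int permute_size)
{
  for (unsigned int i = 0; i < nbytes; ++i)
    {
      unsigned int elt = i / permute_size;
      unsigned int lane = i % permute_size;
      unsigned int src_elt = elt & ~1U;   /* Corresponding even element.  */
      unsigned int src = src_elt * permute_size + lane;
      /* Even result elements come from A, odd ones from B.  */
      out[i] = (elt & 1) ? b[src] : a[src];
    }
}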
3900 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3901 constant in BUILDER into an SVE predicate register. Return the register
3902 on success, otherwise return null. Use TARGET for the register if
3903 nonnull and convenient.
3905 ALLOW_RECURSE_P is true if we can use methods that would call this
3906 function recursively. */
3908 static rtx
3909 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3910 bool allow_recurse_p)
3912 if (builder.encoded_nelts () == 1)
3913 /* A PFALSE or a PTRUE .B ALL. */
3914 return aarch64_emit_set_immediate (target, builder);
3916 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3917 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3919 /* If we can load the constant using PTRUE, use it as-is. */
3920 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3921 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3922 return aarch64_emit_set_immediate (target, builder);
3924 /* Otherwise use WHILE to set the first VL bits. */
3925 return aarch64_sve_move_pred_via_while (target, mode, vl);
3928 if (!allow_recurse_p)
3929 return NULL_RTX;
3931 /* Try inverting the vector in element size ELT_SIZE and then EORing
3932 the result with an ELT_SIZE PTRUE. */
3933 if (INTVAL (builder.elt (0)) == 0)
3934 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
3935 elt_size))
3936 return res;
3938 /* Try using TRN1 to permute two simpler constants. */
3939 for (unsigned int i = elt_size; i <= 8; i *= 2)
3940 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
3941 elt_size, i))
3942 return res;
3944 return NULL_RTX;
3947 /* Return an SVE predicate register that contains the VNx16BImode
3948 constant in BUILDER, without going through the move expanders.
3950 The returned register can have whatever mode seems most natural
3951 given the contents of BUILDER. Use TARGET for the result if
3952 convenient. */
3954 static rtx
3955 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3957 /* Try loading the constant using pure predicate operations. */
3958 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
3959 return res;
3961 /* Try forcing the constant to memory. */
3962 if (builder.full_nelts ().is_constant ())
3963 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
3965 target = aarch64_target_reg (target, VNx16BImode);
3966 emit_move_insn (target, mem);
3967 return target;
3970 /* The last resort is to load the constant as an integer and then
3971 compare it against zero. Use -1 for set bits in order to increase
3972 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
3973 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
3974 builder.nelts_per_pattern ());
3975 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3976 int_builder.quick_push (INTVAL (builder.elt (i))
3977 ? constm1_rtx : const0_rtx);
3978 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
3979 int_builder.build ());
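/* A rough array model of the last-resort path above (sketch only;
   sketch_pred_via_data is a hypothetical name, not a GCC interface):
   build a data vector with -1 for each set predicate element, then
   recover the predicate by comparing that vector against zero.  */

static void
sketch_pred_via_data (const unsigned char *pred, signed char *bytes,
                      unsigned char *pred_out, unsigned int n)
{
  for (unsigned int i = 0; i < n; ++i)
    bytes[i] = pred[i] ? -1 : 0;        /* DUPM-friendly data constant.  */
  for (unsigned int i = 0; i < n; ++i)
    pred_out[i] = bytes[i] != 0;        /* Compare against zero.  */
}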
3982 /* Set DEST to immediate IMM. */
3984 void
3985 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3987 machine_mode mode = GET_MODE (dest);
3989 /* Check on what type of symbol it is. */
3990 scalar_int_mode int_mode;
3991 if ((GET_CODE (imm) == SYMBOL_REF
3992 || GET_CODE (imm) == LABEL_REF
3993 || GET_CODE (imm) == CONST
3994 || GET_CODE (imm) == CONST_POLY_INT)
3995 && is_a <scalar_int_mode> (mode, &int_mode))
3997 rtx mem;
3998 poly_int64 offset;
3999 HOST_WIDE_INT const_offset;
4000 enum aarch64_symbol_type sty;
4002 /* If we have (const (plus symbol offset)), separate out the offset
4003 before we start classifying the symbol. */
4004 rtx base = strip_offset (imm, &offset);
4006 /* We must always add an offset involving VL separately, rather than
4007 folding it into the relocation. */
4008 if (!offset.is_constant (&const_offset))
4010 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4011 emit_insn (gen_rtx_SET (dest, imm));
4012 else
4014 /* Do arithmetic on 32-bit values if the result is smaller
4015 than that. */
4016 if (partial_subreg_p (int_mode, SImode))
4018 /* It is invalid to do symbol calculations in modes
4019 narrower than SImode. */
4020 gcc_assert (base == const0_rtx);
4021 dest = gen_lowpart (SImode, dest);
4022 int_mode = SImode;
4024 if (base != const0_rtx)
4026 base = aarch64_force_temporary (int_mode, dest, base);
4027 aarch64_add_offset (int_mode, dest, base, offset,
4028 NULL_RTX, NULL_RTX, false);
4030 else
4031 aarch64_add_offset (int_mode, dest, base, offset,
4032 dest, NULL_RTX, false);
4034 return;
4037 sty = aarch64_classify_symbol (base, const_offset);
4038 switch (sty)
4040 case SYMBOL_FORCE_TO_MEM:
4041 if (const_offset != 0
4042 && targetm.cannot_force_const_mem (int_mode, imm))
4044 gcc_assert (can_create_pseudo_p ());
4045 base = aarch64_force_temporary (int_mode, dest, base);
4046 aarch64_add_offset (int_mode, dest, base, const_offset,
4047 NULL_RTX, NULL_RTX, false);
4048 return;
4051 mem = force_const_mem (ptr_mode, imm);
4052 gcc_assert (mem);
4054 /* If we aren't generating PC relative literals, then
4055 we need to expand the literal pool access carefully.
4056 This is something that needs to be done in a number
4057 of places, so could well live as a separate function. */
4058 if (!aarch64_pcrelative_literal_loads)
4060 gcc_assert (can_create_pseudo_p ());
4061 base = gen_reg_rtx (ptr_mode);
4062 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4063 if (ptr_mode != Pmode)
4064 base = convert_memory_address (Pmode, base);
4065 mem = gen_rtx_MEM (ptr_mode, base);
4068 if (int_mode != ptr_mode)
4069 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4071 emit_insn (gen_rtx_SET (dest, mem));
4073 return;
4075 case SYMBOL_SMALL_TLSGD:
4076 case SYMBOL_SMALL_TLSDESC:
4077 case SYMBOL_SMALL_TLSIE:
4078 case SYMBOL_SMALL_GOT_28K:
4079 case SYMBOL_SMALL_GOT_4G:
4080 case SYMBOL_TINY_GOT:
4081 case SYMBOL_TINY_TLSIE:
4082 if (const_offset != 0)
4084 gcc_assert (can_create_pseudo_p ());
4085 base = aarch64_force_temporary (int_mode, dest, base);
4086 aarch64_add_offset (int_mode, dest, base, const_offset,
4087 NULL_RTX, NULL_RTX, false);
4088 return;
4090 /* FALLTHRU */
4092 case SYMBOL_SMALL_ABSOLUTE:
4093 case SYMBOL_TINY_ABSOLUTE:
4094 case SYMBOL_TLSLE12:
4095 case SYMBOL_TLSLE24:
4096 case SYMBOL_TLSLE32:
4097 case SYMBOL_TLSLE48:
4098 aarch64_load_symref_appropriately (dest, imm, sty);
4099 return;
4101 default:
4102 gcc_unreachable ();
4106 if (!CONST_INT_P (imm))
4108 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4110 /* Only the low bit of each .H, .S and .D element is defined,
4111 so we can set the upper bits to whatever we like. If the
4112 predicate is all-true in MODE, prefer to set all the undefined
4113 bits as well, so that we can share a single .B predicate for
4114 all modes. */
4115 if (imm == CONSTM1_RTX (mode))
4116 imm = CONSTM1_RTX (VNx16BImode);
4118 /* All methods for constructing predicate modes wider than VNx16BI
4119 will set the upper bits of each element to zero. Expose this
4120 by moving such constants as a VNx16BI, so that all bits are
4121 significant and so that constants for different modes can be
4122 shared. The wider constant will still be available as a
4123 REG_EQUAL note. */
4124 rtx_vector_builder builder;
4125 if (aarch64_get_sve_pred_bits (builder, imm))
4127 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4128 if (dest != res)
4129 emit_move_insn (dest, gen_lowpart (mode, res));
4130 return;
4134 if (GET_CODE (imm) == HIGH
4135 || aarch64_simd_valid_immediate (imm, NULL))
4137 emit_insn (gen_rtx_SET (dest, imm));
4138 return;
4141 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4142 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4144 if (dest != res)
4145 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4146 return;
4149 rtx mem = force_const_mem (mode, imm);
4150 gcc_assert (mem);
4151 emit_move_insn (dest, mem);
4152 return;
4155 aarch64_internal_mov_immediate (dest, imm, true,
4156 as_a <scalar_int_mode> (mode));
4159 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4160 that is known to contain PTRUE. */
4162 void
4163 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4165 expand_operand ops[3];
4166 machine_mode mode = GET_MODE (dest);
4167 create_output_operand (&ops[0], dest, mode);
4168 create_input_operand (&ops[1], pred, GET_MODE (pred));
4169 create_input_operand (&ops[2], src, mode);
4170 temporary_volatile_ok v (true);
4171 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4174 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4175 operand is in memory. In this case we need to use the predicated LD1
4176 and ST1 instead of LDR and STR, both for correctness on big-endian
4177 targets and because LD1 and ST1 support a wider range of addressing modes.
4178 PRED_MODE is the mode of the predicate.
4180 See the comment at the head of aarch64-sve.md for details about the
4181 big-endian handling. */
4183 void
4184 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4186 machine_mode mode = GET_MODE (dest);
4187 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4188 if (!register_operand (src, mode)
4189 && !register_operand (dest, mode))
4191 rtx tmp = gen_reg_rtx (mode);
4192 if (MEM_P (src))
4193 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4194 else
4195 emit_move_insn (tmp, src);
4196 src = tmp;
4198 aarch64_emit_sve_pred_move (dest, ptrue, src);
4201 /* Called only on big-endian targets. See whether an SVE vector move
4202 from SRC to DEST is effectively a REV[BHW] instruction, because at
4203 least one operand is a subreg of an SVE vector that has wider or
4204 narrower elements. Return true and emit the instruction if so.
4206 For example:
4208 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4210 represents a VIEW_CONVERT between the following vectors, viewed
4211 in memory order:
4213 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4214 R1: { [0], [1], [2], [3], ... }
4216 The high part of lane X in R2 should therefore correspond to lane X*2
4217 of R1, but the register representations are:
4219 msb lsb
4220 R2: ...... [1].high [1].low [0].high [0].low
4221 R1: ...... [3] [2] [1] [0]
4223 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4224 We therefore need a reverse operation to swap the high and low values
4225 around.
4227 This is purely an optimization. Without it we would spill the
4228 subreg operand to the stack in one mode and reload it in the
4229 other mode, which has the same effect as the REV. */
4231 bool
4232 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4234 gcc_assert (BYTES_BIG_ENDIAN);
4235 if (GET_CODE (dest) == SUBREG)
4236 dest = SUBREG_REG (dest);
4237 if (GET_CODE (src) == SUBREG)
4238 src = SUBREG_REG (src);
4240 /* The optimization handles two single SVE REGs with different element
4241 sizes. */
4242 if (!REG_P (dest)
4243 || !REG_P (src)
4244 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4245 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4246 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4247 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4248 return false;
4250 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4251 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4252 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4253 UNSPEC_REV_SUBREG);
4254 emit_insn (gen_rtx_SET (dest, unspec));
4255 return true;
4258 /* Return a copy of X with mode MODE, without changing its other
4259 attributes. Unlike gen_lowpart, this doesn't care whether the
4260 mode change is valid. */
4262 static rtx
4263 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4265 if (GET_MODE (x) == mode)
4266 return x;
4268 x = shallow_copy_rtx (x);
4269 set_mode_and_regno (x, mode, REGNO (x));
4270 return x;
4273 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4274 operands. */
4276 void
4277 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4279 /* Decide which REV operation we need. The mode with narrower elements
4280 determines the mode of the operands and the mode with the wider
4281 elements determines the reverse width. */
4282 machine_mode mode_with_wider_elts = GET_MODE (dest);
4283 machine_mode mode_with_narrower_elts = GET_MODE (src);
4284 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4285 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4286 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4288 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4289 unsigned int unspec;
4290 if (wider_bytes == 8)
4291 unspec = UNSPEC_REV64;
4292 else if (wider_bytes == 4)
4293 unspec = UNSPEC_REV32;
4294 else if (wider_bytes == 2)
4295 unspec = UNSPEC_REV16;
4296 else
4297 gcc_unreachable ();
4298 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4300 /* Emit:
4302 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)] UNSPEC_PRED_X))
4304 with the appropriate modes. */
4305 ptrue = gen_lowpart (pred_mode, ptrue);
4306 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
4307 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
4308 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
4309 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
4310 UNSPEC_PRED_X);
4311 emit_insn (gen_rtx_SET (dest, src));
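/* A rough byte-level model of the REV<nn> chosen above (sketch only;
   sketch_rev_elements is a hypothetical name, not a GCC interface).
   Each WIDER_BYTES container has its NARROWER_BYTES sub-elements
   reversed in place; e.g. WIDER_BYTES == 8 and NARROWER_BYTES == 2
   models REV64 acting on .H elements.  */

static void
sketch_rev_elements (unsigned char *bytes, unsigned int nbytes,
                     unsigned int wider_bytes, unsigned int narrower_bytes)
{
  for (unsigned int base = 0; base < nbytes; base += wider_bytes)
    for (unsigned int i = 0; i < wider_bytes / 2; i += narrower_bytes)
      for (unsigned int j = 0; j < narrower_bytes; ++j)
        {
          unsigned int lo = base + i + j;
          unsigned int hi = base + wider_bytes - narrower_bytes - i + j;
          unsigned char tmp = bytes[lo];
          bytes[lo] = bytes[hi];
          bytes[hi] = tmp;
        }
}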
4314 static bool
4315 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4316 tree exp ATTRIBUTE_UNUSED)
4318 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4319 return false;
4321 return true;
4324 /* Implement TARGET_PASS_BY_REFERENCE. */
4326 static bool
4327 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4328 machine_mode mode,
4329 const_tree type,
4330 bool named ATTRIBUTE_UNUSED)
4332 HOST_WIDE_INT size;
4333 machine_mode dummymode;
4334 int nregs;
4336 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4337 if (mode == BLKmode && type)
4338 size = int_size_in_bytes (type);
4339 else
4340 /* No frontends can create types with variable-sized modes, so we
4341 shouldn't be asked to pass or return them. */
4342 size = GET_MODE_SIZE (mode).to_constant ();
4344 /* Aggregates are passed by reference based on their size. */
4345 if (type && AGGREGATE_TYPE_P (type))
4347 size = int_size_in_bytes (type);
4350 /* Variable-sized arguments are always passed by reference. */
4351 if (size < 0)
4352 return true;
4354 /* Can this be a candidate to be passed in fp/simd register(s)? */
4355 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4356 &dummymode, &nregs,
4357 NULL))
4358 return false;
4360 /* Arguments which are variable sized or larger than 2 registers are
4361 passed by reference unless they are a homogeneous floating-point
4362 aggregate. */
4363 return size > 2 * UNITS_PER_WORD;
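/* A rough model of the size rule above for arguments that are not
   SIMD/FP candidates (sketch only; sketch_passed_by_reference is a
   hypothetical name, not a GCC interface).  SIZE_IN_BYTES < 0 stands
   for a variable-sized type.  */

static int
sketch_passed_by_reference (long long size_in_bytes, int is_vfp_candidate)
{
  if (size_in_bytes < 0)
    return 1;                        /* Variable-sized: by reference.  */
  if (is_vfp_candidate)
    return 0;                        /* HFA/HVA: SIMD/FP registers.  */
  return size_in_bytes > 2 * 8;      /* More than two X registers.  */
}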
4366 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4367 static bool
4368 aarch64_return_in_msb (const_tree valtype)
4370 machine_mode dummy_mode;
4371 int dummy_int;
4373 /* Never happens in little-endian mode. */
4374 if (!BYTES_BIG_ENDIAN)
4375 return false;
4377 /* Only composite types smaller than or equal to 16 bytes can
4378 be potentially returned in registers. */
4379 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4380 || int_size_in_bytes (valtype) <= 0
4381 || int_size_in_bytes (valtype) > 16)
4382 return false;
4384 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4385 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4386 is always passed/returned in the least significant bits of fp/simd
4387 register(s). */
4388 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4389 &dummy_mode, &dummy_int, NULL))
4390 return false;
4392 return true;
4395 /* Implement TARGET_FUNCTION_VALUE.
4396 Define how to find the value returned by a function. */
4398 static rtx
4399 aarch64_function_value (const_tree type, const_tree func,
4400 bool outgoing ATTRIBUTE_UNUSED)
4402 machine_mode mode;
4403 int unsignedp;
4404 int count;
4405 machine_mode ag_mode;
4407 mode = TYPE_MODE (type);
4408 if (INTEGRAL_TYPE_P (type))
4409 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4411 if (aarch64_return_in_msb (type))
4413 HOST_WIDE_INT size = int_size_in_bytes (type);
4415 if (size % UNITS_PER_WORD != 0)
4417 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4418 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4422 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4423 &ag_mode, &count, NULL))
4425 if (!aarch64_composite_type_p (type, mode))
4427 gcc_assert (count == 1 && mode == ag_mode);
4428 return gen_rtx_REG (mode, V0_REGNUM);
4430 else
4432 int i;
4433 rtx par;
4435 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4436 for (i = 0; i < count; i++)
4438 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4439 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4440 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4441 XVECEXP (par, 0, i) = tmp;
4443 return par;
4446 else
4447 return gen_rtx_REG (mode, R0_REGNUM);
4450 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4451 Return true if REGNO is the number of a hard register in which the values
4452 of called function may come back. */
4454 static bool
4455 aarch64_function_value_regno_p (const unsigned int regno)
4457 /* Maximum of 16 bytes can be returned in the general registers. Examples
4458 of 16-byte return values are: 128-bit integers and 16-byte small
4459 structures (excluding homogeneous floating-point aggregates). */
4460 if (regno == R0_REGNUM || regno == R1_REGNUM)
4461 return true;
4463 /* Up to four fp/simd registers can return a function value, e.g. a
4464 homogeneous floating-point aggregate having four members. */
4465 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4466 return TARGET_FLOAT;
4468 return false;
4471 /* Implement TARGET_RETURN_IN_MEMORY.
4473 If the type T of the result of a function is such that
4474 void func (T arg)
4475 would require that arg be passed as a value in a register (or set of
4476 registers) according to the parameter passing rules, then the result
4477 is returned in the same registers as would be used for such an
4478 argument. */
4480 static bool
4481 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4483 HOST_WIDE_INT size;
4484 machine_mode ag_mode;
4485 int count;
4487 if (!AGGREGATE_TYPE_P (type)
4488 && TREE_CODE (type) != COMPLEX_TYPE
4489 && TREE_CODE (type) != VECTOR_TYPE)
4490 /* Simple scalar types are always returned in registers. */
4491 return false;
4493 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4494 type,
4495 &ag_mode,
4496 &count,
4497 NULL))
4498 return false;
4500 /* Types larger than 2 registers are returned in memory. */
4501 size = int_size_in_bytes (type);
4502 return (size < 0 || size > 2 * UNITS_PER_WORD);
4505 static bool
4506 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4507 const_tree type, int *nregs)
4509 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4510 return aarch64_vfp_is_call_or_return_candidate (mode,
4511 type,
4512 &pcum->aapcs_vfp_rmode,
4513 nregs,
4514 NULL);
4517 /* Given MODE and TYPE of a function argument, return the alignment in
4518 bits. The idea is to suppress any stronger alignment requested by
4519 the user and opt for the natural alignment (specified in AAPCS64 \S
4520 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4521 calculated in versions of GCC prior to GCC-9. This is a helper
4522 function for local use only. */
4524 static unsigned int
4525 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4526 bool *abi_break)
4528 *abi_break = false;
4529 if (!type)
4530 return GET_MODE_ALIGNMENT (mode);
4532 if (integer_zerop (TYPE_SIZE (type)))
4533 return 0;
4535 gcc_assert (TYPE_MODE (type) == mode);
4537 if (!AGGREGATE_TYPE_P (type))
4538 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4540 if (TREE_CODE (type) == ARRAY_TYPE)
4541 return TYPE_ALIGN (TREE_TYPE (type));
4543 unsigned int alignment = 0;
4544 unsigned int bitfield_alignment = 0;
4545 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4546 if (TREE_CODE (field) == FIELD_DECL)
4548 alignment = std::max (alignment, DECL_ALIGN (field));
4549 if (DECL_BIT_FIELD_TYPE (field))
4550 bitfield_alignment
4551 = std::max (bitfield_alignment,
4552 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4555 if (bitfield_alignment > alignment)
4557 *abi_break = true;
4558 return bitfield_alignment;
4561 return alignment;
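/* A rough model of the aggregate case above (sketch only;
   sketch_arg_alignment is a hypothetical name, not a GCC interface).
   FIELD_ALIGN[i] is the declared alignment of member I in bits and
   BITFIELD_TYPE_ALIGN[i] is the alignment of its declared bit-field
   type, or 0 if member I is not a bit-field.  The over-aligned
   bit-field case is the one that sets *ABI_BREAK and triggers the
   GCC 9.1 psABI note.  */

static unsigned int
sketch_arg_alignment (const unsigned int *field_align,
                      const unsigned int *bitfield_type_align,
                      unsigned int nfields, int *abi_break)
{
  unsigned int alignment = 0, bitfield_alignment = 0;
  *abi_break = 0;
  for (unsigned int i = 0; i < nfields; ++i)
    {
      if (field_align[i] > alignment)
        alignment = field_align[i];
      if (bitfield_type_align[i] > bitfield_alignment)
        bitfield_alignment = bitfield_type_align[i];
    }
  if (bitfield_alignment > alignment)
    {
      *abi_break = 1;
      return bitfield_alignment;
    }
  return alignment;
}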
4564 /* Layout a function argument according to the AAPCS64 rules. The rule
4565 numbers refer to the rule numbers in the AAPCS64. */
4567 static void
4568 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4569 const_tree type,
4570 bool named ATTRIBUTE_UNUSED)
4572 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4573 int ncrn, nvrn, nregs;
4574 bool allocate_ncrn, allocate_nvrn;
4575 HOST_WIDE_INT size;
4576 bool abi_break;
4578 /* We need to do this once per argument. */
4579 if (pcum->aapcs_arg_processed)
4580 return;
4582 pcum->aapcs_arg_processed = true;
4584 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4585 if (type)
4586 size = int_size_in_bytes (type);
4587 else
4588 /* No frontends can create types with variable-sized modes, so we
4589 shouldn't be asked to pass or return them. */
4590 size = GET_MODE_SIZE (mode).to_constant ();
4591 size = ROUND_UP (size, UNITS_PER_WORD);
4593 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4594 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4595 mode,
4596 type,
4597 &nregs);
4599 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4600 The following code thus handles passing by SIMD/FP registers first. */
4602 nvrn = pcum->aapcs_nvrn;
4604 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4605 and homogeneous short-vector aggregates (HVA). */
4606 if (allocate_nvrn)
4608 if (!TARGET_FLOAT)
4609 aarch64_err_no_fpadvsimd (mode);
4611 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4613 pcum->aapcs_nextnvrn = nvrn + nregs;
4614 if (!aarch64_composite_type_p (type, mode))
4616 gcc_assert (nregs == 1);
4617 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4619 else
4621 rtx par;
4622 int i;
4623 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4624 for (i = 0; i < nregs; i++)
4626 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4627 V0_REGNUM + nvrn + i);
4628 rtx offset = gen_int_mode
4629 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4630 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4631 XVECEXP (par, 0, i) = tmp;
4633 pcum->aapcs_reg = par;
4635 return;
4637 else
4639 /* C.3 NSRN is set to 8. */
4640 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4641 goto on_stack;
4645 ncrn = pcum->aapcs_ncrn;
4646 nregs = size / UNITS_PER_WORD;
4648 /* C6 - C9, though the sign and zero extension semantics are
4649 handled elsewhere. This is the case where the argument fits
4650 entirely in general registers. */
4651 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4653 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4655 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
4656 rounded up to the next even number. */
4657 if (nregs == 2
4658 && ncrn % 2
4659 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4660 comparison is there because for > 16 * BITS_PER_UNIT
4661 alignment nregs should be > 2 and therefore it should be
4662 passed by reference rather than value. */
4663 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4664 == 16 * BITS_PER_UNIT))
4666 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4667 inform (input_location, "parameter passing for argument of type "
4668 "%qT changed in GCC 9.1", type);
4669 ++ncrn;
4670 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4673 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4674 A reg is still generated for it, but the caller should be smart
4675 enough not to use it. */
4676 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4677 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4678 else
4680 rtx par;
4681 int i;
4683 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4684 for (i = 0; i < nregs; i++)
4686 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4687 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4688 GEN_INT (i * UNITS_PER_WORD));
4689 XVECEXP (par, 0, i) = tmp;
4691 pcum->aapcs_reg = par;
4694 pcum->aapcs_nextncrn = ncrn + nregs;
4695 return;
4698 /* C.11 */
4699 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4701 /* The argument is passed on stack; record the needed number of words for
4702 this argument and align the total size if necessary. */
4703 on_stack:
4704 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4706 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4707 == 16 * BITS_PER_UNIT)
4709 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4710 if (pcum->aapcs_stack_size != new_size)
4712 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4713 inform (input_location, "parameter passing for argument of type "
4714 "%qT changed in GCC 9.1", type);
4715 pcum->aapcs_stack_size = new_size;
4718 return;
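/* A rough model of rule C.8 above (sketch only; sketch_round_ngrn is a
   hypothetical name, not a GCC interface).  The rounding only applies
   when the argument needs two general registers, the next candidate
   register number is odd and the alignment is exactly 16 bytes.  For
   example, an __int128 argument following a single int goes in x2/x3,
   leaving x1 unused.  */

static unsigned int
sketch_round_ngrn (unsigned int ncrn, unsigned int nregs,
                   unsigned int align_bits)
{
  if (nregs == 2 && (ncrn % 2) != 0 && align_bits == 16 * 8)
    ++ncrn;
  return ncrn;
}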
4721 /* Implement TARGET_FUNCTION_ARG. */
4723 static rtx
4724 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4725 const_tree type, bool named)
4727 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4728 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4730 if (mode == VOIDmode)
4731 return NULL_RTX;
4733 aarch64_layout_arg (pcum_v, mode, type, named);
4734 return pcum->aapcs_reg;
4737 void
4738 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4739 const_tree fntype ATTRIBUTE_UNUSED,
4740 rtx libname ATTRIBUTE_UNUSED,
4741 const_tree fndecl ATTRIBUTE_UNUSED,
4742 unsigned n_named ATTRIBUTE_UNUSED)
4744 pcum->aapcs_ncrn = 0;
4745 pcum->aapcs_nvrn = 0;
4746 pcum->aapcs_nextncrn = 0;
4747 pcum->aapcs_nextnvrn = 0;
4748 pcum->pcs_variant = ARM_PCS_AAPCS64;
4749 pcum->aapcs_reg = NULL_RTX;
4750 pcum->aapcs_arg_processed = false;
4751 pcum->aapcs_stack_words = 0;
4752 pcum->aapcs_stack_size = 0;
4754 if (!TARGET_FLOAT
4755 && fndecl && TREE_PUBLIC (fndecl)
4756 && fntype && fntype != error_mark_node)
4758 const_tree type = TREE_TYPE (fntype);
4759 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4760 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4761 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4762 &mode, &nregs, NULL))
4763 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4765 return;
4768 static void
4769 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4770 machine_mode mode,
4771 const_tree type,
4772 bool named)
4774 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4775 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4777 aarch64_layout_arg (pcum_v, mode, type, named);
4778 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4779 != (pcum->aapcs_stack_words != 0));
4780 pcum->aapcs_arg_processed = false;
4781 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4782 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4783 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4784 pcum->aapcs_stack_words = 0;
4785 pcum->aapcs_reg = NULL_RTX;
4789 bool
4790 aarch64_function_arg_regno_p (unsigned regno)
4792 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4793 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4796 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4797 PARM_BOUNDARY bits of alignment, but will be given anything up
4798 to STACK_BOUNDARY bits if the type requires it. This makes sure
4799 that both before and after the layout of each argument, the Next
4800 Stacked Argument Address (NSAA) will have a minimum alignment of
4801 8 bytes. */
4803 static unsigned int
4804 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4806 bool abi_break;
4807 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4808 &abi_break);
4809 if (abi_break && warn_psabi)
4810 inform (input_location, "parameter passing for argument of type "
4811 "%qT changed in GCC 9.1", type);
4813 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4816 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4818 static fixed_size_mode
4819 aarch64_get_reg_raw_mode (int regno)
4821 if (TARGET_SVE && FP_REGNUM_P (regno))
4822 /* Don't use the SVE part of the register for __builtin_apply and
4823 __builtin_return. The SVE registers aren't used by the normal PCS,
4824 so using them there would be a waste of time. The PCS extensions
4825 for SVE types are fundamentally incompatible with the
4826 __builtin_return/__builtin_apply interface. */
4827 return as_a <fixed_size_mode> (V16QImode);
4828 return default_get_reg_raw_mode (regno);
4831 /* Implement TARGET_FUNCTION_ARG_PADDING.
4833 Small aggregate types are placed in the lowest memory address.
4835 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4837 static pad_direction
4838 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4840 /* On little-endian targets, the least significant byte of every stack
4841 argument is passed at the lowest byte address of the stack slot. */
4842 if (!BYTES_BIG_ENDIAN)
4843 return PAD_UPWARD;
4845 /* Otherwise, integral, floating-point and pointer types are padded downward:
4846 the least significant byte of a stack argument is passed at the highest
4847 byte address of the stack slot. */
4848 if (type
4849 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4850 || POINTER_TYPE_P (type))
4851 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4852 return PAD_DOWNWARD;
4854 /* Everything else is padded upward, i.e. data in the first byte of the stack slot. */
4855 return PAD_UPWARD;
4858 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4860 It specifies padding for the last (and possibly only)
4861 element of a block move between registers and memory. If the
4862 block is assumed to be in memory, padding upward means that the
4863 last element is padded after its most significant byte, while
4864 with downward padding the last element is padded on its least
4865 significant byte side.
4867 Small aggregates and small complex types are always padded
4868 upwards.
4870 We don't need to worry about homogeneous floating-point or
4871 short-vector aggregates; their move is not affected by the
4872 padding direction determined here. Regardless of endianness,
4873 each element of such an aggregate is put in the least
4874 significant bits of a fp/simd register.
4876 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4877 register has useful data, and return the opposite if the most
4878 significant byte does. */
4880 bool
4881 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4882 bool first ATTRIBUTE_UNUSED)
4885 /* Small composite types are always padded upward. */
4886 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4888 HOST_WIDE_INT size;
4889 if (type)
4890 size = int_size_in_bytes (type);
4891 else
4892 /* No frontends can create types with variable-sized modes, so we
4893 shouldn't be asked to pass or return them. */
4894 size = GET_MODE_SIZE (mode).to_constant ();
4895 if (size < 2 * UNITS_PER_WORD)
4896 return true;
4899 /* Otherwise, use the default padding. */
4900 return !BYTES_BIG_ENDIAN;
4903 static scalar_int_mode
4904 aarch64_libgcc_cmp_return_mode (void)
4906 return SImode;
4909 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4911 /* We use the 12-bit shifted immediate arithmetic instructions so values
4912 must be multiple of (1 << 12), i.e. 4096. */
4913 #define ARITH_FACTOR 4096
4915 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4916 #error Cannot use simple address calculation for stack probing
4917 #endif
4919 /* The pair of scratch registers used for stack probing. */
4920 #define PROBE_STACK_FIRST_REG R9_REGNUM
4921 #define PROBE_STACK_SECOND_REG R10_REGNUM
4923 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4924 inclusive. These are offsets from the current stack pointer. */
4926 static void
4927 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4929 HOST_WIDE_INT size;
4930 if (!poly_size.is_constant (&size))
4932 sorry ("stack probes for SVE frames");
4933 return;
4936 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4938 /* See the same assertion on PROBE_INTERVAL above. */
4939 gcc_assert ((first % ARITH_FACTOR) == 0);
4941 /* See if we have a constant small number of probes to generate. If so,
4942 that's the easy case. */
4943 if (size <= PROBE_INTERVAL)
4945 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4947 emit_set_insn (reg1,
4948 plus_constant (Pmode,
4949 stack_pointer_rtx, -(first + base)));
4950 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4953 /* The run-time loop is made up of 8 insns in the generic case while the
4954 compile-time loop is made up of 4 + 2*(n-2) insns for n intervals. */
4955 else if (size <= 4 * PROBE_INTERVAL)
4957 HOST_WIDE_INT i, rem;
4959 emit_set_insn (reg1,
4960 plus_constant (Pmode,
4961 stack_pointer_rtx,
4962 -(first + PROBE_INTERVAL)));
4963 emit_stack_probe (reg1);
4965 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4966 it exceeds SIZE. If only two probes are needed, this will not
4967 generate any code. Then probe at FIRST + SIZE. */
4968 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4970 emit_set_insn (reg1,
4971 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4972 emit_stack_probe (reg1);
4975 rem = size - (i - PROBE_INTERVAL);
4976 if (rem > 256)
4978 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4980 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4981 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4983 else
4984 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4987 /* Otherwise, do the same as above, but in a loop. Note that we must be
4988 extra careful with variables wrapping around because we might be at
4989 the very top (or the very bottom) of the address space and we have
4990 to be able to handle this case properly; in particular, we use an
4991 equality test for the loop condition. */
4992 else
4994 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4996 /* Step 1: round SIZE to the previous multiple of the interval. */
4998 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5001 /* Step 2: compute initial and final value of the loop counter. */
5003 /* TEST_ADDR = SP + FIRST. */
5004 emit_set_insn (reg1,
5005 plus_constant (Pmode, stack_pointer_rtx, -first));
5007 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5008 HOST_WIDE_INT adjustment = - (first + rounded_size);
5009 if (! aarch64_uimm12_shift (adjustment))
5011 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5012 true, Pmode);
5013 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5015 else
5016 emit_set_insn (reg2,
5017 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5019 /* Step 3: the loop
5023 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5024 probe at TEST_ADDR
5026 while (TEST_ADDR != LAST_ADDR)
5028 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5029 until it is equal to ROUNDED_SIZE. */
5031 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5034 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5035 that SIZE is equal to ROUNDED_SIZE. */
5037 if (size != rounded_size)
5039 HOST_WIDE_INT rem = size - rounded_size;
5041 if (rem > 256)
5043 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5045 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5046 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5048 else
5049 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5053 /* Make sure nothing is scheduled before we are done. */
5054 emit_insn (gen_blockage ());
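/* A rough model of the offsets probed above in the constant-size cases
   (sketch only; sketch_probe_offsets is a hypothetical name, not a GCC
   interface).  Probes land roughly at FIRST + N * INTERVAL for
   N = 1, 2, ... below SIZE, with a final probe at FIRST + SIZE.  */

static unsigned int
sketch_probe_offsets (long long first, long long size, long long interval,
                      long long *offsets)
{
  unsigned int n = 0;
  for (long long off = interval; off < size; off += interval)
    offsets[n++] = first + off;
  offsets[n++] = first + size;
  return n;
}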
5057 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5058 absolute addresses. */
5060 const char *
5061 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5063 static int labelno = 0;
5064 char loop_lab[32];
5065 rtx xops[2];
5067 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5069 /* Loop. */
5070 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5072 HOST_WIDE_INT stack_clash_probe_interval
5073 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5075 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5076 xops[0] = reg1;
5077 HOST_WIDE_INT interval;
5078 if (flag_stack_clash_protection)
5079 interval = stack_clash_probe_interval;
5080 else
5081 interval = PROBE_INTERVAL;
5083 gcc_assert (aarch64_uimm12_shift (interval));
5084 xops[1] = GEN_INT (interval);
5086 output_asm_insn ("sub\t%0, %0, %1", xops);
5088 /* If doing stack clash protection then we probe up by the ABI specified
5089 amount. We do this because we're dropping full pages at a time in the
5090 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5091 if (flag_stack_clash_protection)
5092 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5093 else
5094 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5096 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5097 by this amount for each iteration. */
5098 output_asm_insn ("str\txzr, [%0, %1]", xops);
5100 /* Test if TEST_ADDR == LAST_ADDR. */
5101 xops[1] = reg2;
5102 output_asm_insn ("cmp\t%0, %1", xops);
5104 /* Branch. */
5105 fputs ("\tb.ne\t", asm_out_file);
5106 assemble_name_raw (asm_out_file, loop_lab);
5107 fputc ('\n', asm_out_file);
5109 return "";
5112 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5113 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5114 of GUARD_SIZE. When a probe is emitted it is done at most
5115 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5116 at most MIN_PROBE_THRESHOLD. By the end of this function
5117 BASE = BASE - ADJUSTMENT. */
5119 const char *
5120 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5121 rtx min_probe_threshold, rtx guard_size)
5123 /* This function is not allowed to use any instruction generation function
5124 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5125 so instead emit the code you want using output_asm_insn. */
5126 gcc_assert (flag_stack_clash_protection);
5127 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5128 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5130 /* The minimum required allocation before the residual requires probing. */
5131 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5133 /* Clamp the value down to the nearest value that can be used with a cmp. */
5134 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5135 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5137 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5138 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5140 static int labelno = 0;
5141 char loop_start_lab[32];
5142 char loop_end_lab[32];
5143 rtx xops[2];
5145 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5146 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5148 /* Emit loop start label. */
5149 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5151 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5152 xops[0] = adjustment;
5153 xops[1] = probe_offset_value_rtx;
5154 output_asm_insn ("cmp\t%0, %1", xops);
5156 /* Branch to end if not enough adjustment to probe. */
5157 fputs ("\tb.lt\t", asm_out_file);
5158 assemble_name_raw (asm_out_file, loop_end_lab);
5159 fputc ('\n', asm_out_file);
5161 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5162 xops[0] = base;
5163 xops[1] = probe_offset_value_rtx;
5164 output_asm_insn ("sub\t%0, %0, %1", xops);
5166 /* Probe at BASE. */
5167 xops[1] = const0_rtx;
5168 output_asm_insn ("str\txzr, [%0, %1]", xops);
5170 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5171 xops[0] = adjustment;
5172 xops[1] = probe_offset_value_rtx;
5173 output_asm_insn ("sub\t%0, %0, %1", xops);
5175 /* Branch to start if still more bytes to allocate. */
5176 fputs ("\tb\t", asm_out_file);
5177 assemble_name_raw (asm_out_file, loop_start_lab);
5178 fputc ('\n', asm_out_file);
5180 /* Loop exit: the remaining adjustment is below the probe threshold. */
5181 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5183 /* BASE = BASE - ADJUSTMENT. */
5184 xops[0] = base;
5185 xops[1] = adjustment;
5186 output_asm_insn ("sub\t%0, %0, %1", xops);
5187 return "";
5190 /* Determine whether a frame chain needs to be generated. */
5191 static bool
5192 aarch64_needs_frame_chain (void)
5194 /* Force a frame chain for EH returns so the return address is at FP+8. */
5195 if (frame_pointer_needed || crtl->calls_eh_return)
5196 return true;
5198 /* A leaf function cannot have calls or write LR. */
5199 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5201 /* Don't use a frame chain in leaf functions if leaf frame pointers
5202 are disabled. */
5203 if (flag_omit_leaf_frame_pointer && is_leaf)
5204 return false;
5206 return aarch64_use_frame_pointer;
5209 /* Mark the registers that need to be saved by the callee and calculate
5210 the size of the callee-saved registers area and frame record (both FP
5211 and LR may be omitted). */
5212 static void
5213 aarch64_layout_frame (void)
5215 HOST_WIDE_INT offset = 0;
5216 int regno, last_fp_reg = INVALID_REGNUM;
5217 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5219 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5221 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5222 the mid-end is doing. */
5223 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5225 #define SLOT_NOT_REQUIRED (-2)
5226 #define SLOT_REQUIRED (-1)
5228 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5229 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5231 /* If this is a non-leaf simd function with calls we assume that
5232 at least one of those calls is to a non-simd function and thus
5233 we must save V8 to V23 in the prologue. */
5235 if (simd_function && !crtl->is_leaf)
5237 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5238 if (FP_SIMD_SAVED_REGNUM_P (regno))
5239 df_set_regs_ever_live (regno, true);
5242 /* First mark all the registers that really need to be saved... */
5243 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5244 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5246 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5247 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5249 /* ... that includes the eh data registers (if needed)... */
5250 if (crtl->calls_eh_return)
5251 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5252 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5253 = SLOT_REQUIRED;
5255 /* ... and any callee saved register that dataflow says is live. */
5256 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5257 if (df_regs_ever_live_p (regno)
5258 && (regno == R30_REGNUM
5259 || !call_used_regs[regno]))
5260 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5262 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5263 if (df_regs_ever_live_p (regno)
5264 && (!call_used_regs[regno]
5265 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5267 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5268 last_fp_reg = regno;
5271 if (cfun->machine->frame.emit_frame_chain)
5273 /* FP and LR are placed in the linkage record. */
5274 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5275 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5276 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5277 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5278 offset = 2 * UNITS_PER_WORD;
5281 /* With stack-clash, LR must be saved in non-leaf functions. */
5282 gcc_assert (crtl->is_leaf
5283 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5284 != SLOT_NOT_REQUIRED));
5286 /* Now assign stack slots for them. */
5287 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5288 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5290 cfun->machine->frame.reg_offset[regno] = offset;
5291 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5292 cfun->machine->frame.wb_candidate1 = regno;
5293 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5294 cfun->machine->frame.wb_candidate2 = regno;
5295 offset += UNITS_PER_WORD;
5298 HOST_WIDE_INT max_int_offset = offset;
5299 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5300 bool has_align_gap = offset != max_int_offset;
5302 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5303 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5305 /* If there is an alignment gap between integer and fp callee-saves,
5306 allocate the last fp register to it if possible. */
5307 if (regno == last_fp_reg
5308 && has_align_gap
5309 && !simd_function
5310 && (offset & 8) == 0)
5312 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5313 break;
5316 cfun->machine->frame.reg_offset[regno] = offset;
5317 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5318 cfun->machine->frame.wb_candidate1 = regno;
5319 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5320 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5321 cfun->machine->frame.wb_candidate2 = regno;
5322 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5325 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5327 cfun->machine->frame.saved_regs_size = offset;
5329 HOST_WIDE_INT varargs_and_saved_regs_size
5330 = offset + cfun->machine->frame.saved_varargs_size;
5332 cfun->machine->frame.hard_fp_offset
5333 = aligned_upper_bound (varargs_and_saved_regs_size
5334 + get_frame_size (),
5335 STACK_BOUNDARY / BITS_PER_UNIT);
5337 /* Both these values are already aligned. */
5338 gcc_assert (multiple_p (crtl->outgoing_args_size,
5339 STACK_BOUNDARY / BITS_PER_UNIT));
5340 cfun->machine->frame.frame_size
5341 = (cfun->machine->frame.hard_fp_offset
5342 + crtl->outgoing_args_size);
5344 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5346 cfun->machine->frame.initial_adjust = 0;
5347 cfun->machine->frame.final_adjust = 0;
5348 cfun->machine->frame.callee_adjust = 0;
5349 cfun->machine->frame.callee_offset = 0;
5351 HOST_WIDE_INT max_push_offset = 0;
5352 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5353 max_push_offset = 512;
5354 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5355 max_push_offset = 256;
5357 HOST_WIDE_INT const_size, const_fp_offset;
5358 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5359 && const_size < max_push_offset
5360 && known_eq (crtl->outgoing_args_size, 0))
5362 /* Simple, small frame with no outgoing arguments:
5363 stp reg1, reg2, [sp, -frame_size]!
5364 stp reg3, reg4, [sp, 16] */
5365 cfun->machine->frame.callee_adjust = const_size;
5367 else if (known_lt (crtl->outgoing_args_size
5368 + cfun->machine->frame.saved_regs_size, 512)
5369 && !(cfun->calls_alloca
5370 && known_lt (cfun->machine->frame.hard_fp_offset,
5371 max_push_offset)))
5373 /* Frame with small outgoing arguments:
5374 sub sp, sp, frame_size
5375 stp reg1, reg2, [sp, outgoing_args_size]
5376 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5377 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5378 cfun->machine->frame.callee_offset
5379 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5381 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5382 && const_fp_offset < max_push_offset)
5384 /* Frame with large outgoing arguments but a small local area:
5385 stp reg1, reg2, [sp, -hard_fp_offset]!
5386 stp reg3, reg4, [sp, 16]
5387 sub sp, sp, outgoing_args_size */
5388 cfun->machine->frame.callee_adjust = const_fp_offset;
5389 cfun->machine->frame.final_adjust
5390 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5392 else
5394 /* Frame with large local area and outgoing arguments using frame pointer:
5395 sub sp, sp, hard_fp_offset
5396 stp x29, x30, [sp, 0]
5397 add x29, sp, 0
5398 stp reg3, reg4, [sp, 16]
5399 sub sp, sp, outgoing_args_size */
5400 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5401 cfun->machine->frame.final_adjust
5402 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5405 cfun->machine->frame.laid_out = true;
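/* A rough model of the allocation strategy choice above, with every size
   assumed to be a compile-time constant (sketch only;
   sketch_frame_strategy is a hypothetical name, not a GCC interface).
   The return values 0..3 correspond, in order, to the four commented
   stp/sub sequences above.  */

static int
sketch_frame_strategy (long long frame_size, long long outgoing_args_size,
                       long long saved_regs_size, long long hard_fp_offset,
                       long long max_push_offset, int calls_alloca)
{
  if (frame_size < max_push_offset && outgoing_args_size == 0)
    return 0;  /* Small frame, no outgoing args: single writeback push.  */
  if (outgoing_args_size + saved_regs_size < 512
      && !(calls_alloca && hard_fp_offset < max_push_offset))
    return 1;  /* Small outgoing args: one sub, saves above the args.  */
  if (hard_fp_offset < max_push_offset)
    return 2;  /* Large outgoing args, small locals: push then sub.  */
  return 3;    /* Large locals and outgoing args: frame pointer case.  */
}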
5408 /* Return true if the register REGNO is saved on entry to
5409 the current function. */
5411 static bool
5412 aarch64_register_saved_on_entry (int regno)
5414 return cfun->machine->frame.reg_offset[regno] >= 0;
5417 /* Return the next register up from REGNO up to LIMIT for the callee
5418 to save. */
5420 static unsigned
5421 aarch64_next_callee_save (unsigned regno, unsigned limit)
5423 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5424 regno ++;
5425 return regno;
5428 /* Push the register number REGNO of mode MODE to the stack with write-back
5429 adjusting the stack by ADJUSTMENT. */
5431 static void
5432 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5433 HOST_WIDE_INT adjustment)
5435 rtx base_rtx = stack_pointer_rtx;
5436 rtx insn, reg, mem;
5438 reg = gen_rtx_REG (mode, regno);
5439 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5440 plus_constant (Pmode, base_rtx, -adjustment));
5441 mem = gen_frame_mem (mode, mem);
5443 insn = emit_move_insn (mem, reg);
5444 RTX_FRAME_RELATED_P (insn) = 1;
5447 /* Generate and return an instruction to store the pair of registers
5448 REG and REG2 of mode MODE to location BASE with write-back adjusting
5449 the stack location BASE by ADJUSTMENT. */
5451 static rtx
5452 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5453 HOST_WIDE_INT adjustment)
5455 switch (mode)
5457 case E_DImode:
5458 return gen_storewb_pairdi_di (base, base, reg, reg2,
5459 GEN_INT (-adjustment),
5460 GEN_INT (UNITS_PER_WORD - adjustment));
5461 case E_DFmode:
5462 return gen_storewb_pairdf_di (base, base, reg, reg2,
5463 GEN_INT (-adjustment),
5464 GEN_INT (UNITS_PER_WORD - adjustment));
5465 case E_TFmode:
5466 return gen_storewb_pairtf_di (base, base, reg, reg2,
5467 GEN_INT (-adjustment),
5468 GEN_INT (UNITS_PER_VREG - adjustment));
5469 default:
5470 gcc_unreachable ();
5474 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5475 stack pointer by ADJUSTMENT. */
5477 static void
5478 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5480 rtx_insn *insn;
5481 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5483 if (regno2 == INVALID_REGNUM)
5484 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5486 rtx reg1 = gen_rtx_REG (mode, regno1);
5487 rtx reg2 = gen_rtx_REG (mode, regno2);
5489 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5490 reg2, adjustment));
5491 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5492 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5493 RTX_FRAME_RELATED_P (insn) = 1;
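/* Illustrative example (assumed values, not from the original source):
   with regno1 == R19_REGNUM, regno2 == R20_REGNUM, DImode saves and an
   adjustment of 32, the sequence above emits the equivalent of
   "stp x19, x20, [sp, -32]!": the stack pointer drops by 32 bytes and
   both registers are stored into the newly allocated space.  */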
5496 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5497 adjusting it by ADJUSTMENT afterwards. */
5499 static rtx
5500 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5501 HOST_WIDE_INT adjustment)
5503 switch (mode)
5505 case E_DImode:
5506 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5507 GEN_INT (UNITS_PER_WORD));
5508 case E_DFmode:
5509 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5510 GEN_INT (UNITS_PER_WORD));
5511 case E_TFmode:
5512 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5513 GEN_INT (UNITS_PER_VREG));
5514 default:
5515 gcc_unreachable ();
5519 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5520 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5521 into CFI_OPS. */
5523 static void
5524 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5525 rtx *cfi_ops)
5527 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5528 rtx reg1 = gen_rtx_REG (mode, regno1);
5530 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5532 if (regno2 == INVALID_REGNUM)
5534 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5535 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5536 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5538 else
5540 rtx reg2 = gen_rtx_REG (mode, regno2);
5541 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5542 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5543 reg2, adjustment));
5547 /* Generate and return a store pair instruction of mode MODE to store
5548 register REG1 to MEM1 and register REG2 to MEM2. */
5550 static rtx
5551 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5552 rtx reg2)
5554 switch (mode)
5556 case E_DImode:
5557 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5559 case E_DFmode:
5560 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5562 case E_TFmode:
5563 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5565 default:
5566 gcc_unreachable ();
5570 /* Generate and return a load pair instruction of mode MODE to load register
5571 REG1 from MEM1 and register REG2 from MEM2. */
5573 static rtx
5574 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5575 rtx mem2)
5577 switch (mode)
5579 case E_DImode:
5580 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5582 case E_DFmode:
5583 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5585 case E_TFmode:
5586 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5588 default:
5589 gcc_unreachable ();
5593 /* Return TRUE if return address signing should be enabled for the current
5594 function, otherwise return FALSE. */
5596 bool
5597 aarch64_return_address_signing_enabled (void)
5599 /* This function should only be called after the frame is laid out. */
5600 gcc_assert (cfun->machine->frame.laid_out);
5602 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5603 if its LR is pushed onto the stack. */
5604 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5605 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5606 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5609 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5610 bool
5611 aarch64_bti_enabled (void)
5613 return (aarch64_enable_bti == 1);
5616 /* Emit code to save the callee-saved registers from register number START
5617 to LIMIT to the stack at the location starting at offset START_OFFSET,
5618 skipping any write-back candidates if SKIP_WB is true. */
5620 static void
5621 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5622 unsigned start, unsigned limit, bool skip_wb)
5624 rtx_insn *insn;
5625 unsigned regno;
5626 unsigned regno2;
5628 for (regno = aarch64_next_callee_save (start, limit);
5629 regno <= limit;
5630 regno = aarch64_next_callee_save (regno + 1, limit))
5632 rtx reg, mem;
5633 poly_int64 offset;
5634 int offset_diff;
5636 if (skip_wb
5637 && (regno == cfun->machine->frame.wb_candidate1
5638 || regno == cfun->machine->frame.wb_candidate2))
5639 continue;
5641 if (cfun->machine->reg_is_wrapped_separately[regno])
5642 continue;
5644 reg = gen_rtx_REG (mode, regno);
5645 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5646 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5647 offset));
5649 regno2 = aarch64_next_callee_save (regno + 1, limit);
5650 offset_diff = cfun->machine->frame.reg_offset[regno2]
5651 - cfun->machine->frame.reg_offset[regno];
5653 if (regno2 <= limit
5654 && !cfun->machine->reg_is_wrapped_separately[regno2]
5655 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5657 rtx reg2 = gen_rtx_REG (mode, regno2);
5658 rtx mem2;
5660 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5661 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5662 offset));
5663 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5664 reg2));
5666 /* The first part of a frame-related parallel insn is
5667 always assumed to be relevant to the frame
5668 calculations; subsequent parts are only
5669 frame-related if explicitly marked. */
5670 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5671 regno = regno2;
5673 else
5674 insn = emit_move_insn (mem, reg);
5676 RTX_FRAME_RELATED_P (insn) = 1;
5680 /* Emit code to restore the callee registers of mode MODE from register
5681 number START up to and including LIMIT. Restore from the stack offset
5682 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5683 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5685 static void
5686 aarch64_restore_callee_saves (machine_mode mode,
5687 poly_int64 start_offset, unsigned start,
5688 unsigned limit, bool skip_wb, rtx *cfi_ops)
5690 rtx base_rtx = stack_pointer_rtx;
5691 unsigned regno;
5692 unsigned regno2;
5693 poly_int64 offset;
5695 for (regno = aarch64_next_callee_save (start, limit);
5696 regno <= limit;
5697 regno = aarch64_next_callee_save (regno + 1, limit))
5699 if (cfun->machine->reg_is_wrapped_separately[regno])
5700 continue;
5702 rtx reg, mem;
5703 int offset_diff;
5705 if (skip_wb
5706 && (regno == cfun->machine->frame.wb_candidate1
5707 || regno == cfun->machine->frame.wb_candidate2))
5708 continue;
5710 reg = gen_rtx_REG (mode, regno);
5711 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5712 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5714 regno2 = aarch64_next_callee_save (regno + 1, limit);
5715 offset_diff = cfun->machine->frame.reg_offset[regno2]
5716 - cfun->machine->frame.reg_offset[regno];
5718 if (regno2 <= limit
5719 && !cfun->machine->reg_is_wrapped_separately[regno2]
5720 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5722 rtx reg2 = gen_rtx_REG (mode, regno2);
5723 rtx mem2;
5725 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5726 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5727 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5729 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5730 regno = regno2;
5732 else
5733 emit_move_insn (reg, mem);
5734 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5738 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5739 of MODE. */
5741 static inline bool
5742 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5744 HOST_WIDE_INT multiple;
5745 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5746 && IN_RANGE (multiple, -8, 7));
5749 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5750 of MODE. */
5752 static inline bool
5753 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5755 HOST_WIDE_INT multiple;
5756 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5757 && IN_RANGE (multiple, 0, 63));
5760 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5761 of MODE. */
5763 bool
5764 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5766 HOST_WIDE_INT multiple;
5767 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5768 && IN_RANGE (multiple, -64, 63));
5771 /* Return true if OFFSET is a signed 9-bit value. */
5773 bool
5774 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5775 poly_int64 offset)
5777 HOST_WIDE_INT const_offset;
5778 return (offset.is_constant (&const_offset)
5779 && IN_RANGE (const_offset, -256, 255));
5782 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5783 of MODE. */
5785 static inline bool
5786 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5788 HOST_WIDE_INT multiple;
5789 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5790 && IN_RANGE (multiple, -256, 255));
5793 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5794 of MODE. */
5796 static inline bool
5797 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5799 HOST_WIDE_INT multiple;
5800 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5801 && IN_RANGE (multiple, 0, 4095));
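/* Worked example for the predicates above (illustrative): for DImode,
   GET_MODE_SIZE is 8 bytes, so offset_12bit_unsigned_scaled_p accepts
   byte offsets 0, 8, ..., 32760 (4095 * 8),
   aarch64_offset_7bit_signed_scaled_p accepts -512, ..., 504, and
   aarch64_offset_9bit_signed_unscaled_p accepts any constant offset in
   [-256, 255] regardless of alignment.  */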
5804 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5806 static sbitmap
5807 aarch64_get_separate_components (void)
5809 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5810 bitmap_clear (components);
5812 /* The registers we need saved to the frame. */
5813 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5814 if (aarch64_register_saved_on_entry (regno))
5816 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5817 if (!frame_pointer_needed)
5818 offset += cfun->machine->frame.frame_size
5819 - cfun->machine->frame.hard_fp_offset;
5820 /* Check that we can access the stack slot of the register with one
5821 direct load with no adjustments needed. */
5822 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5823 bitmap_set_bit (components, regno);
5826 /* Don't mess with the hard frame pointer. */
5827 if (frame_pointer_needed)
5828 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5830 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5831 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5832 /* If registers have been chosen to be stored/restored with
5833 writeback, don't interfere with them to avoid having to output explicit
5834 stack adjustment instructions. */
5835 if (reg2 != INVALID_REGNUM)
5836 bitmap_clear_bit (components, reg2);
5837 if (reg1 != INVALID_REGNUM)
5838 bitmap_clear_bit (components, reg1);
5840 bitmap_clear_bit (components, LR_REGNUM);
5841 bitmap_clear_bit (components, SP_REGNUM);
5843 return components;
5846 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5848 static sbitmap
5849 aarch64_components_for_bb (basic_block bb)
5851 bitmap in = DF_LIVE_IN (bb);
5852 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5853 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5854 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5856 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5857 bitmap_clear (components);
5859 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5860 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5861 if ((!call_used_regs[regno]
5862 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5863 && (bitmap_bit_p (in, regno)
5864 || bitmap_bit_p (gen, regno)
5865 || bitmap_bit_p (kill, regno)))
5867 unsigned regno2, offset, offset2;
5868 bitmap_set_bit (components, regno);
5870 /* If there is a callee-save at an adjacent offset, add it too
5871 to increase the use of LDP/STP. */
5872 offset = cfun->machine->frame.reg_offset[regno];
5873 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5875 if (regno2 <= LAST_SAVED_REGNUM)
5877 offset2 = cfun->machine->frame.reg_offset[regno2];
5878 if ((offset & ~8) == (offset2 & ~8))
5879 bitmap_set_bit (components, regno2);
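/* For instance (illustrative): a callee-save slotted at offset 16 has
   (offset & 8) == 0, so its candidate partner is regno + 1; if that
   register's slot is at offset 24, the two offsets agree once bit 3 is
   masked off and both registers are marked, allowing a later STP/LDP.  */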
5883 return components;
5886 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5887 Nothing to do for aarch64. */
5889 static void
5890 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5894 /* Return the next set bit in BMP from START onwards. Return the total number
5895 of bits in BMP if no set bit is found at or after START. */
5897 static unsigned int
5898 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5900 unsigned int nbits = SBITMAP_SIZE (bmp);
5901 if (start == nbits)
5902 return start;
5904 gcc_assert (start < nbits);
5905 for (unsigned int i = start; i < nbits; i++)
5906 if (bitmap_bit_p (bmp, i))
5907 return i;
5909 return nbits;
5912 /* Do the work for aarch64_emit_prologue_components and
5913 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5914 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5915 for these components or the epilogue sequence. That is, it determines
5916 whether we should emit stores or loads and what kind of CFA notes to attach
5917 to the insns. Otherwise the logic for the two sequences is very
5918 similar. */
5920 static void
5921 aarch64_process_components (sbitmap components, bool prologue_p)
5923 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5924 ? HARD_FRAME_POINTER_REGNUM
5925 : STACK_POINTER_REGNUM);
5927 unsigned last_regno = SBITMAP_SIZE (components);
5928 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5929 rtx_insn *insn = NULL;
5931 while (regno != last_regno)
5933 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5934 so DFmode for the vector registers is enough. For simd functions
5935 we want to save the low 128 bits. */
5936 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5938 rtx reg = gen_rtx_REG (mode, regno);
5939 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5940 if (!frame_pointer_needed)
5941 offset += cfun->machine->frame.frame_size
5942 - cfun->machine->frame.hard_fp_offset;
5943 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5944 rtx mem = gen_frame_mem (mode, addr);
5946 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5947 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5948 /* No more registers to handle after REGNO.
5949 Emit a single save/restore and exit. */
5950 if (regno2 == last_regno)
5952 insn = emit_insn (set);
5953 RTX_FRAME_RELATED_P (insn) = 1;
5954 if (prologue_p)
5955 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5956 else
5957 add_reg_note (insn, REG_CFA_RESTORE, reg);
5958 break;
5961 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5962 /* The next register is not of the same class or its offset is not
5963 mergeable with the current one into a pair. */
5964 if (!satisfies_constraint_Ump (mem)
5965 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5966 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5967 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5968 GET_MODE_SIZE (mode)))
5970 insn = emit_insn (set);
5971 RTX_FRAME_RELATED_P (insn) = 1;
5972 if (prologue_p)
5973 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5974 else
5975 add_reg_note (insn, REG_CFA_RESTORE, reg);
5977 regno = regno2;
5978 continue;
5981 /* REGNO2 can be saved/restored in a pair with REGNO. */
5982 rtx reg2 = gen_rtx_REG (mode, regno2);
5983 if (!frame_pointer_needed)
5984 offset2 += cfun->machine->frame.frame_size
5985 - cfun->machine->frame.hard_fp_offset;
5986 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5987 rtx mem2 = gen_frame_mem (mode, addr2);
5988 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5989 : gen_rtx_SET (reg2, mem2);
5991 if (prologue_p)
5992 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5993 else
5994 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5996 RTX_FRAME_RELATED_P (insn) = 1;
5997 if (prologue_p)
5999 add_reg_note (insn, REG_CFA_OFFSET, set);
6000 add_reg_note (insn, REG_CFA_OFFSET, set2);
6002 else
6004 add_reg_note (insn, REG_CFA_RESTORE, reg);
6005 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6008 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6012 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6014 static void
6015 aarch64_emit_prologue_components (sbitmap components)
6017 aarch64_process_components (components, true);
6020 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6022 static void
6023 aarch64_emit_epilogue_components (sbitmap components)
6025 aarch64_process_components (components, false);
6028 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6030 static void
6031 aarch64_set_handled_components (sbitmap components)
6033 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6034 if (bitmap_bit_p (components, regno))
6035 cfun->machine->reg_is_wrapped_separately[regno] = true;
6038 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
6039 determine the probe offset for alloca. */
6041 static HOST_WIDE_INT
6042 aarch64_stack_clash_protection_alloca_probe_range (void)
6044 return STACK_CLASH_CALLER_GUARD;
6048 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6049 registers. If POLY_SIZE is not large enough to require a probe this function
6050 will only adjust the stack. When allocating the stack space
6051 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6052 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6053 arguments. If we are then we ensure that any allocation larger than the ABI
6054 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6055 maintained.
6057 We emit barriers after each stack adjustment to prevent optimizations from
6058 breaking the invariant that we never drop the stack more than a page. This
6059 invariant is needed to make it easier to correctly handle asynchronous
6060 events, e.g. if we were to allow the stack to be dropped by more than a page
6061 and then emit multiple probes to catch up, and a signal were taken somewhere
6062 in between, the signal handler would not know the state of the stack and
6063 could make no assumptions about which pages have been probed. */
6065 static void
6066 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6067 poly_int64 poly_size,
6068 bool frame_related_p,
6069 bool final_adjustment_p)
6071 HOST_WIDE_INT guard_size
6072 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6073 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6074 /* When doing the final adjustment for the outgoing argument size we can't
6075 assume that LR was saved at position 0. So subtract its offset from the
6076 ABI safe buffer so that we don't accidentally allow an adjustment that
6077 would result in an allocation larger than the ABI buffer without
6078 probing. */
6079 HOST_WIDE_INT min_probe_threshold
6080 = final_adjustment_p
6081 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6082 : guard_size - guard_used_by_caller;
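/* Hypothetical numbers for illustration: with the default 64KB guard and
   a 1KB STACK_CLASH_CALLER_GUARD, a non-final adjustment may allocate up
   to 64K - 1K = 63K bytes before a probe is required, whereas a final
   (outgoing argument) adjustment is limited to 1K minus the saved offset
   of LR.  */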
6084 poly_int64 frame_size = cfun->machine->frame.frame_size;
6086 /* We should always have a positive probe threshold. */
6087 gcc_assert (min_probe_threshold > 0);
6089 if (flag_stack_clash_protection && !final_adjustment_p)
6091 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6092 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6094 if (known_eq (frame_size, 0))
6096 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6098 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6099 && known_lt (final_adjust, guard_used_by_caller))
6101 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6105 /* If SIZE is not large enough to require probing, just adjust the stack and
6106 exit. */
6107 if (known_lt (poly_size, min_probe_threshold)
6108 || !flag_stack_clash_protection)
6110 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6111 return;
6114 HOST_WIDE_INT size;
6115 /* Handle the SVE non-constant case first. */
6116 if (!poly_size.is_constant (&size))
6118 if (dump_file)
6120 fprintf (dump_file, "Stack clash SVE prologue: ");
6121 print_dec (poly_size, dump_file);
6122 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6125 /* First calculate the amount of bytes we're actually spilling. */
6126 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6127 poly_size, temp1, temp2, false, true);
6129 rtx_insn *insn = get_last_insn ();
6131 if (frame_related_p)
6133 /* This is done to provide unwinding information for the stack
6134 adjustments we're about to do.  However, to prevent the optimizers
6135 from removing the R11 move and leaving the CFA note (which would be
6136 very wrong) we tie the old and new stack pointer together.
6137 The tie will expand to nothing but the optimizers will not touch
6138 the instruction. */
6139 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6140 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6141 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6143 /* We want the CFA independent of the stack pointer for the
6144 duration of the loop. */
6145 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6146 RTX_FRAME_RELATED_P (insn) = 1;
6149 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6150 rtx guard_const = gen_int_mode (guard_size, Pmode);
6152 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6153 stack_pointer_rtx, temp1,
6154 probe_const, guard_const));
6156 /* Now reset the CFA register if needed. */
6157 if (frame_related_p)
6159 add_reg_note (insn, REG_CFA_DEF_CFA,
6160 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6161 gen_int_mode (poly_size, Pmode)));
6162 RTX_FRAME_RELATED_P (insn) = 1;
6165 return;
6168 if (dump_file)
6169 fprintf (dump_file,
6170 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6171 " bytes, probing will be required.\n", size);
6173 /* Round size down to the nearest multiple of guard_size, and calculate the
6174 residual as the difference between the original size and the rounded
6175 size. */
6176 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6177 HOST_WIDE_INT residual = size - rounded_size;
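/* Example of this arithmetic (illustrative values only): with a 64KB
   guard and size == 145KB, rounded_size is ROUND_DOWN (145K, 64K) == 128K,
   i.e. two full guard-sized allocations, and residual == 17K, which is
   handled separately below.  */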
6179 /* We can handle a small number of allocations/probes inline. Otherwise
6180 punt to a loop. */
6181 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6183 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6185 aarch64_sub_sp (NULL, temp2, guard_size, true);
6186 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6187 guard_used_by_caller));
6188 emit_insn (gen_blockage ());
6190 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6192 else
6194 /* Compute the ending address. */
6195 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6196 temp1, NULL, false, true);
6197 rtx_insn *insn = get_last_insn ();
6199 /* For the initial allocation, we don't have a frame pointer
6200 set up, so we always need CFI notes. If we're doing the
6201 final allocation, then we may have a frame pointer, in which
6202 case it is the CFA, otherwise we need CFI notes.
6204 We can determine which allocation we are doing by looking at
6205 the value of FRAME_RELATED_P since the final allocations are not
6206 frame related. */
6207 if (frame_related_p)
6209 /* We want the CFA independent of the stack pointer for the
6210 duration of the loop. */
6211 add_reg_note (insn, REG_CFA_DEF_CFA,
6212 plus_constant (Pmode, temp1, rounded_size));
6213 RTX_FRAME_RELATED_P (insn) = 1;
6216 /* This allocates and probes the stack. Note that this re-uses some of
6217 the existing Ada stack protection code.  However, we are guaranteed not
6218 to enter the non-loop or residual branches of that code.
6220 The non-loop part won't be entered because if our allocation amount
6221 doesn't require a loop, the case above would handle it.
6223 The residual amount won't be entered because TEMP1 is a multiple of
6224 the allocation size. The residual will always be 0. As such, the only
6225 part we are actually using from that code is the loop setup. The
6226 actual probing is done in aarch64_output_probe_stack_range. */
6227 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6228 stack_pointer_rtx, temp1));
6230 /* Now reset the CFA register if needed. */
6231 if (frame_related_p)
6233 add_reg_note (insn, REG_CFA_DEF_CFA,
6234 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6235 RTX_FRAME_RELATED_P (insn) = 1;
6238 emit_insn (gen_blockage ());
6239 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6242 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6243 be probed. This maintains the requirement that each page is probed at
6244 least once. For initial probing we probe only if the allocation is
6245 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6246 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6247 GUARD_SIZE.  This ensures that for any allocation that is large enough to
6248 trigger a probe here, we'll have at least one, and if an allocation is not
6249 large enough for this code to emit anything for it, the page will already
6250 have been probed by the saving of FP/LR, either by this function or any callees.  If
6251 we don't have any callees then we won't have more stack adjustments and so
6252 are still safe. */
6253 if (residual)
6255 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6256 /* If we're doing final adjustments, and we've done any full page
6257 allocations then any residual needs to be probed. */
6258 if (final_adjustment_p && rounded_size != 0)
6259 min_probe_threshold = 0;
6260 /* If doing a small final adjustment, we always probe at offset 0.
6261 This is done to avoid issues when LR is not at position 0 or when
6262 the final adjustment is smaller than the probing offset. */
6263 else if (final_adjustment_p && rounded_size == 0)
6264 residual_probe_offset = 0;
6266 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6267 if (residual >= min_probe_threshold)
6269 if (dump_file)
6270 fprintf (dump_file,
6271 "Stack clash AArch64 prologue residuals: "
6272 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6273 "\n", residual);
6275 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6276 residual_probe_offset));
6277 emit_insn (gen_blockage ());
6282 /* Return 1 if the register is used by the epilogue. We need to say the
6283 return register is used, but only after epilogue generation is complete.
6284 Note that in the case of sibcalls, the values "used by the epilogue" are
6285 considered live at the start of the called function.
6287 For SIMD functions we need to return 1 for FP registers that are saved and
6288 restored by a function but are not zero in call_used_regs. If we do not do
6289 this, optimizations may remove the restore of the register. */
6292 aarch64_epilogue_uses (int regno)
6294 if (epilogue_completed)
6296 if (regno == LR_REGNUM)
6297 return 1;
6298 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6299 return 1;
6301 return 0;
6304 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6305 is saved at BASE + OFFSET. */
6307 static void
6308 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6309 rtx base, poly_int64 offset)
6311 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6312 add_reg_note (insn, REG_CFA_EXPRESSION,
6313 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6316 /* AArch64 stack frames generated by this compiler look like:
6318 +-------------------------------+
6320 | incoming stack arguments |
6322 +-------------------------------+
6323 | | <-- incoming stack pointer (aligned)
6324 | callee-allocated save area |
6325 | for register varargs |
6327 +-------------------------------+
6328 | local variables | <-- frame_pointer_rtx
6330 +-------------------------------+
6331 | padding | \
6332 +-------------------------------+ |
6333 | callee-saved registers | | frame.saved_regs_size
6334 +-------------------------------+ |
6335 | LR' | |
6336 +-------------------------------+ |
6337 | FP' | / <- hard_frame_pointer_rtx (aligned)
6338 +-------------------------------+
6339 | dynamic allocation |
6340 +-------------------------------+
6341 | padding |
6342 +-------------------------------+
6343 | outgoing stack arguments | <-- arg_pointer
6345 +-------------------------------+
6346 | | <-- stack_pointer_rtx (aligned)
6348 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6349 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6350 unchanged.
6352 By default for stack-clash we assume the guard is at least 64KB, but this
6353 value is configurable to either 4KB or 64KB. We also force the guard size to
6354 be the same as the probing interval and both values are kept in sync.
6356 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6357 on the guard size) of stack space without probing.
6359 When probing is needed, we emit a probe at the start of the prologue
6360 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6362 We have to track how much space has been allocated and the only stores
6363 to the stack we track as implicit probes are the FP/LR stores.
6365 For outgoing arguments we probe if the size is larger than 1KB, such that
6366 the ABI specified buffer is maintained for the next callee.
6368 The following registers are reserved during frame layout and should not be
6369 used for any other purpose:
6371 - r11: Used by stack clash protection when SVE is enabled.
6372 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6373 - r14 and r15: Used for speculation tracking.
6374 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6375 - r30(LR), r29(FP): Used by standard frame layout.
6377 These registers must be avoided in frame layout related code unless the
6378 explicit intention is to interact with one of the features listed above. */
6380 /* Generate the prologue instructions for entry into a function.
6381 Establish the stack frame by decreasing the stack pointer with a
6382 properly calculated size and, if necessary, create a frame record
6383 filled with the values of LR and previous frame pointer. The
6384 current FP is also set up if it is in use. */
6386 void
6387 aarch64_expand_prologue (void)
6389 poly_int64 frame_size = cfun->machine->frame.frame_size;
6390 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6391 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6392 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6393 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6394 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6395 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6396 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6397 rtx_insn *insn;
6399 /* Sign return address for functions. */
6400 if (aarch64_return_address_signing_enabled ())
6402 switch (aarch64_ra_sign_key)
6404 case AARCH64_KEY_A:
6405 insn = emit_insn (gen_paciasp ());
6406 break;
6407 case AARCH64_KEY_B:
6408 insn = emit_insn (gen_pacibsp ());
6409 break;
6410 default:
6411 gcc_unreachable ();
6413 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6414 RTX_FRAME_RELATED_P (insn) = 1;
6417 if (flag_stack_usage_info)
6418 current_function_static_stack_size = constant_lower_bound (frame_size);
6420 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6422 if (crtl->is_leaf && !cfun->calls_alloca)
6424 if (maybe_gt (frame_size, PROBE_INTERVAL)
6425 && maybe_gt (frame_size, get_stack_check_protect ()))
6426 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6427 (frame_size
6428 - get_stack_check_protect ()));
6430 else if (maybe_gt (frame_size, 0))
6431 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6434 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6435 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6437 /* In theory we should never have both an initial adjustment
6438 and a callee save adjustment. Verify that is the case since the
6439 code below does not handle it for -fstack-clash-protection. */
6440 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6442 /* Will only probe if the initial adjustment is larger than the guard
6443 less the amount of the guard reserved for use by the caller's
6444 outgoing args. */
6445 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6446 true, false);
6448 if (callee_adjust != 0)
6449 aarch64_push_regs (reg1, reg2, callee_adjust);
6451 if (emit_frame_chain)
6453 poly_int64 reg_offset = callee_adjust;
6454 if (callee_adjust == 0)
6456 reg1 = R29_REGNUM;
6457 reg2 = R30_REGNUM;
6458 reg_offset = callee_offset;
6459 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6461 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6462 stack_pointer_rtx, callee_offset,
6463 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6464 if (frame_pointer_needed && !frame_size.is_constant ())
6466 /* Variable-sized frames need to describe the save slot
6467 address using DW_CFA_expression rather than DW_CFA_offset.
6468 This means that, without taking further action, the
6469 locations of the registers that we've already saved would
6470 remain based on the stack pointer even after we redefine
6471 the CFA based on the frame pointer. We therefore need new
6472 DW_CFA_expressions to re-express the save slots with addresses
6473 based on the frame pointer. */
6474 rtx_insn *insn = get_last_insn ();
6475 gcc_assert (RTX_FRAME_RELATED_P (insn));
6477 /* Add an explicit CFA definition if this was previously
6478 implicit. */
6479 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6481 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6482 callee_offset);
6483 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6484 gen_rtx_SET (hard_frame_pointer_rtx, src));
6487 /* Change the save slot expressions for the registers that
6488 we've already saved. */
6489 reg_offset -= callee_offset;
6490 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6491 reg_offset + UNITS_PER_WORD);
6492 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6493 reg_offset);
6495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6498 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6499 callee_adjust != 0 || emit_frame_chain);
6500 if (aarch64_simd_decl_p (cfun->decl))
6501 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6502 callee_adjust != 0 || emit_frame_chain);
6503 else
6504 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6505 callee_adjust != 0 || emit_frame_chain);
6507 /* We may need to probe the final adjustment if it is larger than the guard
6508 that is assumed by the callee. */
6509 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6510 !frame_pointer_needed, true);
6513 /* Return TRUE if we can use a simple_return insn.
6515 This function checks whether the callee saved stack is empty, which
6516 means no restore actions are needed.  The pro_and_epilogue will use
6517 this to check whether shrink-wrapping opt is feasible. */
6519 bool
6520 aarch64_use_return_insn_p (void)
6522 if (!reload_completed)
6523 return false;
6525 if (crtl->profile)
6526 return false;
6528 return known_eq (cfun->machine->frame.frame_size, 0);
6531 /* Return false for non-leaf SIMD functions in order to avoid
6532 shrink-wrapping them. Doing this will lose the necessary
6533 save/restore of FP registers. */
6535 bool
6536 aarch64_use_simple_return_insn_p (void)
6538 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6539 return false;
6541 return true;
6544 /* Generate the epilogue instructions for returning from a function.
6545 This is almost exactly the reverse of the prologue sequence, except
6546 that we need to insert barriers to avoid scheduling loads that read
6547 from a deallocated stack, and we optimize the unwind records by
6548 emitting them all together if possible. */
6549 void
6550 aarch64_expand_epilogue (bool for_sibcall)
6552 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6553 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6554 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6555 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6556 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6557 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6558 rtx cfi_ops = NULL;
6559 rtx_insn *insn;
6560 /* A stack clash protection prologue may not have left EP0_REGNUM or
6561 EP1_REGNUM in a usable state. The same is true for allocations
6562 with an SVE component, since we then need both temporary registers
6563 for each allocation. For stack clash we are in a usable state if
6564 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6565 HOST_WIDE_INT guard_size
6566 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6567 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6569 /* We can re-use the registers when the allocation amount is smaller than
6570 guard_size - guard_used_by_caller because we won't be doing any probes
6571 then. In such situations the register should remain live with the correct
6572 value. */
6573 bool can_inherit_p = (initial_adjust.is_constant ()
6574 && final_adjust.is_constant ())
6575 && (!flag_stack_clash_protection
6576 || known_lt (initial_adjust,
6577 guard_size - guard_used_by_caller));
6579 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6580 bool need_barrier_p
6581 = maybe_ne (get_frame_size ()
6582 + cfun->machine->frame.saved_varargs_size, 0);
6584 /* Emit a barrier to prevent loads from a deallocated stack. */
6585 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6586 || cfun->calls_alloca
6587 || crtl->calls_eh_return)
6589 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6590 need_barrier_p = false;
6593 /* Restore the stack pointer from the frame pointer if it may not
6594 be the same as the stack pointer. */
6595 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6596 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6597 if (frame_pointer_needed
6598 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6599 /* If writeback is used when restoring callee-saves, the CFA
6600 is restored on the instruction doing the writeback. */
6601 aarch64_add_offset (Pmode, stack_pointer_rtx,
6602 hard_frame_pointer_rtx, -callee_offset,
6603 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6604 else
6605 /* The case where we need to re-use the register here is very rare, so
6606 avoid the complicated condition and just always emit a move if the
6607 immediate doesn't fit. */
6608 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6610 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6611 callee_adjust != 0, &cfi_ops);
6612 if (aarch64_simd_decl_p (cfun->decl))
6613 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6614 callee_adjust != 0, &cfi_ops);
6615 else
6616 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6617 callee_adjust != 0, &cfi_ops);
6619 if (need_barrier_p)
6620 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6622 if (callee_adjust != 0)
6623 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6625 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6627 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6628 insn = get_last_insn ();
6629 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6630 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6631 RTX_FRAME_RELATED_P (insn) = 1;
6632 cfi_ops = NULL;
6635 /* The liveness of EP0_REGNUM cannot be trusted across function calls either, so
6636 restrict the emit_move optimization to leaf functions. */
6637 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6638 (!can_inherit_p || !crtl->is_leaf
6639 || df_regs_ever_live_p (EP0_REGNUM)));
6641 if (cfi_ops)
6643 /* Emit delayed restores and reset the CFA to be SP. */
6644 insn = get_last_insn ();
6645 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6646 REG_NOTES (insn) = cfi_ops;
6647 RTX_FRAME_RELATED_P (insn) = 1;
6650 /* We prefer to emit the combined return/authenticate instruction RETAA,
6651 however there are three cases in which we must instead emit an explicit
6652 authentication instruction.
6654 1) Sibcalls don't return in a normal way, so if we're about to call one
6655 we must authenticate.
6657 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6658 generating code for !TARGET_ARMV8_3 we can't use it and must
6659 explicitly authenticate.
6661 3) On an eh_return path we make extra stack adjustments to update the
6662 canonical frame address to be the exception handler's CFA. We want
6663 to authenticate using the CFA of the function which calls eh_return.  */
6665 if (aarch64_return_address_signing_enabled ()
6666 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6668 switch (aarch64_ra_sign_key)
6670 case AARCH64_KEY_A:
6671 insn = emit_insn (gen_autiasp ());
6672 break;
6673 case AARCH64_KEY_B:
6674 insn = emit_insn (gen_autibsp ());
6675 break;
6676 default:
6677 gcc_unreachable ();
6679 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6680 RTX_FRAME_RELATED_P (insn) = 1;
6683 /* Stack adjustment for exception handler. */
6684 if (crtl->calls_eh_return && !for_sibcall)
6686 /* We need to unwind the stack by the offset computed by
6687 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6688 to be SP; letting the CFA move during this adjustment
6689 is just as correct as retaining the CFA from the body
6690 of the function. Therefore, do nothing special. */
6691 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6694 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6695 if (!for_sibcall)
6696 emit_jump_insn (ret_rtx);
6699 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6700 normally or return to a previous frame after unwinding.
6702 An EH return uses a single shared return sequence. The epilogue is
6703 exactly like a normal epilogue except that it has an extra input
6704 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6705 that must be applied after the frame has been destroyed. An extra label
6706 is inserted before the epilogue which initializes this register to zero,
6707 and this is the entry point for a normal return.
6709 An actual EH return updates the return address, initializes the stack
6710 adjustment and jumps directly into the epilogue (bypassing the zeroing
6711 of the adjustment). Since the return address is typically saved on the
6712 stack when a function makes a call, the saved LR must be updated outside
6713 the epilogue.
6715 This poses problems as the store is generated well before the epilogue,
6716 so the offset of LR is not known yet. Also optimizations will remove the
6717 store as it appears dead, even after the epilogue is generated (as the
6718 base or offset for loading LR is different in many cases).
6720 To avoid these problems this implementation forces the frame pointer
6721 in eh_return functions so that the location of LR is fixed and known early.
6722 It also marks the store volatile, so no optimization is permitted to
6723 remove the store. */
6725 aarch64_eh_return_handler_rtx (void)
6727 rtx tmp = gen_frame_mem (Pmode,
6728 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6730 /* Mark the store volatile, so no optimization is permitted to remove it. */
6731 MEM_VOLATILE_P (tmp) = true;
6732 return tmp;
6735 /* Output code to add DELTA to the first argument, and then jump
6736 to FUNCTION. Used for C++ multiple inheritance. */
6737 static void
6738 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6739 HOST_WIDE_INT delta,
6740 HOST_WIDE_INT vcall_offset,
6741 tree function)
6743 /* The this pointer is always in x0. Note that this differs from
6744 Arm where the this pointer may be bumped to r1 if r0 is required
6745 to return a pointer to an aggregate. On AArch64 a result value
6746 pointer will be in x8. */
6747 int this_regno = R0_REGNUM;
6748 rtx this_rtx, temp0, temp1, addr, funexp;
6749 rtx_insn *insn;
6750 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6752 if (aarch64_bti_enabled ())
6753 emit_insn (gen_bti_c());
6755 reload_completed = 1;
6756 emit_note (NOTE_INSN_PROLOGUE_END);
6758 this_rtx = gen_rtx_REG (Pmode, this_regno);
6759 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6760 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6762 if (vcall_offset == 0)
6763 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6764 else
6766 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6768 addr = this_rtx;
6769 if (delta != 0)
6771 if (delta >= -256 && delta < 256)
6772 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6773 plus_constant (Pmode, this_rtx, delta));
6774 else
6775 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6776 temp1, temp0, false);
6779 if (Pmode == ptr_mode)
6780 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6781 else
6782 aarch64_emit_move (temp0,
6783 gen_rtx_ZERO_EXTEND (Pmode,
6784 gen_rtx_MEM (ptr_mode, addr)));
6786 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6787 addr = plus_constant (Pmode, temp0, vcall_offset);
6788 else
6790 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6791 Pmode);
6792 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6795 if (Pmode == ptr_mode)
6796 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6797 else
6798 aarch64_emit_move (temp1,
6799 gen_rtx_SIGN_EXTEND (Pmode,
6800 gen_rtx_MEM (ptr_mode, addr)));
6802 emit_insn (gen_add2_insn (this_rtx, temp1));
6805 /* Generate a tail call to the target function. */
6806 if (!TREE_USED (function))
6808 assemble_external (function);
6809 TREE_USED (function) = 1;
6811 funexp = XEXP (DECL_RTL (function), 0);
6812 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6813 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6814 SIBLING_CALL_P (insn) = 1;
6816 insn = get_insns ();
6817 shorten_branches (insn);
6819 assemble_start_function (thunk, fnname);
6820 final_start_function (insn, file, 1);
6821 final (insn, file, 1);
6822 final_end_function ();
6823 assemble_end_function (thunk, fnname);
6825 /* Stop pretending to be a post-reload pass. */
6826 reload_completed = 0;
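/* Rough illustration of the output (assumed, not taken from the source):
   for a simple thunk with delta == 8 and vcall_offset == 0 the code above
   produces something equivalent to
       add x0, x0, #8
       b   <function>
   i.e. the this pointer in x0 is adjusted and control tail-calls the
   target function.  */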
6829 static bool
6830 aarch64_tls_referenced_p (rtx x)
6832 if (!TARGET_HAVE_TLS)
6833 return false;
6834 subrtx_iterator::array_type array;
6835 FOR_EACH_SUBRTX (iter, array, x, ALL)
6837 const_rtx x = *iter;
6838 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6839 return true;
6840 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6841 TLS offsets, not real symbol references. */
6842 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6843 iter.skip_subrtxes ();
6845 return false;
6849 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6850 a left shift of 0 or 12 bits. */
6851 bool
6852 aarch64_uimm12_shift (HOST_WIDE_INT val)
6854 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6855 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6859 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6860 that can be created with a left shift of 0 or 12. */
6861 static HOST_WIDE_INT
6862 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6864 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6865 handle correctly. */
6866 gcc_assert ((val & 0xffffff) == val);
6868 if (((val & 0xfff) << 0) == val)
6869 return val;
6871 return val & (0xfff << 12);
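/* For example (illustrative): 0x123456 does not fit in the low 12 bits,
   so this returns 0x123456 & 0xfff000 == 0x123000, which is representable
   as a 12-bit unsigned immediate shifted left by 12.  */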
6874 /* Return true if val is an immediate that can be loaded into a
6875 register by a MOVZ instruction. */
6876 static bool
6877 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6879 if (GET_MODE_SIZE (mode) > 4)
6881 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6882 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6883 return 1;
6885 else
6887 /* Ignore sign extension. */
6888 val &= (HOST_WIDE_INT) 0xffffffff;
6890 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6891 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
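/* Illustrative examples: 0x12340000 is accepted, since it can be
   materialised with a single "movz w0, #0x1234, lsl #16", whereas 0x12345
   spans two 16-bit halves and is rejected (it would need a MOVZ/MOVK
   pair).  */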
6894 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6895 64-bit (DImode) integer. */
6897 static unsigned HOST_WIDE_INT
6898 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6900 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6901 while (size < 64)
6903 val &= (HOST_WIDE_INT_1U << size) - 1;
6904 val |= val << size;
6905 size *= 2;
6907 return val;
6910 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6912 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6914 0x0000000100000001ull,
6915 0x0001000100010001ull,
6916 0x0101010101010101ull,
6917 0x1111111111111111ull,
6918 0x5555555555555555ull,
6922 /* Return true if val is a valid bitmask immediate. */
6924 bool
6925 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6927 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6928 int bits;
6930 /* Check for a single sequence of one bits and return quickly if so.
6931 The special cases of all ones and all zeroes return false. */
6932 val = aarch64_replicate_bitmask_imm (val_in, mode);
6933 tmp = val + (val & -val);
6935 if (tmp == (tmp & -tmp))
6936 return (val + 1) > 1;
6938 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6939 if (mode == SImode)
6940 val = (val << 32) | (val & 0xffffffff);
6942 /* Invert if the immediate doesn't start with a zero bit - this means we
6943 only need to search for sequences of one bits. */
6944 if (val & 1)
6945 val = ~val;
6947 /* Find the first set bit and set tmp to val with the first sequence of one
6948 bits removed. Return success if there is a single sequence of ones. */
6949 first_one = val & -val;
6950 tmp = val & (val + first_one);
6952 if (tmp == 0)
6953 return true;
6955 /* Find the next set bit and compute the difference in bit position. */
6956 next_one = tmp & -tmp;
6957 bits = clz_hwi (first_one) - clz_hwi (next_one);
6958 mask = val ^ tmp;
6960 /* Check the bit position difference is a power of 2, and that the first
6961 sequence of one bits fits within 'bits' bits. */
6962 if ((mask >> bits) != 0 || bits != (bits & -bits))
6963 return false;
6965 /* Check the sequence of one bits is repeated 64/bits times. */
6966 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
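/* Illustrative examples (not exhaustive): 0x3f3f3f3f3f3f3f3f is accepted
   as a bitmask immediate (a run of six ones repeated in every 8-bit
   element); 0 and ~0 are rejected by the early single-run check; and a
   value such as 0x1234 is rejected because its set bits do not form a
   single repeated run.  */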
6969 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6970 Assumed precondition: VAL_IN is not zero. */
6972 unsigned HOST_WIDE_INT
6973 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6975 int lowest_bit_set = ctz_hwi (val_in);
6976 int highest_bit_set = floor_log2 (val_in);
6977 gcc_assert (val_in != 0);
6979 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6980 (HOST_WIDE_INT_1U << lowest_bit_set));
6983 /* Create a constant in which all bits outside the range from the lowest set
6984 bit to the highest set bit of VAL_IN are set to 1. */
6986 unsigned HOST_WIDE_INT
6987 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6989 return val_in | ~aarch64_and_split_imm1 (val_in);
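/* Worked example (illustrative): for val_in == 0x960 (bits 5, 6, 8 and 11
   set), aarch64_and_split_imm1 returns 0xfe0, the contiguous mask covering
   bits 5 to 11, and aarch64_and_split_imm2 returns 0x960 | ~0xfe0, i.e.
   val_in with every bit outside that range set to one.  ANDing with these
   two masks in sequence is equivalent to ANDing with the original value,
   since imm1 & imm2 == val_in.  */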
6992 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6994 bool
6995 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6997 scalar_int_mode int_mode;
6998 if (!is_a <scalar_int_mode> (mode, &int_mode))
6999 return false;
7001 if (aarch64_bitmask_imm (val_in, int_mode))
7002 return false;
7004 if (aarch64_move_imm (val_in, int_mode))
7005 return false;
7007 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7009 return aarch64_bitmask_imm (imm2, int_mode);
7012 /* Return true if val is an immediate that can be loaded into a
7013 register in a single instruction. */
7014 bool
7015 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7017 scalar_int_mode int_mode;
7018 if (!is_a <scalar_int_mode> (mode, &int_mode))
7019 return false;
7021 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7022 return 1;
7023 return aarch64_bitmask_imm (val, int_mode);
7026 static bool
7027 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7029 rtx base, offset;
7031 if (GET_CODE (x) == HIGH)
7032 return true;
7034 /* There's no way to calculate VL-based values using relocations. */
7035 subrtx_iterator::array_type array;
7036 FOR_EACH_SUBRTX (iter, array, x, ALL)
7037 if (GET_CODE (*iter) == CONST_POLY_INT)
7038 return true;
7040 split_const (x, &base, &offset);
7041 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7043 if (aarch64_classify_symbol (base, INTVAL (offset))
7044 != SYMBOL_FORCE_TO_MEM)
7045 return true;
7046 else
7047 /* Avoid generating a 64-bit relocation in ILP32; leave it
7048 to aarch64_expand_mov_immediate to handle it properly. */
7049 return mode != ptr_mode;
7052 return aarch64_tls_referenced_p (x);
7055 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7056 The expansion for a table switch is quite expensive due to the number
7057 of instructions, the table lookup and the hard-to-predict indirect jump.
7058 When optimizing for speed, and -O3 enabled, use the per-core tuning if
7059 set, otherwise use tables for > 16 cases as a tradeoff between size and
7060 performance. When optimizing for size, use the default setting. */
7062 static unsigned int
7063 aarch64_case_values_threshold (void)
7065 /* Use the specified limit for the number of cases before using jump
7066 tables at higher optimization levels. */
7067 if (optimize > 2
7068 && selected_cpu->tune->max_case_values != 0)
7069 return selected_cpu->tune->max_case_values;
7070 else
7071 return optimize_size ? default_case_values_threshold () : 17;
7074 /* Return true if register REGNO is a valid index register.
7075 STRICT_P is true if REG_OK_STRICT is in effect. */
7077 bool
7078 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7080 if (!HARD_REGISTER_NUM_P (regno))
7082 if (!strict_p)
7083 return true;
7085 if (!reg_renumber)
7086 return false;
7088 regno = reg_renumber[regno];
7090 return GP_REGNUM_P (regno);
7093 /* Return true if register REGNO is a valid base register for mode MODE.
7094 STRICT_P is true if REG_OK_STRICT is in effect. */
7096 bool
7097 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7099 if (!HARD_REGISTER_NUM_P (regno))
7101 if (!strict_p)
7102 return true;
7104 if (!reg_renumber)
7105 return false;
7107 regno = reg_renumber[regno];
7110 /* The fake registers will be eliminated to either the stack or
7111 hard frame pointer, both of which are usually valid base registers.
7112 Reload deals with the cases where the eliminated form isn't valid. */
7113 return (GP_REGNUM_P (regno)
7114 || regno == SP_REGNUM
7115 || regno == FRAME_POINTER_REGNUM
7116 || regno == ARG_POINTER_REGNUM);
7119 /* Return true if X is a valid base register for mode MODE.
7120 STRICT_P is true if REG_OK_STRICT is in effect. */
7122 static bool
7123 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7125 if (!strict_p
7126 && GET_CODE (x) == SUBREG
7127 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7128 x = SUBREG_REG (x);
7130 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7133 /* Return true if address offset is a valid index. If it is, fill in INFO
7134 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7136 static bool
7137 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7138 machine_mode mode, bool strict_p)
7140 enum aarch64_address_type type;
7141 rtx index;
7142 int shift;
7144 /* (reg:P) */
7145 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7146 && GET_MODE (x) == Pmode)
7148 type = ADDRESS_REG_REG;
7149 index = x;
7150 shift = 0;
7152 /* (sign_extend:DI (reg:SI)) */
7153 else if ((GET_CODE (x) == SIGN_EXTEND
7154 || GET_CODE (x) == ZERO_EXTEND)
7155 && GET_MODE (x) == DImode
7156 && GET_MODE (XEXP (x, 0)) == SImode)
7158 type = (GET_CODE (x) == SIGN_EXTEND)
7159 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7160 index = XEXP (x, 0);
7161 shift = 0;
7163 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7164 else if (GET_CODE (x) == MULT
7165 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7166 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7167 && GET_MODE (XEXP (x, 0)) == DImode
7168 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7169 && CONST_INT_P (XEXP (x, 1)))
7171 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7172 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7173 index = XEXP (XEXP (x, 0), 0);
7174 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7176 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7177 else if (GET_CODE (x) == ASHIFT
7178 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7179 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7180 && GET_MODE (XEXP (x, 0)) == DImode
7181 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7182 && CONST_INT_P (XEXP (x, 1)))
7184 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7185 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7186 index = XEXP (XEXP (x, 0), 0);
7187 shift = INTVAL (XEXP (x, 1));
7189 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7190 else if ((GET_CODE (x) == SIGN_EXTRACT
7191 || GET_CODE (x) == ZERO_EXTRACT)
7192 && GET_MODE (x) == DImode
7193 && GET_CODE (XEXP (x, 0)) == MULT
7194 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7195 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7197 type = (GET_CODE (x) == SIGN_EXTRACT)
7198 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7199 index = XEXP (XEXP (x, 0), 0);
7200 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7201 if (INTVAL (XEXP (x, 1)) != 32 + shift
7202 || INTVAL (XEXP (x, 2)) != 0)
7203 shift = -1;
7205 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7206 (const_int 0xffffffff<<shift)) */
7207 else if (GET_CODE (x) == AND
7208 && GET_MODE (x) == DImode
7209 && GET_CODE (XEXP (x, 0)) == MULT
7210 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7211 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7212 && CONST_INT_P (XEXP (x, 1)))
7214 type = ADDRESS_REG_UXTW;
7215 index = XEXP (XEXP (x, 0), 0);
7216 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7217 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7218 shift = -1;
7220 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7221 else if ((GET_CODE (x) == SIGN_EXTRACT
7222 || GET_CODE (x) == ZERO_EXTRACT)
7223 && GET_MODE (x) == DImode
7224 && GET_CODE (XEXP (x, 0)) == ASHIFT
7225 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7226 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7228 type = (GET_CODE (x) == SIGN_EXTRACT)
7229 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7230 index = XEXP (XEXP (x, 0), 0);
7231 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7232 if (INTVAL (XEXP (x, 1)) != 32 + shift
7233 || INTVAL (XEXP (x, 2)) != 0)
7234 shift = -1;
7236 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7237 (const_int 0xffffffff<<shift)) */
7238 else if (GET_CODE (x) == AND
7239 && GET_MODE (x) == DImode
7240 && GET_CODE (XEXP (x, 0)) == ASHIFT
7241 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7242 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7243 && CONST_INT_P (XEXP (x, 1)))
7245 type = ADDRESS_REG_UXTW;
7246 index = XEXP (XEXP (x, 0), 0);
7247 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7248 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7249 shift = -1;
7251 /* (mult:P (reg:P) (const_int scale)) */
7252 else if (GET_CODE (x) == MULT
7253 && GET_MODE (x) == Pmode
7254 && GET_MODE (XEXP (x, 0)) == Pmode
7255 && CONST_INT_P (XEXP (x, 1)))
7257 type = ADDRESS_REG_REG;
7258 index = XEXP (x, 0);
7259 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7261 /* (ashift:P (reg:P) (const_int shift)) */
7262 else if (GET_CODE (x) == ASHIFT
7263 && GET_MODE (x) == Pmode
7264 && GET_MODE (XEXP (x, 0)) == Pmode
7265 && CONST_INT_P (XEXP (x, 1)))
7267 type = ADDRESS_REG_REG;
7268 index = XEXP (x, 0);
7269 shift = INTVAL (XEXP (x, 1));
7271 else
7272 return false;
7274 if (!strict_p
7275 && GET_CODE (index) == SUBREG
7276 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7277 index = SUBREG_REG (index);
7279 if (aarch64_sve_data_mode_p (mode))
7281 if (type != ADDRESS_REG_REG
7282 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7283 return false;
7285 else
7287 if (shift != 0
7288 && !(IN_RANGE (shift, 1, 3)
7289 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7290 return false;
7293 if (REG_P (index)
7294 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7296 info->type = type;
7297 info->offset = index;
7298 info->shift = shift;
7299 return true;
7302 return false;
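/* Illustrative examples of index operands accepted above (register names
   are arbitrary; assume a 4-byte SImode access with base register x0):
     (reg:DI x1)                               -> [x0, x1]
     (ashift:DI (reg:DI x1) (const_int 2))     -> [x0, x1, lsl 2]
     (mult:DI (sign_extend:DI (reg:SI w1))
              (const_int 4))                   -> [x0, w1, sxtw 2]  */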
7305 /* Return true if MODE is one of the modes for which we
7306 support LDP/STP operations. */
7308 static bool
7309 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7311 return mode == SImode || mode == DImode
7312 || mode == SFmode || mode == DFmode
7313 || (aarch64_vector_mode_supported_p (mode)
7314 && (known_eq (GET_MODE_SIZE (mode), 8)
7315 || (known_eq (GET_MODE_SIZE (mode), 16)
7316 && (aarch64_tune_params.extra_tuning_flags
7317 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7320 /* Return true if REGNO is a virtual pointer register, or an eliminable
7321 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7322 include stack_pointer or hard_frame_pointer. */
7323 static bool
7324 virt_or_elim_regno_p (unsigned regno)
7326 return ((regno >= FIRST_VIRTUAL_REGISTER
7327 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7328 || regno == FRAME_POINTER_REGNUM
7329 || regno == ARG_POINTER_REGNUM);
7332 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7333 If it is, fill in INFO appropriately. STRICT_P is true if
7334 REG_OK_STRICT is in effect. */
7336 bool
7337 aarch64_classify_address (struct aarch64_address_info *info,
7338 rtx x, machine_mode mode, bool strict_p,
7339 aarch64_addr_query_type type)
7341 enum rtx_code code = GET_CODE (x);
7342 rtx op0, op1;
7343 poly_int64 offset;
7345 HOST_WIDE_INT const_size;
7347 /* On BE, we use a load/store pair for all large int mode load/stores.
7348 TI/TFmode may also use a load/store pair. */
7349 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7350 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7351 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7352 || type == ADDR_QUERY_LDP_STP_N
7353 || mode == TImode
7354 || mode == TFmode
7355 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7357 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the full
7358 size of the memory being loaded/stored, and the mode used for the
7359 address calculation is half of that. */
7360 if (type == ADDR_QUERY_LDP_STP_N
7361 && known_eq (GET_MODE_SIZE (mode), 16))
7362 mode = DFmode;
7364 bool allow_reg_index_p = (!load_store_pair_p
7365 && (known_lt (GET_MODE_SIZE (mode), 16)
7366 || vec_flags == VEC_ADVSIMD
7367 || vec_flags & VEC_SVE_DATA));
7369 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7370 [Rn, #offset, MUL VL]. */
7371 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7372 && (code != REG && code != PLUS))
7373 return false;
7375 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7376 REG addressing. */
7377 if (advsimd_struct_p
7378 && !BYTES_BIG_ENDIAN
7379 && (code != POST_INC && code != REG))
7380 return false;
7382 gcc_checking_assert (GET_MODE (x) == VOIDmode
7383 || SCALAR_INT_MODE_P (GET_MODE (x)));
7385 switch (code)
7387 case REG:
7388 case SUBREG:
7389 info->type = ADDRESS_REG_IMM;
7390 info->base = x;
7391 info->offset = const0_rtx;
7392 info->const_offset = 0;
7393 return aarch64_base_register_rtx_p (x, strict_p);
7395 case PLUS:
7396 op0 = XEXP (x, 0);
7397 op1 = XEXP (x, 1);
7399 if (! strict_p
7400 && REG_P (op0)
7401 && virt_or_elim_regno_p (REGNO (op0))
7402 && poly_int_rtx_p (op1, &offset))
7404 info->type = ADDRESS_REG_IMM;
7405 info->base = op0;
7406 info->offset = op1;
7407 info->const_offset = offset;
7409 return true;
7412 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7413 && aarch64_base_register_rtx_p (op0, strict_p)
7414 && poly_int_rtx_p (op1, &offset))
7416 info->type = ADDRESS_REG_IMM;
7417 info->base = op0;
7418 info->offset = op1;
7419 info->const_offset = offset;
7421 /* TImode and TFmode values are allowed in both pairs of X
7422 registers and individual Q registers. The available
7423 address modes are:
7424 X,X: 7-bit signed scaled offset
7425 Q: 9-bit signed offset
7426 We conservatively require an offset representable in either mode.
7427 When performing the check for pairs of X registers i.e. LDP/STP
7428 pass down DImode since that is the natural size of the LDP/STP
7429 instruction memory accesses. */
7430 if (mode == TImode || mode == TFmode)
7431 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7432 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7433 || offset_12bit_unsigned_scaled_p (mode, offset)));
7435 /* A 7-bit offset check because OImode will emit an ldp/stp
7436 instruction (only big endian will get here).
7437 For ldp/stp instructions, the offset is scaled for the size of a
7438 single element of the pair. */
7439 if (mode == OImode)
7440 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7442 /* Three 9/12-bit offset checks because CImode will emit three
7443 ldr/str instructions (only big endian will get here). */
7444 if (mode == CImode)
7445 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7446 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7447 offset + 32)
7448 || offset_12bit_unsigned_scaled_p (V16QImode,
7449 offset + 32)));
7451 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7452 instructions (only big endian will get here). */
7453 if (mode == XImode)
7454 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7455 && aarch64_offset_7bit_signed_scaled_p (TImode,
7456 offset + 32));
7458 /* Make "m" use the LD1 offset range for SVE data modes, so
7459 that pre-RTL optimizers like ivopts will work to that range
7460 instead of the wider LDR/STR range. */
7461 if (vec_flags == VEC_SVE_DATA)
7462 return (type == ADDR_QUERY_M
7463 ? offset_4bit_signed_scaled_p (mode, offset)
7464 : offset_9bit_signed_scaled_p (mode, offset));
7466 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7468 poly_int64 end_offset = (offset
7469 + GET_MODE_SIZE (mode)
7470 - BYTES_PER_SVE_VECTOR);
7471 return (type == ADDR_QUERY_M
7472 ? offset_4bit_signed_scaled_p (mode, offset)
7473 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7474 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7475 end_offset)));
7478 if (vec_flags == VEC_SVE_PRED)
7479 return offset_9bit_signed_scaled_p (mode, offset);
7481 if (load_store_pair_p)
7482 return ((known_eq (GET_MODE_SIZE (mode), 4)
7483 || known_eq (GET_MODE_SIZE (mode), 8)
7484 || known_eq (GET_MODE_SIZE (mode), 16))
7485 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7486 else
7487 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7488 || offset_12bit_unsigned_scaled_p (mode, offset));
7491 if (allow_reg_index_p)
7493 /* Look for base + (scaled/extended) index register. */
7494 if (aarch64_base_register_rtx_p (op0, strict_p)
7495 && aarch64_classify_index (info, op1, mode, strict_p))
7497 info->base = op0;
7498 return true;
7500 if (aarch64_base_register_rtx_p (op1, strict_p)
7501 && aarch64_classify_index (info, op0, mode, strict_p))
7503 info->base = op1;
7504 return true;
7508 return false;
7510 case POST_INC:
7511 case POST_DEC:
7512 case PRE_INC:
7513 case PRE_DEC:
7514 info->type = ADDRESS_REG_WB;
7515 info->base = XEXP (x, 0);
7516 info->offset = NULL_RTX;
7517 return aarch64_base_register_rtx_p (info->base, strict_p);
7519 case POST_MODIFY:
7520 case PRE_MODIFY:
7521 info->type = ADDRESS_REG_WB;
7522 info->base = XEXP (x, 0);
7523 if (GET_CODE (XEXP (x, 1)) == PLUS
7524 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7525 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7526 && aarch64_base_register_rtx_p (info->base, strict_p))
7528 info->offset = XEXP (XEXP (x, 1), 1);
7529 info->const_offset = offset;
7531 /* TImode and TFmode values are allowed in both pairs of X
7532 registers and individual Q registers. The available
7533 address modes are:
7534 X,X: 7-bit signed scaled offset
7535 Q: 9-bit signed offset
7536 We conservatively require an offset representable in either mode. */
7538 if (mode == TImode || mode == TFmode)
7539 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7540 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7542 if (load_store_pair_p)
7543 return ((known_eq (GET_MODE_SIZE (mode), 4)
7544 || known_eq (GET_MODE_SIZE (mode), 8)
7545 || known_eq (GET_MODE_SIZE (mode), 16))
7546 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7547 else
7548 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7550 return false;
7552 case CONST:
7553 case SYMBOL_REF:
7554 case LABEL_REF:
7555 /* load literal: pc-relative constant pool entry. Only supported
7556 for SI mode or larger. */
7557 info->type = ADDRESS_SYMBOLIC;
7559 if (!load_store_pair_p
7560 && GET_MODE_SIZE (mode).is_constant (&const_size)
7561 && const_size >= 4)
7563 rtx sym, addend;
7565 split_const (x, &sym, &addend);
7566 return ((GET_CODE (sym) == LABEL_REF
7567 || (GET_CODE (sym) == SYMBOL_REF
7568 && CONSTANT_POOL_ADDRESS_P (sym)
7569 && aarch64_pcrelative_literal_loads)));
7571 return false;
7573 case LO_SUM:
7574 info->type = ADDRESS_LO_SUM;
7575 info->base = XEXP (x, 0);
7576 info->offset = XEXP (x, 1);
7577 if (allow_reg_index_p
7578 && aarch64_base_register_rtx_p (info->base, strict_p))
7580 rtx sym, offs;
7581 split_const (info->offset, &sym, &offs);
7582 if (GET_CODE (sym) == SYMBOL_REF
7583 && (aarch64_classify_symbol (sym, INTVAL (offs))
7584 == SYMBOL_SMALL_ABSOLUTE))
7586 /* The symbol and offset must be aligned to the access size. */
7587 unsigned int align;
7589 if (CONSTANT_POOL_ADDRESS_P (sym))
7590 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7591 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7593 tree exp = SYMBOL_REF_DECL (sym);
7594 align = TYPE_ALIGN (TREE_TYPE (exp));
7595 align = aarch64_constant_alignment (exp, align);
7597 else if (SYMBOL_REF_DECL (sym))
7598 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7599 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7600 && SYMBOL_REF_BLOCK (sym) != NULL)
7601 align = SYMBOL_REF_BLOCK (sym)->alignment;
7602 else
7603 align = BITS_PER_UNIT;
7605 poly_int64 ref_size = GET_MODE_SIZE (mode);
7606 if (known_eq (ref_size, 0))
7607 ref_size = GET_MODE_SIZE (DImode);
7609 return (multiple_p (INTVAL (offs), ref_size)
7610 && multiple_p (align / BITS_PER_UNIT, ref_size));
7613 return false;
7615 default:
7616 return false;
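/* A few illustrative classifications (LP64, DImode access, arbitrary
   register numbers):
     (reg:DI x0)                                  -> ADDRESS_REG_IMM, offset 0
     (plus:DI (reg:DI x0) (const_int 32))         -> ADDRESS_REG_IMM
     (plus:DI (reg:DI x0)
              (ashift:DI (reg:DI x1) (const_int 3)))
                                                  -> ADDRESS_REG_REG, shift 3
     (post_inc:DI (reg:DI x0))                    -> ADDRESS_REG_WB
     (lo_sum:DI (reg:DI x0) (symbol_ref "var"))   -> ADDRESS_LO_SUM, provided
                                                     the symbol is suitably
                                                     aligned and classified
                                                     SYMBOL_SMALL_ABSOLUTE.  */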
7620 /* Return true if the address X is valid for a PRFM instruction.
7621 STRICT_P is true if we should do strict checking with
7622 aarch64_classify_address. */
7624 bool
7625 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7627 struct aarch64_address_info addr;
7629 /* PRFM accepts the same addresses as DImode... */
7630 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7631 if (!res)
7632 return false;
7634 /* ... except writeback forms. */
7635 return addr.type != ADDRESS_REG_WB;
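/* For example, [x0, 8] is a valid PRFM address, but the writeback form
   [x0], 8 (POST_MODIFY) is rejected even though it is a valid DImode
   load/store address.  */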
7638 bool
7639 aarch64_symbolic_address_p (rtx x)
7641 rtx offset;
7643 split_const (x, &x, &offset);
7644 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7647 /* Classify the base of symbolic expression X. */
7649 enum aarch64_symbol_type
7650 aarch64_classify_symbolic_expression (rtx x)
7652 rtx offset;
7654 split_const (x, &x, &offset);
7655 return aarch64_classify_symbol (x, INTVAL (offset));
7659 /* Return TRUE if X is a legitimate address for accessing memory in
7660 mode MODE. */
7661 static bool
7662 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7664 struct aarch64_address_info addr;
7666 return aarch64_classify_address (&addr, x, mode, strict_p);
7669 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7670 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7671 bool
7672 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7673 aarch64_addr_query_type type)
7675 struct aarch64_address_info addr;
7677 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7680 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7682 static bool
7683 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7684 poly_int64 orig_offset,
7685 machine_mode mode)
7687 HOST_WIDE_INT size;
7688 if (GET_MODE_SIZE (mode).is_constant (&size))
7690 HOST_WIDE_INT const_offset, second_offset;
7692 /* A general SVE offset is A * VQ + B. Remove the A component from
7693 coefficient 0 in order to get the constant B. */
7694 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7696 /* Split an out-of-range address displacement into a base and
7697 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7698 range otherwise to increase opportunities for sharing the base
7699 address between accesses of different sizes. Unaligned accesses use the signed
7700 9-bit range, TImode/TFmode use the intersection of signed
7701 scaled 7-bit and signed 9-bit offset. */
7702 if (mode == TImode || mode == TFmode)
7703 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7704 else if ((const_offset & (size - 1)) != 0)
7705 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7706 else
7707 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7709 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7710 return false;
7712 /* Split the offset into second_offset and the rest. */
7713 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7714 *offset2 = gen_int_mode (second_offset, Pmode);
7715 return true;
7717 else
7719 /* Get the mode we should use as the basis of the range. For structure
7720 modes this is the mode of one vector. */
7721 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7722 machine_mode step_mode
7723 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7725 /* Get the "mul vl" multiplier we'd like to use. */
7726 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7727 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7728 if (vec_flags & VEC_SVE_DATA)
7729 /* LDR supports a 9-bit range, but the move patterns for
7730 structure modes require all vectors to be in range of the
7731 same base. The simplest way of accommodating that while still
7732 promoting reuse of anchor points between different modes is
7733 to use an 8-bit range unconditionally. */
7734 vnum = ((vnum + 128) & 255) - 128;
7735 else
7736 /* Predicates are only handled singly, so we might as well use
7737 the full range. */
7738 vnum = ((vnum + 256) & 511) - 256;
7739 if (vnum == 0)
7740 return false;
7742 /* Convert the "mul vl" multiplier into a byte offset. */
7743 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7744 if (known_eq (second_offset, orig_offset))
7745 return false;
7747 /* Split the offset into second_offset and the rest. */
7748 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7749 *offset2 = gen_int_mode (second_offset, Pmode);
7750 return true;
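/* Worked example for the constant-size path (numbers are illustrative):
   a DImode access at base + 40000 is out of range, the offset is aligned
   and the access size is >= 4 bytes, so second_offset = 40000 & 0x3ffc
   = 7232 and the split is 40000 = 32768 + 7232.  The add of 32768 can be
   shared with neighbouring accesses while 7232 fits the scaled 12-bit
   LDR/STR offset range.  */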
7754 /* Return the binary representation of floating point constant VALUE in INTVAL.
7755 If the value cannot be converted, return false without setting INTVAL.
7756 The conversion is done in the given MODE. */
7757 bool
7758 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7761 /* We make a general exception for 0. */
7762 if (aarch64_float_const_zero_rtx_p (value))
7764 *intval = 0;
7765 return true;
7768 scalar_float_mode mode;
7769 if (GET_CODE (value) != CONST_DOUBLE
7770 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7771 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7772 /* Only support up to DF mode. */
7773 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7774 return false;
7776 unsigned HOST_WIDE_INT ival = 0;
7778 long res[2];
7779 real_to_target (res,
7780 CONST_DOUBLE_REAL_VALUE (value),
7781 REAL_MODE_FORMAT (mode));
7783 if (mode == DFmode)
7785 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7786 ival = zext_hwi (res[order], 32);
7787 ival |= (zext_hwi (res[1 - order], 32) << 32);
7789 else
7790 ival = zext_hwi (res[0], 32);
7792 *intval = ival;
7793 return true;
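/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0 as 0x3f800000 (the usual IEEE 754 bit
   patterns); HFmode and SFmode values occupy the low bits of *INTVAL.  */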
7796 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7797 single MOV(+MOVK) followed by an FMOV. */
7798 bool
7799 aarch64_float_const_rtx_p (rtx x)
7801 machine_mode mode = GET_MODE (x);
7802 if (mode == VOIDmode)
7803 return false;
7805 /* Determine whether it's cheaper to write float constants as
7806 mov/movk pairs rather than ldr/adrp pairs. */
7807 unsigned HOST_WIDE_INT ival;
7809 if (GET_CODE (x) == CONST_DOUBLE
7810 && SCALAR_FLOAT_MODE_P (mode)
7811 && aarch64_reinterpret_float_as_int (x, &ival))
7813 scalar_int_mode imode = (mode == HFmode
7814 ? SImode
7815 : int_mode_for_mode (mode).require ());
7816 int num_instr = aarch64_internal_mov_immediate
7817 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7818 return num_instr < 3;
7821 return false;
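/* For example, 1.0 in DFmode reinterprets as 0x3ff0000000000000, which a
   single MOVZ (#0x3ff0, LSL #48) can materialize, so MOV+FMOV is preferred
   over an ADRP+LDR literal load; constants whose bit pattern would need
   three or more MOV/MOVK instructions are rejected here.  */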
7824 /* Return TRUE if rtx X is immediate constant 0.0 */
7825 bool
7826 aarch64_float_const_zero_rtx_p (rtx x)
7828 if (GET_MODE (x) == VOIDmode)
7829 return false;
7831 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7832 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7833 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7836 /* Return TRUE if rtx X is immediate constant that fits in a single
7837 MOVI immediate operation. */
7838 bool
7839 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7841 if (!TARGET_SIMD)
7842 return false;
7844 machine_mode vmode;
7845 scalar_int_mode imode;
7846 unsigned HOST_WIDE_INT ival;
7848 if (GET_CODE (x) == CONST_DOUBLE
7849 && SCALAR_FLOAT_MODE_P (mode))
7851 if (!aarch64_reinterpret_float_as_int (x, &ival))
7852 return false;
7854 /* We make a general exception for 0. */
7855 if (aarch64_float_const_zero_rtx_p (x))
7856 return true;
7858 imode = int_mode_for_mode (mode).require ();
7860 else if (GET_CODE (x) == CONST_INT
7861 && is_a <scalar_int_mode> (mode, &imode))
7862 ival = INTVAL (x);
7863 else
7864 return false;
7866 /* Use a 64-bit container mode for everything except DImode/DFmode,
7867 where we use a 128-bit vector mode. */
7868 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7870 vmode = aarch64_simd_container_mode (imode, width);
7871 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7873 return aarch64_simd_valid_immediate (v_op, NULL);
7877 /* Return the fixed registers used for condition codes. */
7879 static bool
7880 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7882 *p1 = CC_REGNUM;
7883 *p2 = INVALID_REGNUM;
7884 return true;
7887 /* This function is used by the call expanders of the machine description.
7888 RESULT is the register in which the result is returned. It's NULL for
7889 "call" and "sibcall".
7890 MEM is the location of the function call.
7891 SIBCALL indicates whether this is a normal call or a sibling call;
7892 a different pattern is generated accordingly. */
7894 void
7895 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7897 rtx call, callee, tmp;
7898 rtvec vec;
7899 machine_mode mode;
7901 gcc_assert (MEM_P (mem));
7902 callee = XEXP (mem, 0);
7903 mode = GET_MODE (callee);
7904 gcc_assert (mode == Pmode);
7906 /* Decide if we should generate indirect calls by loading the
7907 address of the callee into a register before performing
7908 the branch-and-link. */
7909 if (SYMBOL_REF_P (callee)
7910 ? (aarch64_is_long_call_p (callee)
7911 || aarch64_is_noplt_call_p (callee))
7912 : !REG_P (callee))
7913 XEXP (mem, 0) = force_reg (mode, callee);
7915 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7917 if (result != NULL_RTX)
7918 call = gen_rtx_SET (result, call);
7920 if (sibcall)
7921 tmp = ret_rtx;
7922 else
7923 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7925 vec = gen_rtvec (2, call, tmp);
7926 call = gen_rtx_PARALLEL (VOIDmode, vec);
7928 aarch64_emit_call_insn (call);
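/* Roughly, a normal call whose value is returned in x0 ends up as
     (parallel [(set (reg:DI x0)
                     (call (mem:DI (reg/symbol)) (const_int 0)))
                (clobber (reg:DI LR_REGNUM))])
   while a sibcall uses (return) in place of the LR clobber (the exact
   modes and operands here are illustrative).  */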
7931 /* Emit call insn with PAT and do aarch64-specific handling. */
7933 void
7934 aarch64_emit_call_insn (rtx pat)
7936 rtx insn = emit_call_insn (pat);
7938 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7939 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7940 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7943 machine_mode
7944 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7946 machine_mode mode_x = GET_MODE (x);
7947 rtx_code code_x = GET_CODE (x);
7949 /* All floating point compares return CCFP if it is an equality
7950 comparison, and CCFPE otherwise. */
7951 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7953 switch (code)
7955 case EQ:
7956 case NE:
7957 case UNORDERED:
7958 case ORDERED:
7959 case UNLT:
7960 case UNLE:
7961 case UNGT:
7962 case UNGE:
7963 case UNEQ:
7964 return CCFPmode;
7966 case LT:
7967 case LE:
7968 case GT:
7969 case GE:
7970 case LTGT:
7971 return CCFPEmode;
7973 default:
7974 gcc_unreachable ();
7978 /* Equality comparisons of short modes against zero can be performed
7979 using the TST instruction with the appropriate bitmask. */
7980 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7981 && (code == EQ || code == NE)
7982 && (mode_x == HImode || mode_x == QImode))
7983 return CC_NZmode;
7985 /* Similarly, comparisons of zero_extends from shorter modes can
7986 be performed using an ANDS with an immediate mask. */
7987 if (y == const0_rtx && code_x == ZERO_EXTEND
7988 && (mode_x == SImode || mode_x == DImode)
7989 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7990 && (code == EQ || code == NE))
7991 return CC_NZmode;
7993 if ((mode_x == SImode || mode_x == DImode)
7994 && y == const0_rtx
7995 && (code == EQ || code == NE || code == LT || code == GE)
7996 && (code_x == PLUS || code_x == MINUS || code_x == AND
7997 || code_x == NEG
7998 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7999 && CONST_INT_P (XEXP (x, 2)))))
8000 return CC_NZmode;
8002 /* A compare with a shifted operand. Because of canonicalization,
8003 the comparison will have to be swapped when we emit the assembly
8004 code. */
8005 if ((mode_x == SImode || mode_x == DImode)
8006 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8007 && (code_x == ASHIFT || code_x == ASHIFTRT
8008 || code_x == LSHIFTRT
8009 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8010 return CC_SWPmode;
8012 /* Similarly for a negated operand, but we can only do this for
8013 equalities. */
8014 if ((mode_x == SImode || mode_x == DImode)
8015 && (REG_P (y) || GET_CODE (y) == SUBREG)
8016 && (code == EQ || code == NE)
8017 && code_x == NEG)
8018 return CC_Zmode;
8020 /* A test for unsigned overflow from an addition. */
8021 if ((mode_x == DImode || mode_x == TImode)
8022 && (code == LTU || code == GEU)
8023 && code_x == PLUS
8024 && rtx_equal_p (XEXP (x, 0), y))
8025 return CC_Cmode;
8027 /* A test for unsigned overflow from an add with carry. */
8028 if ((mode_x == DImode || mode_x == TImode)
8029 && (code == LTU || code == GEU)
8030 && code_x == PLUS
8031 && CONST_SCALAR_INT_P (y)
8032 && (rtx_mode_t (y, mode_x)
8033 == (wi::shwi (1, mode_x)
8034 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8035 return CC_ADCmode;
8037 /* A test for signed overflow. */
8038 if ((mode_x == DImode || mode_x == TImode)
8039 && code == NE
8040 && code_x == PLUS
8041 && GET_CODE (y) == SIGN_EXTEND)
8042 return CC_Vmode;
8044 /* For everything else, return CCmode. */
8045 return CCmode;
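/* For example, (compare (plus:DI x y) (const_int 0)) tested with EQ
   selects CC_NZmode, so the addition and the comparison can be combined
   into a single ADDS; a compare whose first operand is a shift, such as
   (compare (ashift:DI x n) y), selects CC_SWPmode because the CMP has to
   be emitted with its operands swapped.  */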
8048 static int
8049 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8052 aarch64_get_condition_code (rtx x)
8054 machine_mode mode = GET_MODE (XEXP (x, 0));
8055 enum rtx_code comp_code = GET_CODE (x);
8057 if (GET_MODE_CLASS (mode) != MODE_CC)
8058 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8059 return aarch64_get_condition_code_1 (mode, comp_code);
8062 static int
8063 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8065 switch (mode)
8067 case E_CCFPmode:
8068 case E_CCFPEmode:
8069 switch (comp_code)
8071 case GE: return AARCH64_GE;
8072 case GT: return AARCH64_GT;
8073 case LE: return AARCH64_LS;
8074 case LT: return AARCH64_MI;
8075 case NE: return AARCH64_NE;
8076 case EQ: return AARCH64_EQ;
8077 case ORDERED: return AARCH64_VC;
8078 case UNORDERED: return AARCH64_VS;
8079 case UNLT: return AARCH64_LT;
8080 case UNLE: return AARCH64_LE;
8081 case UNGT: return AARCH64_HI;
8082 case UNGE: return AARCH64_PL;
8083 default: return -1;
8085 break;
8087 case E_CCmode:
8088 switch (comp_code)
8090 case NE: return AARCH64_NE;
8091 case EQ: return AARCH64_EQ;
8092 case GE: return AARCH64_GE;
8093 case GT: return AARCH64_GT;
8094 case LE: return AARCH64_LE;
8095 case LT: return AARCH64_LT;
8096 case GEU: return AARCH64_CS;
8097 case GTU: return AARCH64_HI;
8098 case LEU: return AARCH64_LS;
8099 case LTU: return AARCH64_CC;
8100 default: return -1;
8102 break;
8104 case E_CC_SWPmode:
8105 switch (comp_code)
8107 case NE: return AARCH64_NE;
8108 case EQ: return AARCH64_EQ;
8109 case GE: return AARCH64_LE;
8110 case GT: return AARCH64_LT;
8111 case LE: return AARCH64_GE;
8112 case LT: return AARCH64_GT;
8113 case GEU: return AARCH64_LS;
8114 case GTU: return AARCH64_CC;
8115 case LEU: return AARCH64_CS;
8116 case LTU: return AARCH64_HI;
8117 default: return -1;
8119 break;
8121 case E_CC_NZCmode:
8122 switch (comp_code)
8124 case NE: return AARCH64_NE; /* = any */
8125 case EQ: return AARCH64_EQ; /* = none */
8126 case GE: return AARCH64_PL; /* = nfrst */
8127 case LT: return AARCH64_MI; /* = first */
8128 case GEU: return AARCH64_CS; /* = nlast */
8129 case GTU: return AARCH64_HI; /* = pmore */
8130 case LEU: return AARCH64_LS; /* = plast */
8131 case LTU: return AARCH64_CC; /* = last */
8132 default: return -1;
8134 break;
8136 case E_CC_NZmode:
8137 switch (comp_code)
8139 case NE: return AARCH64_NE;
8140 case EQ: return AARCH64_EQ;
8141 case GE: return AARCH64_PL;
8142 case LT: return AARCH64_MI;
8143 default: return -1;
8145 break;
8147 case E_CC_Zmode:
8148 switch (comp_code)
8150 case NE: return AARCH64_NE;
8151 case EQ: return AARCH64_EQ;
8152 default: return -1;
8154 break;
8156 case E_CC_Cmode:
8157 switch (comp_code)
8159 case LTU: return AARCH64_CS;
8160 case GEU: return AARCH64_CC;
8161 default: return -1;
8163 break;
8165 case E_CC_ADCmode:
8166 switch (comp_code)
8168 case GEU: return AARCH64_CS;
8169 case LTU: return AARCH64_CC;
8170 default: return -1;
8172 break;
8174 case E_CC_Vmode:
8175 switch (comp_code)
8177 case NE: return AARCH64_VS;
8178 case EQ: return AARCH64_VC;
8179 default: return -1;
8181 break;
8183 default:
8184 return -1;
8187 return -1;
8190 bool
8191 aarch64_const_vec_all_same_in_range_p (rtx x,
8192 HOST_WIDE_INT minval,
8193 HOST_WIDE_INT maxval)
8195 rtx elt;
8196 return (const_vec_duplicate_p (x, &elt)
8197 && CONST_INT_P (elt)
8198 && IN_RANGE (INTVAL (elt), minval, maxval));
8201 bool
8202 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8204 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8207 /* Return true if VEC is a constant in which every element is in the range
8208 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8210 static bool
8211 aarch64_const_vec_all_in_range_p (rtx vec,
8212 HOST_WIDE_INT minval,
8213 HOST_WIDE_INT maxval)
8215 if (GET_CODE (vec) != CONST_VECTOR
8216 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8217 return false;
8219 int nunits;
8220 if (!CONST_VECTOR_STEPPED_P (vec))
8221 nunits = const_vector_encoded_nelts (vec);
8222 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8223 return false;
8225 for (int i = 0; i < nunits; i++)
8227 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8228 if (!CONST_INT_P (vec_elem)
8229 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8230 return false;
8232 return true;
8235 /* N Z C V. */
8236 #define AARCH64_CC_V 1
8237 #define AARCH64_CC_C (1 << 1)
8238 #define AARCH64_CC_Z (1 << 2)
8239 #define AARCH64_CC_N (1 << 3)
8241 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8242 static const int aarch64_nzcv_codes[] =
8244 0, /* EQ, Z == 1. */
8245 AARCH64_CC_Z, /* NE, Z == 0. */
8246 0, /* CS, C == 1. */
8247 AARCH64_CC_C, /* CC, C == 0. */
8248 0, /* MI, N == 1. */
8249 AARCH64_CC_N, /* PL, N == 0. */
8250 0, /* VS, V == 1. */
8251 AARCH64_CC_V, /* VC, V == 0. */
8252 0, /* HI, C == 1 && Z == 0. */
8253 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8254 AARCH64_CC_V, /* GE, N == V. */
8255 0, /* LT, N != V. */
8256 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8257 0, /* LE, !(Z == 0 && N == V). */
8258 0, /* AL, Any. */
8259 0 /* NV, Any. */
8262 /* Print floating-point vector immediate operand X to F, negating it
8263 first if NEGATE is true. Return true on success, false if it isn't
8264 a constant we can handle. */
8266 static bool
8267 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8269 rtx elt;
8271 if (!const_vec_duplicate_p (x, &elt))
8272 return false;
8274 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8275 if (negate)
8276 r = real_value_negate (&r);
8278 /* We only handle the SVE single-bit immediates here. */
8279 if (real_equal (&r, &dconst0))
8280 asm_fprintf (f, "0.0");
8281 else if (real_equal (&r, &dconst1))
8282 asm_fprintf (f, "1.0");
8283 else if (real_equal (&r, &dconsthalf))
8284 asm_fprintf (f, "0.5");
8285 else
8286 return false;
8288 return true;
8291 /* Return the equivalent letter for size. */
8292 static char
8293 sizetochar (int size)
8295 switch (size)
8297 case 64: return 'd';
8298 case 32: return 's';
8299 case 16: return 'h';
8300 case 8 : return 'b';
8301 default: gcc_unreachable ();
8305 /* Print operand X to file F in a target specific manner according to CODE.
8306 The acceptable formatting commands given by CODE are:
8307 'c': An integer or symbol address without a preceding #
8308 sign.
8309 'C': Take the duplicated element in a vector constant
8310 and print it in hex.
8311 'D': Take the duplicated element in a vector constant
8312 and print it as an unsigned integer, in decimal.
8313 'e': Print the sign/zero-extend size as a character 8->b,
8314 16->h, 32->w.
8315 'p': Prints N such that 2^N == X (X must be a power of 2 and
8316 a const_int).
8317 'P': Print the number of non-zero bits in X (a const_int).
8318 'H': Print the higher numbered register of a pair (TImode)
8319 of regs.
8320 'm': Print a condition (eq, ne, etc).
8321 'M': Same as 'm', but invert condition.
8322 'N': Take the duplicated element in a vector constant
8323 and print the negative of it in decimal.
8324 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8325 'S/T/U/V': Print a FP/SIMD register name for a register list.
8326 The register printed is the FP/SIMD register name
8327 of X + 0/1/2/3 for S/T/U/V.
8328 'R': Print a scalar FP/SIMD register name + 1.
8329 'X': Print bottom 16 bits of integer constant in hex.
8330 'w/x': Print a general register name or the zero register
8331 (32-bit or 64-bit).
8332 '0': Print a normal operand; if it's a general register,
8333 then we assume DImode.
8334 'k': Print NZCV for conditional compare instructions.
8335 'A': Output address constant representing the first
8336 argument of X, specifying a relocation offset
8337 if appropriate.
8338 'L': Output constant address specified by X
8339 with a relocation offset if appropriate.
8340 'G': Prints address of X, specifying a PC relative
8341 relocation mode if appropriate.
8342 'y': Output address of LDP or STP - this is used for
8343 some LDP/STPs which don't use a PARALLEL in their
8344 pattern (so the mode needs to be adjusted).
8345 'z': Output address of a typical LDP or STP. */
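/* As a rough illustration of how these modifiers are used (the template
   text below is an example, not quoted from aarch64.md): an output
   template "add\t%w0, %w1, %w2" prints the 32-bit names of three general
   registers (or wzr for a zero), "%x0" prints the 64-bit name, and "%d1"
   prints the scalar D-register name of a vector register.  */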
8347 static void
8348 aarch64_print_operand (FILE *f, rtx x, int code)
8350 rtx elt;
8351 switch (code)
8353 case 'c':
8354 switch (GET_CODE (x))
8356 case CONST_INT:
8357 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8358 break;
8360 case SYMBOL_REF:
8361 output_addr_const (f, x);
8362 break;
8364 case CONST:
8365 if (GET_CODE (XEXP (x, 0)) == PLUS
8366 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8368 output_addr_const (f, x);
8369 break;
8371 /* Fall through. */
8373 default:
8374 output_operand_lossage ("unsupported operand for code '%c'", code);
8376 break;
8378 case 'e':
8380 int n;
8382 if (!CONST_INT_P (x)
8383 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
8385 output_operand_lossage ("invalid operand for '%%%c'", code);
8386 return;
8389 switch (n)
8391 case 3:
8392 fputc ('b', f);
8393 break;
8394 case 4:
8395 fputc ('h', f);
8396 break;
8397 case 5:
8398 fputc ('w', f);
8399 break;
8400 default:
8401 output_operand_lossage ("invalid operand for '%%%c'", code);
8402 return;
8405 break;
8407 case 'p':
8409 int n;
8411 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8413 output_operand_lossage ("invalid operand for '%%%c'", code);
8414 return;
8417 asm_fprintf (f, "%d", n);
8419 break;
8421 case 'P':
8422 if (!CONST_INT_P (x))
8424 output_operand_lossage ("invalid operand for '%%%c'", code);
8425 return;
8428 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8429 break;
8431 case 'H':
8432 if (x == const0_rtx)
8434 asm_fprintf (f, "xzr");
8435 break;
8438 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8440 output_operand_lossage ("invalid operand for '%%%c'", code);
8441 return;
8444 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8445 break;
8447 case 'M':
8448 case 'm':
8450 int cond_code;
8451 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8452 if (x == const_true_rtx)
8454 if (code == 'M')
8455 fputs ("nv", f);
8456 return;
8459 if (!COMPARISON_P (x))
8461 output_operand_lossage ("invalid operand for '%%%c'", code);
8462 return;
8465 cond_code = aarch64_get_condition_code (x);
8466 gcc_assert (cond_code >= 0);
8467 if (code == 'M')
8468 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8469 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8470 fputs (aarch64_sve_condition_codes[cond_code], f);
8471 else
8472 fputs (aarch64_condition_codes[cond_code], f);
8474 break;
8476 case 'N':
8477 if (!const_vec_duplicate_p (x, &elt))
8479 output_operand_lossage ("invalid vector constant");
8480 return;
8483 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8484 asm_fprintf (f, "%wd", -INTVAL (elt));
8485 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8486 && aarch64_print_vector_float_operand (f, x, true))
8488 else
8490 output_operand_lossage ("invalid vector constant");
8491 return;
8493 break;
8495 case 'b':
8496 case 'h':
8497 case 's':
8498 case 'd':
8499 case 'q':
8500 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8502 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8503 return;
8505 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8506 break;
8508 case 'S':
8509 case 'T':
8510 case 'U':
8511 case 'V':
8512 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8514 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8515 return;
8517 asm_fprintf (f, "%c%d",
8518 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8519 REGNO (x) - V0_REGNUM + (code - 'S'));
8520 break;
8522 case 'R':
8523 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8525 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8526 return;
8528 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8529 break;
8531 case 'X':
8532 if (!CONST_INT_P (x))
8534 output_operand_lossage ("invalid operand for '%%%c'", code);
8535 return;
8537 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8538 break;
8540 case 'C':
8542 /* Print a replicated constant in hex. */
8543 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8545 output_operand_lossage ("invalid operand for '%%%c'", code);
8546 return;
8548 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8549 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8551 break;
8553 case 'D':
8555 /* Print a replicated constant in decimal, treating it as
8556 unsigned. */
8557 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8559 output_operand_lossage ("invalid operand for '%%%c'", code);
8560 return;
8562 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8563 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8565 break;
8567 case 'w':
8568 case 'x':
8569 if (x == const0_rtx
8570 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8572 asm_fprintf (f, "%czr", code);
8573 break;
8576 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8578 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8579 break;
8582 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8584 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8585 break;
8588 /* Fall through */
8590 case 0:
8591 if (x == NULL)
8593 output_operand_lossage ("missing operand");
8594 return;
8597 switch (GET_CODE (x))
8599 case REG:
8600 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8602 if (REG_NREGS (x) == 1)
8603 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8604 else
8606 char suffix
8607 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8608 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8609 REGNO (x) - V0_REGNUM, suffix,
8610 END_REGNO (x) - V0_REGNUM - 1, suffix);
8613 else
8614 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8615 break;
8617 case MEM:
8618 output_address (GET_MODE (x), XEXP (x, 0));
8619 break;
8621 case LABEL_REF:
8622 case SYMBOL_REF:
8623 output_addr_const (asm_out_file, x);
8624 break;
8626 case CONST_INT:
8627 asm_fprintf (f, "%wd", INTVAL (x));
8628 break;
8630 case CONST:
8631 if (!VECTOR_MODE_P (GET_MODE (x)))
8633 output_addr_const (asm_out_file, x);
8634 break;
8636 /* fall through */
8638 case CONST_VECTOR:
8639 if (!const_vec_duplicate_p (x, &elt))
8641 output_operand_lossage ("invalid vector constant");
8642 return;
8645 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8646 asm_fprintf (f, "%wd", INTVAL (elt));
8647 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8648 && aarch64_print_vector_float_operand (f, x, false))
8650 else
8652 output_operand_lossage ("invalid vector constant");
8653 return;
8655 break;
8657 case CONST_DOUBLE:
8658 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8659 be getting CONST_DOUBLEs holding integers. */
8660 gcc_assert (GET_MODE (x) != VOIDmode);
8661 if (aarch64_float_const_zero_rtx_p (x))
8663 fputc ('0', f);
8664 break;
8666 else if (aarch64_float_const_representable_p (x))
8668 #define buf_size 20
8669 char float_buf[buf_size] = {'\0'};
8670 real_to_decimal_for_mode (float_buf,
8671 CONST_DOUBLE_REAL_VALUE (x),
8672 buf_size, buf_size,
8673 1, GET_MODE (x));
8674 asm_fprintf (asm_out_file, "%s", float_buf);
8675 break;
8676 #undef buf_size
8678 output_operand_lossage ("invalid constant");
8679 return;
8680 default:
8681 output_operand_lossage ("invalid operand");
8682 return;
8684 break;
8686 case 'A':
8687 if (GET_CODE (x) == HIGH)
8688 x = XEXP (x, 0);
8690 switch (aarch64_classify_symbolic_expression (x))
8692 case SYMBOL_SMALL_GOT_4G:
8693 asm_fprintf (asm_out_file, ":got:");
8694 break;
8696 case SYMBOL_SMALL_TLSGD:
8697 asm_fprintf (asm_out_file, ":tlsgd:");
8698 break;
8700 case SYMBOL_SMALL_TLSDESC:
8701 asm_fprintf (asm_out_file, ":tlsdesc:");
8702 break;
8704 case SYMBOL_SMALL_TLSIE:
8705 asm_fprintf (asm_out_file, ":gottprel:");
8706 break;
8708 case SYMBOL_TLSLE24:
8709 asm_fprintf (asm_out_file, ":tprel:");
8710 break;
8712 case SYMBOL_TINY_GOT:
8713 gcc_unreachable ();
8714 break;
8716 default:
8717 break;
8719 output_addr_const (asm_out_file, x);
8720 break;
8722 case 'L':
8723 switch (aarch64_classify_symbolic_expression (x))
8725 case SYMBOL_SMALL_GOT_4G:
8726 asm_fprintf (asm_out_file, ":lo12:");
8727 break;
8729 case SYMBOL_SMALL_TLSGD:
8730 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8731 break;
8733 case SYMBOL_SMALL_TLSDESC:
8734 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8735 break;
8737 case SYMBOL_SMALL_TLSIE:
8738 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8739 break;
8741 case SYMBOL_TLSLE12:
8742 asm_fprintf (asm_out_file, ":tprel_lo12:");
8743 break;
8745 case SYMBOL_TLSLE24:
8746 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8747 break;
8749 case SYMBOL_TINY_GOT:
8750 asm_fprintf (asm_out_file, ":got:");
8751 break;
8753 case SYMBOL_TINY_TLSIE:
8754 asm_fprintf (asm_out_file, ":gottprel:");
8755 break;
8757 default:
8758 break;
8760 output_addr_const (asm_out_file, x);
8761 break;
8763 case 'G':
8764 switch (aarch64_classify_symbolic_expression (x))
8766 case SYMBOL_TLSLE24:
8767 asm_fprintf (asm_out_file, ":tprel_hi12:");
8768 break;
8769 default:
8770 break;
8772 output_addr_const (asm_out_file, x);
8773 break;
8775 case 'k':
8777 HOST_WIDE_INT cond_code;
8779 if (!CONST_INT_P (x))
8781 output_operand_lossage ("invalid operand for '%%%c'", code);
8782 return;
8785 cond_code = INTVAL (x);
8786 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8787 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8789 break;
8791 case 'y':
8792 case 'z':
8794 machine_mode mode = GET_MODE (x);
8796 if (GET_CODE (x) != MEM
8797 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8799 output_operand_lossage ("invalid operand for '%%%c'", code);
8800 return;
8803 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8804 code == 'y'
8805 ? ADDR_QUERY_LDP_STP_N
8806 : ADDR_QUERY_LDP_STP))
8807 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8809 break;
8811 default:
8812 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8813 return;
8817 /* Print address 'x' of a memory access with mode 'mode'.
8818 'type' is the aarch64_addr_query_type context required by
8819 aarch64_classify_address; it distinguishes plain accesses from LDP/STP. */
8820 static bool
8821 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8822 aarch64_addr_query_type type)
8824 struct aarch64_address_info addr;
8825 unsigned int size;
8827 /* Check all addresses are Pmode - including ILP32. */
8828 if (GET_MODE (x) != Pmode
8829 && (!CONST_INT_P (x)
8830 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8832 output_operand_lossage ("invalid address mode");
8833 return false;
8836 if (aarch64_classify_address (&addr, x, mode, true, type))
8837 switch (addr.type)
8839 case ADDRESS_REG_IMM:
8840 if (known_eq (addr.const_offset, 0))
8841 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8842 else if (aarch64_sve_data_mode_p (mode))
8844 HOST_WIDE_INT vnum
8845 = exact_div (addr.const_offset,
8846 BYTES_PER_SVE_VECTOR).to_constant ();
8847 asm_fprintf (f, "[%s, #%wd, mul vl]",
8848 reg_names[REGNO (addr.base)], vnum);
8850 else if (aarch64_sve_pred_mode_p (mode))
8852 HOST_WIDE_INT vnum
8853 = exact_div (addr.const_offset,
8854 BYTES_PER_SVE_PRED).to_constant ();
8855 asm_fprintf (f, "[%s, #%wd, mul vl]",
8856 reg_names[REGNO (addr.base)], vnum);
8858 else
8859 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8860 INTVAL (addr.offset));
8861 return true;
8863 case ADDRESS_REG_REG:
8864 if (addr.shift == 0)
8865 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8866 reg_names [REGNO (addr.offset)]);
8867 else
8868 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8869 reg_names [REGNO (addr.offset)], addr.shift);
8870 return true;
8872 case ADDRESS_REG_UXTW:
8873 if (addr.shift == 0)
8874 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8875 REGNO (addr.offset) - R0_REGNUM);
8876 else
8877 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8878 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8879 return true;
8881 case ADDRESS_REG_SXTW:
8882 if (addr.shift == 0)
8883 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8884 REGNO (addr.offset) - R0_REGNUM);
8885 else
8886 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8887 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8888 return true;
8890 case ADDRESS_REG_WB:
8891 /* Writeback is only supported for fixed-width modes. */
8892 size = GET_MODE_SIZE (mode).to_constant ();
8893 switch (GET_CODE (x))
8895 case PRE_INC:
8896 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8897 return true;
8898 case POST_INC:
8899 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8900 return true;
8901 case PRE_DEC:
8902 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8903 return true;
8904 case POST_DEC:
8905 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8906 return true;
8907 case PRE_MODIFY:
8908 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8909 INTVAL (addr.offset));
8910 return true;
8911 case POST_MODIFY:
8912 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8913 INTVAL (addr.offset));
8914 return true;
8915 default:
8916 break;
8918 break;
8920 case ADDRESS_LO_SUM:
8921 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8922 output_addr_const (f, addr.offset);
8923 asm_fprintf (f, "]");
8924 return true;
8926 case ADDRESS_SYMBOLIC:
8927 output_addr_const (f, x);
8928 return true;
8931 return false;
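/* Example renderings (illustrative register numbers):
     ADDRESS_REG_IMM, base x0, offset 16, DImode      -> [x0, 16]
     ADDRESS_REG_IMM, SVE data mode, one vector ahead -> [x0, #1, mul vl]
     ADDRESS_REG_SXTW, base x0, index w1, shift 2     -> [x0, w1, sxtw 2]
     ADDRESS_REG_WB, PRE_INC, 8-byte mode             -> [x0, 8]!  */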
8934 /* Print address 'x' of a memory access with mode 'mode'. */
8935 static void
8936 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8938 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8939 output_addr_const (f, x);
8942 bool
8943 aarch64_label_mentioned_p (rtx x)
8945 const char *fmt;
8946 int i;
8948 if (GET_CODE (x) == LABEL_REF)
8949 return true;
8951 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8952 referencing instruction, but they are constant offsets, not
8953 symbols. */
8954 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8955 return false;
8957 fmt = GET_RTX_FORMAT (GET_CODE (x));
8958 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8960 if (fmt[i] == 'E')
8962 int j;
8964 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8965 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8966 return 1;
8968 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8969 return 1;
8972 return 0;
8975 /* Implement REGNO_REG_CLASS. */
8977 enum reg_class
8978 aarch64_regno_regclass (unsigned regno)
8980 if (GP_REGNUM_P (regno))
8981 return GENERAL_REGS;
8983 if (regno == SP_REGNUM)
8984 return STACK_REG;
8986 if (regno == FRAME_POINTER_REGNUM
8987 || regno == ARG_POINTER_REGNUM)
8988 return POINTER_REGS;
8990 if (FP_REGNUM_P (regno))
8991 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
8992 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
8994 if (PR_REGNUM_P (regno))
8995 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8997 return NO_REGS;
9000 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9001 If OFFSET is out of range, return an offset of an anchor point
9002 that is in range. Return 0 otherwise. */
9004 static HOST_WIDE_INT
9005 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9006 machine_mode mode)
9008 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9009 if (size > 16)
9010 return (offset + 0x400) & ~0x7f0;
9012 /* For offsets that aren't a multiple of the access size, the limit is
9013 -256...255. */
9014 if (offset & (size - 1))
9016 /* BLKmode typically uses LDP of X-registers. */
9017 if (mode == BLKmode)
9018 return (offset + 512) & ~0x3ff;
9019 return (offset + 0x100) & ~0x1ff;
9022 /* Small negative offsets are supported. */
9023 if (IN_RANGE (offset, -256, 0))
9024 return 0;
9026 if (mode == TImode || mode == TFmode)
9027 return (offset + 0x100) & ~0x1ff;
9029 /* Use 12-bit offset by access size. */
9030 return offset & (~0xfff * size);
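/* Worked example (illustrative): an aligned DImode access at offset 70000
   takes the final case, so the anchor is 70000 & (~0xfff * 8) = 65536 and
   the remaining offset of 4464 (558 * 8) fits the scaled 12-bit range.  */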
9033 static rtx
9034 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9036 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9037 where mask is selected by alignment and size of the offset.
9038 We try to pick as large a range for the offset as possible to
9039 maximize the chance of a CSE. However, for aligned addresses
9040 we limit the range to 4k so that structures with different sized
9041 elements are likely to use the same base. We need to be careful
9042 not to split a CONST for some forms of address expression, otherwise
9043 it will generate sub-optimal code. */
9045 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9047 rtx base = XEXP (x, 0);
9048 rtx offset_rtx = XEXP (x, 1);
9049 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9051 if (GET_CODE (base) == PLUS)
9053 rtx op0 = XEXP (base, 0);
9054 rtx op1 = XEXP (base, 1);
9056 /* Force any scaling into a temp for CSE. */
9057 op0 = force_reg (Pmode, op0);
9058 op1 = force_reg (Pmode, op1);
9060 /* Let the pointer register be in op0. */
9061 if (REG_POINTER (op1))
9062 std::swap (op0, op1);
9064 /* If the pointer is virtual or frame related, then we know that
9065 virtual register instantiation or register elimination is going
9066 to apply a second constant. We want the two constants folded
9067 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9068 if (virt_or_elim_regno_p (REGNO (op0)))
9070 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9071 NULL_RTX, true, OPTAB_DIRECT);
9072 return gen_rtx_PLUS (Pmode, base, op1);
9075 /* Otherwise, in order to encourage CSE (and thence loop strength
9076 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9077 base = expand_binop (Pmode, add_optab, op0, op1,
9078 NULL_RTX, true, OPTAB_DIRECT);
9079 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9082 HOST_WIDE_INT size;
9083 if (GET_MODE_SIZE (mode).is_constant (&size))
9085 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9086 mode);
9087 if (base_offset != 0)
9089 base = plus_constant (Pmode, base, base_offset);
9090 base = force_operand (base, NULL_RTX);
9091 return plus_constant (Pmode, base, offset - base_offset);
9096 return x;
9099 static reg_class_t
9100 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9101 reg_class_t rclass,
9102 machine_mode mode,
9103 secondary_reload_info *sri)
9105 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9106 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9107 comment at the head of aarch64-sve.md for more details about the
9108 big-endian handling. */
9109 if (BYTES_BIG_ENDIAN
9110 && reg_class_subset_p (rclass, FP_REGS)
9111 && !((REG_P (x) && HARD_REGISTER_P (x))
9112 || aarch64_simd_valid_immediate (x, NULL))
9113 && aarch64_sve_data_mode_p (mode))
9115 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9116 return NO_REGS;
9119 /* If we have to disable direct literal pool loads and stores because the
9120 function is too big, then we need a scratch register. */
9121 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9122 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9123 || targetm.vector_mode_supported_p (GET_MODE (x)))
9124 && !aarch64_pcrelative_literal_loads)
9126 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9127 return NO_REGS;
9130 /* Without the TARGET_SIMD instructions we cannot move a Q register
9131 to a Q register directly. We need a scratch. */
9132 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9133 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9134 && reg_class_subset_p (rclass, FP_REGS))
9136 sri->icode = code_for_aarch64_reload_mov (mode);
9137 return NO_REGS;
9140 /* A TFmode or TImode memory access should be handled via FP_REGS
9141 because AArch64 has richer addressing modes for LDR/STR instructions
9142 than LDP/STP instructions. */
9143 if (TARGET_FLOAT && rclass == GENERAL_REGS
9144 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9145 return FP_REGS;
9147 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9148 return GENERAL_REGS;
9150 return NO_REGS;
9153 static bool
9154 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9156 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9158 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9159 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9160 if (frame_pointer_needed)
9161 return to == HARD_FRAME_POINTER_REGNUM;
9162 return true;
9165 poly_int64
9166 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9168 if (to == HARD_FRAME_POINTER_REGNUM)
9170 if (from == ARG_POINTER_REGNUM)
9171 return cfun->machine->frame.hard_fp_offset;
9173 if (from == FRAME_POINTER_REGNUM)
9174 return cfun->machine->frame.hard_fp_offset
9175 - cfun->machine->frame.locals_offset;
9178 if (to == STACK_POINTER_REGNUM)
9180 if (from == FRAME_POINTER_REGNUM)
9181 return cfun->machine->frame.frame_size
9182 - cfun->machine->frame.locals_offset;
9185 return cfun->machine->frame.frame_size;
9188 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9189 previous frame. */
9191 rtx
9192 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9194 if (count != 0)
9195 return const0_rtx;
9196 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
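/* The trampoline emitted below has the following layout for LP64 when
BTI is disabled (with BTI, a "bti c" is emitted first, the literal
offsets shrink by 4 and the padding word is dropped):

0: ldr <IP1>, .+16 // load the target address
4: ldr <static chain reg>, .+20 // load the static chain value
8: br <IP1>
12: <padding word>
16: <target address>
24: <static chain value> */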
9200 static void
9201 aarch64_asm_trampoline_template (FILE *f)
9203 int offset1 = 16;
9204 int offset2 = 20;
9206 if (aarch64_bti_enabled ())
9208 asm_fprintf (f, "\thint\t34 // bti c\n");
9209 offset1 -= 4;
9210 offset2 -= 4;
9213 if (TARGET_ILP32)
9215 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9216 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9217 offset1);
9219 else
9221 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9222 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9223 offset2);
9225 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9227 /* The trampoline needs an extra padding instruction. If BTI is
9228 enabled, the padding instruction is replaced by the BTI instruction at
9229 the beginning. */
9230 if (!aarch64_bti_enabled ())
9231 assemble_aligned_integer (4, const0_rtx);
9233 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9234 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9237 static void
9238 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9240 rtx fnaddr, mem, a_tramp;
9241 const int tramp_code_sz = 16;
9243 /* We don't need to copy the trailing D-words; we fill those in below. */
9244 emit_block_move (m_tramp, assemble_trampoline_template (),
9245 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9246 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9247 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9248 if (GET_MODE (fnaddr) != ptr_mode)
9249 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9250 emit_move_insn (mem, fnaddr);
9252 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9253 emit_move_insn (mem, chain_value);
9255 /* XXX We should really define a "clear_cache" pattern and use
9256 gen_clear_cache(). */
9257 a_tramp = XEXP (m_tramp, 0);
9258 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9259 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9260 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9261 ptr_mode);
9264 static unsigned char
9265 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9267 /* ??? Logically we should only need to provide a value when
9268 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9269 can hold MODE, but at the moment we need to handle all modes.
9270 Just ignore any runtime parts for registers that can't store them. */
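/* For example, a V4SImode value needs one FP register
(CEIL (16, UNITS_PER_VREG) == 1), a TImode value in GENERAL_REGS needs
two X registers, and an SVE VNx4SImode value needs exactly one Z
register regardless of the runtime vector length. */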
9271 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9272 unsigned int nregs;
9273 switch (regclass)
9275 case TAILCALL_ADDR_REGS:
9276 case POINTER_REGS:
9277 case GENERAL_REGS:
9278 case ALL_REGS:
9279 case POINTER_AND_FP_REGS:
9280 case FP_REGS:
9281 case FP_LO_REGS:
9282 case FP_LO8_REGS:
9283 if (aarch64_sve_data_mode_p (mode)
9284 && constant_multiple_p (GET_MODE_SIZE (mode),
9285 BYTES_PER_SVE_VECTOR, &nregs))
9286 return nregs;
9287 return (aarch64_vector_data_mode_p (mode)
9288 ? CEIL (lowest_size, UNITS_PER_VREG)
9289 : CEIL (lowest_size, UNITS_PER_WORD));
9290 case STACK_REG:
9291 case PR_REGS:
9292 case PR_LO_REGS:
9293 case PR_HI_REGS:
9294 return 1;
9296 case NO_REGS:
9297 return 0;
9299 default:
9300 break;
9302 gcc_unreachable ();
9305 static reg_class_t
9306 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9308 if (regclass == POINTER_REGS)
9309 return GENERAL_REGS;
9311 if (regclass == STACK_REG)
9313 if (REG_P(x)
9314 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9315 return regclass;
9317 return NO_REGS;
9320 /* Register elimination can result in a request for
9321 SP+constant->FP_REGS. We cannot support such operations, which
9322 use SP as the source and an FP_REG as the destination, so reject
9323 them outright. */
9324 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9326 rtx lhs = XEXP (x, 0);
9328 /* Look through a possible SUBREG introduced by ILP32. */
9329 if (GET_CODE (lhs) == SUBREG)
9330 lhs = SUBREG_REG (lhs);
9332 gcc_assert (REG_P (lhs));
9333 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9334 POINTER_REGS));
9335 return NO_REGS;
9338 return regclass;
9341 void
9342 aarch64_asm_output_labelref (FILE* f, const char *name)
9344 asm_fprintf (f, "%U%s", name);
9347 static void
9348 aarch64_elf_asm_constructor (rtx symbol, int priority)
9350 if (priority == DEFAULT_INIT_PRIORITY)
9351 default_ctor_section_asm_out_constructor (symbol, priority);
9352 else
9354 section *s;
9355 /* Although priority is known to be in the range [0, 65535], so that
9356 18 bytes would be enough, the compiler might not know that. To avoid
9357 a -Wformat-truncation false positive, use a larger size. */
9358 char buf[23];
9359 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9360 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9361 switch_to_section (s);
9362 assemble_align (POINTER_SIZE);
9363 assemble_aligned_integer (POINTER_BYTES, symbol);
9367 static void
9368 aarch64_elf_asm_destructor (rtx symbol, int priority)
9370 if (priority == DEFAULT_INIT_PRIORITY)
9371 default_dtor_section_asm_out_destructor (symbol, priority);
9372 else
9374 section *s;
9375 /* Although priority is known to be in the range [0, 65535], so that
9376 18 bytes would be enough, the compiler might not know that. To avoid
9377 a -Wformat-truncation false positive, use a larger size. */
9378 char buf[23];
9379 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9380 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9381 switch_to_section (s);
9382 assemble_align (POINTER_SIZE);
9383 assemble_aligned_integer (POINTER_BYTES, symbol);
9387 const char*
9388 aarch64_output_casesi (rtx *operands)
9390 char buf[100];
9391 char label[100];
9392 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9393 int index;
9394 static const char *const patterns[4][2] =
9397 "ldrb\t%w3, [%0,%w1,uxtw]",
9398 "add\t%3, %4, %w3, sxtb #2"
9401 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9402 "add\t%3, %4, %w3, sxth #2"
9405 "ldr\t%w3, [%0,%w1,uxtw #2]",
9406 "add\t%3, %4, %w3, sxtw #2"
9408 /* We assume that DImode is only generated when not optimizing and
9409 that we don't really need 64-bit address offsets. That would
9410 imply an object file with 8GB of code in a single function! */
9412 "ldr\t%w3, [%0,%w1,uxtw #2]",
9413 "add\t%3, %4, %w3, sxtw #2"
9417 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9419 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9420 index = exact_log2 (GET_MODE_SIZE (mode));
9422 gcc_assert (index >= 0 && index <= 3);
9424 /* Need to implement table size reduction, by changing the code below. */
9425 output_asm_insn (patterns[index][0], operands);
9426 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9427 snprintf (buf, sizeof (buf),
9428 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9429 output_asm_insn (buf, operands);
9430 output_asm_insn (patterns[index][1], operands);
9431 output_asm_insn ("br\t%3", operands);
9432 assemble_label (asm_out_file, label);
9433 return "";
9437 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9438 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9439 operator. */
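/* For example, aarch64_uxt_size (4, 0xff0) returns 8 (a byte shifted
left by 4, i.e. a UXTB operand), and aarch64_uxt_size (2, 0x3fffc)
returns 16 (a half-word shifted left by 2, i.e. a UXTH operand). */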
9441 int
9442 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9444 if (shift >= 0 && shift <= 3)
9446 int size;
9447 for (size = 8; size <= 32; size *= 2)
9449 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9450 if (mask == bits << shift)
9451 return size;
9454 return 0;
9457 /* Constant pools are per-function only when PC-relative
9458 literal loads are enabled or we are in the large memory
9459 model. */
9461 static inline bool
9462 aarch64_can_use_per_function_literal_pools_p (void)
9464 return (aarch64_pcrelative_literal_loads
9465 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9468 static bool
9469 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9471 /* We can't use blocks for constants when we're using a per-function
9472 constant pool. */
9473 return !aarch64_can_use_per_function_literal_pools_p ();
9476 /* Select appropriate section for constants depending
9477 on where we place literal pools. */
9479 static section *
9480 aarch64_select_rtx_section (machine_mode mode,
9481 rtx x,
9482 unsigned HOST_WIDE_INT align)
9484 if (aarch64_can_use_per_function_literal_pools_p ())
9485 return function_section (current_function_decl);
9487 return default_elf_select_rtx_section (mode, x, align);
9490 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9491 void
9492 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9493 HOST_WIDE_INT offset)
9495 /* When using per-function literal pools, we must ensure that any code
9496 section is aligned to the minimal instruction length, lest we get
9497 errors from the assembler re "unaligned instructions". */
9498 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9499 ASM_OUTPUT_ALIGN (f, 2);
9502 /* Costs. */
9504 /* Helper function for rtx cost calculation. Strip a shift expression
9505 from X. Returns the inner operand if successful, or the original
9506 expression on failure. */
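/* For example, both (ashift:DI (reg:DI x0) (const_int 3)) and
(mult:DI (reg:DI x0) (const_int 8)) strip to (reg:DI x0); a shift by a
register amount is returned unchanged. */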
9507 static rtx
9508 aarch64_strip_shift (rtx x)
9510 rtx op = x;
9512 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9513 we can convert both to ROR during final output. */
9514 if ((GET_CODE (op) == ASHIFT
9515 || GET_CODE (op) == ASHIFTRT
9516 || GET_CODE (op) == LSHIFTRT
9517 || GET_CODE (op) == ROTATERT
9518 || GET_CODE (op) == ROTATE)
9519 && CONST_INT_P (XEXP (op, 1)))
9520 return XEXP (op, 0);
9522 if (GET_CODE (op) == MULT
9523 && CONST_INT_P (XEXP (op, 1))
9524 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9525 return XEXP (op, 0);
9527 return x;
9530 /* Helper function for rtx cost calculation. Strip an extend
9531 expression from X. Returns the inner operand if successful, or the
9532 original expression on failure. We deal with a number of possible
9533 canonicalization variations here. If STRIP_SHIFT is true, then
9534 we can strip off a shift also. */
9535 static rtx
9536 aarch64_strip_extend (rtx x, bool strip_shift)
9538 scalar_int_mode mode;
9539 rtx op = x;
9541 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9542 return op;
9544 /* Zero and sign extraction of a widened value. */
9545 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9546 && XEXP (op, 2) == const0_rtx
9547 && GET_CODE (XEXP (op, 0)) == MULT
9548 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9549 XEXP (op, 1)))
9550 return XEXP (XEXP (op, 0), 0);
9552 /* It can also be represented (for zero-extend) as an AND with an
9553 immediate. */
9554 if (GET_CODE (op) == AND
9555 && GET_CODE (XEXP (op, 0)) == MULT
9556 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9557 && CONST_INT_P (XEXP (op, 1))
9558 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9559 INTVAL (XEXP (op, 1))) != 0)
9560 return XEXP (XEXP (op, 0), 0);
9562 /* Now handle extended register, as this may also have an optional
9563 left shift by 1..4. */
9564 if (strip_shift
9565 && GET_CODE (op) == ASHIFT
9566 && CONST_INT_P (XEXP (op, 1))
9567 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9568 op = XEXP (op, 0);
9570 if (GET_CODE (op) == ZERO_EXTEND
9571 || GET_CODE (op) == SIGN_EXTEND)
9572 op = XEXP (op, 0);
9574 if (op != x)
9575 return op;
9577 return x;
9580 /* Return true iff CODE is a shift supported in combination
9581 with arithmetic instructions. */
9583 static bool
9584 aarch64_shift_p (enum rtx_code code)
9586 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9590 /* Return true iff X is a cheap shift without a sign extend. */
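/* For example, on cores with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND set,
(ashift:DI (reg:DI x0) (const_int 3)) and
(mult:DI (reg:DI x0) (const_int 8)) both count as cheap, whereas a
sign-extended operand or a shift amount greater than 4 does not. */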
9592 static bool
9593 aarch64_cheap_mult_shift_p (rtx x)
9595 rtx op0, op1;
9597 op0 = XEXP (x, 0);
9598 op1 = XEXP (x, 1);
9600 if (!(aarch64_tune_params.extra_tuning_flags
9601 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9602 return false;
9604 if (GET_CODE (op0) == SIGN_EXTEND)
9605 return false;
9607 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9608 && UINTVAL (op1) <= 4)
9609 return true;
9611 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9612 return false;
9614 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9616 if (l2 > 0 && l2 <= 4)
9617 return true;
9619 return false;
9622 /* Helper function for rtx cost calculation. Calculate the cost of
9623 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9624 Return the calculated cost of the expression, recursing manually into
9625 operands where needed. */
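/* For example, (plus:DI (mult:DI (reg:DI x1) (const_int 4)) (reg:DI x2))
is typically costed as a single ADD with a shifted operand
(add x0, x2, x1, lsl #2) rather than as a separate shift plus add. */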
9627 static int
9628 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9630 rtx op0, op1;
9631 const struct cpu_cost_table *extra_cost
9632 = aarch64_tune_params.insn_extra_cost;
9633 int cost = 0;
9634 bool compound_p = (outer == PLUS || outer == MINUS);
9635 machine_mode mode = GET_MODE (x);
9637 gcc_checking_assert (code == MULT);
9639 op0 = XEXP (x, 0);
9640 op1 = XEXP (x, 1);
9642 if (VECTOR_MODE_P (mode))
9643 mode = GET_MODE_INNER (mode);
9645 /* Integer multiply/fma. */
9646 if (GET_MODE_CLASS (mode) == MODE_INT)
9648 /* The multiply will be canonicalized as a shift, cost it as such. */
9649 if (aarch64_shift_p (GET_CODE (x))
9650 || (CONST_INT_P (op1)
9651 && exact_log2 (INTVAL (op1)) > 0))
9653 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9654 || GET_CODE (op0) == SIGN_EXTEND;
9655 if (speed)
9657 if (compound_p)
9659 /* If the shift is considered cheap,
9660 then don't add any cost. */
9661 if (aarch64_cheap_mult_shift_p (x))
9663 else if (REG_P (op1))
9664 /* ARITH + shift-by-register. */
9665 cost += extra_cost->alu.arith_shift_reg;
9666 else if (is_extend)
9667 /* ARITH + extended register. We don't have a cost field
9668 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9669 cost += extra_cost->alu.extend_arith;
9670 else
9671 /* ARITH + shift-by-immediate. */
9672 cost += extra_cost->alu.arith_shift;
9674 else
9675 /* LSL (immediate). */
9676 cost += extra_cost->alu.shift;
9679 /* Strip extends as we will have costed them in the case above. */
9680 if (is_extend)
9681 op0 = aarch64_strip_extend (op0, true);
9683 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9685 return cost;
9688 /* MNEG or [US]MNEGL. Extract the NEG operand, indicate that it's a
9689 compound operation, and let the cases below handle it. After all, MNEG
9690 is a special-case alias of MSUB. */
9691 if (GET_CODE (op0) == NEG)
9693 op0 = XEXP (op0, 0);
9694 compound_p = true;
9697 /* Integer multiplies or FMAs have zero/sign extending variants. */
9698 if ((GET_CODE (op0) == ZERO_EXTEND
9699 && GET_CODE (op1) == ZERO_EXTEND)
9700 || (GET_CODE (op0) == SIGN_EXTEND
9701 && GET_CODE (op1) == SIGN_EXTEND))
9703 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9704 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9706 if (speed)
9708 if (compound_p)
9709 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9710 cost += extra_cost->mult[0].extend_add;
9711 else
9712 /* MUL/SMULL/UMULL. */
9713 cost += extra_cost->mult[0].extend;
9716 return cost;
9719 /* This is either an integer multiply or a MADD. In both cases
9720 we want to recurse and cost the operands. */
9721 cost += rtx_cost (op0, mode, MULT, 0, speed);
9722 cost += rtx_cost (op1, mode, MULT, 1, speed);
9724 if (speed)
9726 if (compound_p)
9727 /* MADD/MSUB. */
9728 cost += extra_cost->mult[mode == DImode].add;
9729 else
9730 /* MUL. */
9731 cost += extra_cost->mult[mode == DImode].simple;
9734 return cost;
9736 else
9738 if (speed)
9740 /* Floating-point FMA/FMUL can also support negations of the
9741 operands, unless the rounding mode is upward or downward, in
9742 which case FNMUL differs from FMUL with operand negation. */
9743 bool neg0 = GET_CODE (op0) == NEG;
9744 bool neg1 = GET_CODE (op1) == NEG;
9745 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9747 if (neg0)
9748 op0 = XEXP (op0, 0);
9749 if (neg1)
9750 op1 = XEXP (op1, 0);
9753 if (compound_p)
9754 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9755 cost += extra_cost->fp[mode == DFmode].fma;
9756 else
9757 /* FMUL/FNMUL. */
9758 cost += extra_cost->fp[mode == DFmode].mult;
9761 cost += rtx_cost (op0, mode, MULT, 0, speed);
9762 cost += rtx_cost (op1, mode, MULT, 1, speed);
9763 return cost;
9767 static int
9768 aarch64_address_cost (rtx x,
9769 machine_mode mode,
9770 addr_space_t as ATTRIBUTE_UNUSED,
9771 bool speed)
9773 enum rtx_code c = GET_CODE (x);
9774 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9775 struct aarch64_address_info info;
9776 int cost = 0;
9777 info.shift = 0;
9779 if (!aarch64_classify_address (&info, x, mode, false))
9781 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9783 /* This is a CONST or SYMBOL ref which will be split
9784 in a different way depending on the code model in use.
9785 Cost it through the generic infrastructure. */
9786 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9787 /* Divide through by the cost of one instruction to
9788 bring it to the same units as the address costs. */
9789 cost_symbol_ref /= COSTS_N_INSNS (1);
9790 /* The cost is then the cost of preparing the address,
9791 followed by an immediate (possibly 0) offset. */
9792 return cost_symbol_ref + addr_cost->imm_offset;
9794 else
9796 /* This is most likely a jump table from a case
9797 statement. */
9798 return addr_cost->register_offset;
9802 switch (info.type)
9804 case ADDRESS_LO_SUM:
9805 case ADDRESS_SYMBOLIC:
9806 case ADDRESS_REG_IMM:
9807 cost += addr_cost->imm_offset;
9808 break;
9810 case ADDRESS_REG_WB:
9811 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9812 cost += addr_cost->pre_modify;
9813 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9814 cost += addr_cost->post_modify;
9815 else
9816 gcc_unreachable ();
9818 break;
9820 case ADDRESS_REG_REG:
9821 cost += addr_cost->register_offset;
9822 break;
9824 case ADDRESS_REG_SXTW:
9825 cost += addr_cost->register_sextend;
9826 break;
9828 case ADDRESS_REG_UXTW:
9829 cost += addr_cost->register_zextend;
9830 break;
9832 default:
9833 gcc_unreachable ();
9837 if (info.shift > 0)
9839 /* For the sake of calculating the cost of the shifted register
9840 component, we can treat same sized modes in the same way. */
9841 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9842 cost += addr_cost->addr_scale_costs.hi;
9843 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9844 cost += addr_cost->addr_scale_costs.si;
9845 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9846 cost += addr_cost->addr_scale_costs.di;
9847 else
9848 /* We can't tell, or this is a 128-bit vector. */
9849 cost += addr_cost->addr_scale_costs.ti;
9852 return cost;
9855 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9856 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9857 to be taken. */
9859 int
9860 aarch64_branch_cost (bool speed_p, bool predictable_p)
9862 /* When optimizing for speed, use the cost of unpredictable branches. */
9863 const struct cpu_branch_cost *branch_costs =
9864 aarch64_tune_params.branch_costs;
9866 if (!speed_p || predictable_p)
9867 return branch_costs->predictable;
9868 else
9869 return branch_costs->unpredictable;
9872 /* Return true if the RTX X in mode MODE is a zero or sign extract
9873 usable in an ADD or SUB (extended register) instruction. */
9874 static bool
9875 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9877 /* Catch add with a sign extract.
9878 This is add_<optab><mode>_multp2. */
9879 if (GET_CODE (x) == SIGN_EXTRACT
9880 || GET_CODE (x) == ZERO_EXTRACT)
9882 rtx op0 = XEXP (x, 0);
9883 rtx op1 = XEXP (x, 1);
9884 rtx op2 = XEXP (x, 2);
9886 if (GET_CODE (op0) == MULT
9887 && CONST_INT_P (op1)
9888 && op2 == const0_rtx
9889 && CONST_INT_P (XEXP (op0, 1))
9890 && aarch64_is_extend_from_extract (mode,
9891 XEXP (op0, 1),
9892 op1))
9894 return true;
9897 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9898 No shift. */
9899 else if (GET_CODE (x) == SIGN_EXTEND
9900 || GET_CODE (x) == ZERO_EXTEND)
9901 return REG_P (XEXP (x, 0));
9903 return false;
9906 static bool
9907 aarch64_frint_unspec_p (unsigned int u)
9909 switch (u)
9911 case UNSPEC_FRINTZ:
9912 case UNSPEC_FRINTP:
9913 case UNSPEC_FRINTM:
9914 case UNSPEC_FRINTA:
9915 case UNSPEC_FRINTN:
9916 case UNSPEC_FRINTX:
9917 case UNSPEC_FRINTI:
9918 return true;
9920 default:
9921 return false;
9925 /* Return true iff X is an rtx that will match an extr instruction
9926 i.e. as described in the *extr<mode>5_insn family of patterns.
9927 OP0 and OP1 will be set to the operands of the shifts involved
9928 on success and will be NULL_RTX otherwise. */
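/* For example,
(ior:DI (ashift:DI (reg:DI x1) (const_int 16))
(lshiftrt:DI (reg:DI x2) (const_int 48)))
matches, since the two shift amounts sum to 64, and corresponds to
"extr x0, x1, x2, #48". */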
9930 static bool
9931 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9933 rtx op0, op1;
9934 scalar_int_mode mode;
9935 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9936 return false;
9938 *res_op0 = NULL_RTX;
9939 *res_op1 = NULL_RTX;
9941 if (GET_CODE (x) != IOR)
9942 return false;
9944 op0 = XEXP (x, 0);
9945 op1 = XEXP (x, 1);
9947 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9948 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9950 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9951 if (GET_CODE (op1) == ASHIFT)
9952 std::swap (op0, op1);
9954 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9955 return false;
9957 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9958 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9960 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9961 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9963 *res_op0 = XEXP (op0, 0);
9964 *res_op1 = XEXP (op1, 0);
9965 return true;
9969 return false;
9972 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9973 storing it in *COST. Result is true if the total cost of the operation
9974 has now been calculated. */
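/* For example, a branch such as
(if_then_else (ne (reg:SI w0) (const_int 0)) (label_ref ...) (pc))
is costed as a CBNZ, while
(if_then_else (ne (reg:CC CC_REGNUM) (const_int 0)) (reg:DI x1) (reg:DI x2))
is costed as a CSEL on already-computed flags. */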
9975 static bool
9976 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9978 rtx inner;
9979 rtx comparator;
9980 enum rtx_code cmpcode;
9982 if (COMPARISON_P (op0))
9984 inner = XEXP (op0, 0);
9985 comparator = XEXP (op0, 1);
9986 cmpcode = GET_CODE (op0);
9988 else
9990 inner = op0;
9991 comparator = const0_rtx;
9992 cmpcode = NE;
9995 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9997 /* Conditional branch. */
9998 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9999 return true;
10000 else
10002 if (cmpcode == NE || cmpcode == EQ)
10004 if (comparator == const0_rtx)
10006 /* TBZ/TBNZ/CBZ/CBNZ. */
10007 if (GET_CODE (inner) == ZERO_EXTRACT)
10008 /* TBZ/TBNZ. */
10009 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10010 ZERO_EXTRACT, 0, speed);
10011 else
10012 /* CBZ/CBNZ. */
10013 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10015 return true;
10018 else if (cmpcode == LT || cmpcode == GE)
10020 /* TBZ/TBNZ. */
10021 if (comparator == const0_rtx)
10022 return true;
10026 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10028 /* CCMP. */
10029 if (GET_CODE (op1) == COMPARE)
10031 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10032 if (XEXP (op1, 1) == const0_rtx)
10033 *cost += 1;
10034 if (speed)
10036 machine_mode mode = GET_MODE (XEXP (op1, 0));
10037 const struct cpu_cost_table *extra_cost
10038 = aarch64_tune_params.insn_extra_cost;
10040 if (GET_MODE_CLASS (mode) == MODE_INT)
10041 *cost += extra_cost->alu.arith;
10042 else
10043 *cost += extra_cost->fp[mode == DFmode].compare;
10045 return true;
10048 /* It's a conditional operation based on the status flags,
10049 so it must be some flavor of CSEL. */
10051 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10052 if (GET_CODE (op1) == NEG
10053 || GET_CODE (op1) == NOT
10054 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10055 op1 = XEXP (op1, 0);
10056 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10058 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10059 op1 = XEXP (op1, 0);
10060 op2 = XEXP (op2, 0);
10063 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10064 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10065 return true;
10068 /* We don't know what this is, cost all operands. */
10069 return false;
10072 /* Check whether X is a bitfield operation of the form shift + extend that
10073 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10074 operand to which the bitfield operation is applied. Otherwise return
10075 NULL_RTX. */
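/* For example, (zero_extend:DI (lshiftrt:HI (reg:HI w0) (const_int 3)))
maps to a UBFX and returns (reg:HI w0), while
(sign_extend:SI (ashift:QI (reg:QI w1) (const_int 2))) maps to an
SBFIZ. */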
10077 static rtx
10078 aarch64_extend_bitfield_pattern_p (rtx x)
10080 rtx_code outer_code = GET_CODE (x);
10081 machine_mode outer_mode = GET_MODE (x);
10083 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10084 && outer_mode != SImode && outer_mode != DImode)
10085 return NULL_RTX;
10087 rtx inner = XEXP (x, 0);
10088 rtx_code inner_code = GET_CODE (inner);
10089 machine_mode inner_mode = GET_MODE (inner);
10090 rtx op = NULL_RTX;
10092 switch (inner_code)
10094 case ASHIFT:
10095 if (CONST_INT_P (XEXP (inner, 1))
10096 && (inner_mode == QImode || inner_mode == HImode))
10097 op = XEXP (inner, 0);
10098 break;
10099 case LSHIFTRT:
10100 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10101 && (inner_mode == QImode || inner_mode == HImode))
10102 op = XEXP (inner, 0);
10103 break;
10104 case ASHIFTRT:
10105 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10106 && (inner_mode == QImode || inner_mode == HImode))
10107 op = XEXP (inner, 0);
10108 break;
10109 default:
10110 break;
10113 return op;
10116 /* Return true if the mask and a shift amount from an RTX of the form
10117 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10118 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
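/* For example, in SImode a shift amount of 4 and a mask of 0xff0 pass
this test, and (x << 4) & 0xff0 can then be emitted as
"ubfiz w0, w1, #4, #8". */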
10120 bool
10121 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10122 rtx shft_amnt)
10124 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10125 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10126 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10127 && (INTVAL (mask)
10128 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10131 /* Return true if the masks and a shift amount from an RTX of the form
10132 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10133 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
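/* For example, in DImode with MASK1 == 0xfffffffffffff00f, SHIFT_AMNT == 4
and MASK2 == 0xff0 the combination is valid and corresponds to
"bfi x0, x1, #4, #8". */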
10135 bool
10136 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10137 unsigned HOST_WIDE_INT mask1,
10138 unsigned HOST_WIDE_INT shft_amnt,
10139 unsigned HOST_WIDE_INT mask2)
10141 unsigned HOST_WIDE_INT t;
10143 /* Verify that there is no overlap in what bits are set in the two masks. */
10144 if (mask1 != ~mask2)
10145 return false;
10147 /* Verify that mask2 is not all zeros or ones. */
10148 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10149 return false;
10151 /* The shift amount should always be less than the mode size. */
10152 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10154 /* Verify that the mask being shifted is contiguous and would be in the
10155 least significant bits after shifting by shft_amnt. */
10156 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10157 return (t == (t & -t));
10160 /* Calculate the cost of calculating X, storing it in *COST. Result
10161 is true if the total cost of the operation has now been calculated. */
10162 static bool
10163 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10164 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10166 rtx op0, op1, op2;
10167 const struct cpu_cost_table *extra_cost
10168 = aarch64_tune_params.insn_extra_cost;
10169 int code = GET_CODE (x);
10170 scalar_int_mode int_mode;
10172 /* By default, assume that everything has equivalent cost to the
10173 cheapest instruction. Any additional costs are applied as a delta
10174 above this default. */
10175 *cost = COSTS_N_INSNS (1);
10177 switch (code)
10179 case SET:
10180 /* The cost depends entirely on the operands to SET. */
10181 *cost = 0;
10182 op0 = SET_DEST (x);
10183 op1 = SET_SRC (x);
10185 switch (GET_CODE (op0))
10187 case MEM:
10188 if (speed)
10190 rtx address = XEXP (op0, 0);
10191 if (VECTOR_MODE_P (mode))
10192 *cost += extra_cost->ldst.storev;
10193 else if (GET_MODE_CLASS (mode) == MODE_INT)
10194 *cost += extra_cost->ldst.store;
10195 else if (mode == SFmode)
10196 *cost += extra_cost->ldst.storef;
10197 else if (mode == DFmode)
10198 *cost += extra_cost->ldst.stored;
10200 *cost +=
10201 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10202 0, speed));
10205 *cost += rtx_cost (op1, mode, SET, 1, speed);
10206 return true;
10208 case SUBREG:
10209 if (! REG_P (SUBREG_REG (op0)))
10210 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10212 /* Fall through. */
10213 case REG:
10214 /* The cost is one per vector-register copied. */
10215 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10217 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10218 *cost = COSTS_N_INSNS (nregs);
10220 /* const0_rtx is in general free, but we will use an
10221 instruction to set a register to 0. */
10222 else if (REG_P (op1) || op1 == const0_rtx)
10224 /* The cost is 1 per register copied. */
10225 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10226 *cost = COSTS_N_INSNS (nregs);
10228 else
10229 /* Cost is just the cost of the RHS of the set. */
10230 *cost += rtx_cost (op1, mode, SET, 1, speed);
10231 return true;
10233 case ZERO_EXTRACT:
10234 case SIGN_EXTRACT:
10235 /* Bit-field insertion. Strip any redundant widening of
10236 the RHS to meet the width of the target. */
10237 if (GET_CODE (op1) == SUBREG)
10238 op1 = SUBREG_REG (op1);
10239 if ((GET_CODE (op1) == ZERO_EXTEND
10240 || GET_CODE (op1) == SIGN_EXTEND)
10241 && CONST_INT_P (XEXP (op0, 1))
10242 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10243 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10244 op1 = XEXP (op1, 0);
10246 if (CONST_INT_P (op1))
10248 /* MOV immediate is assumed to always be cheap. */
10249 *cost = COSTS_N_INSNS (1);
10251 else
10253 /* BFM. */
10254 if (speed)
10255 *cost += extra_cost->alu.bfi;
10256 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10259 return true;
10261 default:
10262 /* We can't make sense of this, assume default cost. */
10263 *cost = COSTS_N_INSNS (1);
10264 return false;
10266 return false;
10268 case CONST_INT:
10269 /* If an instruction can incorporate a constant within the
10270 instruction, the instruction's expression avoids calling
10271 rtx_cost() on the constant. If rtx_cost() is called on a
10272 constant, then it is usually because the constant must be
10273 moved into a register by one or more instructions.
10275 The exception is constant 0, which can be expressed
10276 as XZR/WZR and is therefore free. The exception to that, in turn,
10277 is (set (reg) (const0_rtx)), in which case we must cost
10278 the move. However, we can catch that when we cost the SET, so
10279 we don't need to consider it here. */
10280 if (x == const0_rtx)
10281 *cost = 0;
10282 else
10284 /* To an approximation, building any other constant is
10285 proportionally expensive to the number of instructions
10286 required to build that constant. This is true whether we
10287 are compiling for SPEED or otherwise. */
10288 if (!is_a <scalar_int_mode> (mode, &int_mode))
10289 int_mode = word_mode;
10290 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10291 (NULL_RTX, x, false, int_mode));
10293 return true;
10295 case CONST_DOUBLE:
10297 /* First determine number of instructions to do the move
10298 as an integer constant. */
10299 if (!aarch64_float_const_representable_p (x)
10300 && !aarch64_can_const_movi_rtx_p (x, mode)
10301 && aarch64_float_const_rtx_p (x))
10303 unsigned HOST_WIDE_INT ival;
10304 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10305 gcc_assert (succeed);
10307 scalar_int_mode imode = (mode == HFmode
10308 ? SImode
10309 : int_mode_for_mode (mode).require ());
10310 int ncost = aarch64_internal_mov_immediate
10311 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10312 *cost += COSTS_N_INSNS (ncost);
10313 return true;
10316 if (speed)
10318 /* mov[df,sf]_aarch64. */
10319 if (aarch64_float_const_representable_p (x))
10320 /* FMOV (scalar immediate). */
10321 *cost += extra_cost->fp[mode == DFmode].fpconst;
10322 else if (!aarch64_float_const_zero_rtx_p (x))
10324 /* This will be a load from memory. */
10325 if (mode == DFmode)
10326 *cost += extra_cost->ldst.loadd;
10327 else
10328 *cost += extra_cost->ldst.loadf;
10330 else
10331 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10332 or MOV v0.s[0], wzr - neither of which is modeled by the
10333 cost tables. Just use the default cost. */
10338 return true;
10340 case MEM:
10341 if (speed)
10343 /* For loads we want the base cost of a load, plus an
10344 approximation for the additional cost of the addressing
10345 mode. */
10346 rtx address = XEXP (x, 0);
10347 if (VECTOR_MODE_P (mode))
10348 *cost += extra_cost->ldst.loadv;
10349 else if (GET_MODE_CLASS (mode) == MODE_INT)
10350 *cost += extra_cost->ldst.load;
10351 else if (mode == SFmode)
10352 *cost += extra_cost->ldst.loadf;
10353 else if (mode == DFmode)
10354 *cost += extra_cost->ldst.loadd;
10356 *cost +=
10357 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10358 0, speed));
10361 return true;
10363 case NEG:
10364 op0 = XEXP (x, 0);
10366 if (VECTOR_MODE_P (mode))
10368 if (speed)
10370 /* FNEG. */
10371 *cost += extra_cost->vect.alu;
10373 return false;
10376 if (GET_MODE_CLASS (mode) == MODE_INT)
10378 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10379 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10381 /* CSETM. */
10382 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10383 return true;
10386 /* Cost this as SUB wzr, X. */
10387 op0 = CONST0_RTX (mode);
10388 op1 = XEXP (x, 0);
10389 goto cost_minus;
10392 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10394 /* Support (neg(fma...)) as a single instruction only if
10395 sign of zeros is unimportant. This matches the decision
10396 making in aarch64.md. */
10397 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10399 /* FNMADD. */
10400 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10401 return true;
10403 if (GET_CODE (op0) == MULT)
10405 /* FNMUL. */
10406 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10407 return true;
10409 if (speed)
10410 /* FNEG. */
10411 *cost += extra_cost->fp[mode == DFmode].neg;
10412 return false;
10415 return false;
10417 case CLRSB:
10418 case CLZ:
10419 if (speed)
10421 if (VECTOR_MODE_P (mode))
10422 *cost += extra_cost->vect.alu;
10423 else
10424 *cost += extra_cost->alu.clz;
10427 return false;
10429 case COMPARE:
10430 op0 = XEXP (x, 0);
10431 op1 = XEXP (x, 1);
10433 if (op1 == const0_rtx
10434 && GET_CODE (op0) == AND)
10436 x = op0;
10437 mode = GET_MODE (op0);
10438 goto cost_logic;
10441 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10443 /* TODO: A write to the CC flags possibly costs extra, this
10444 needs encoding in the cost tables. */
10446 mode = GET_MODE (op0);
10447 /* ANDS. */
10448 if (GET_CODE (op0) == AND)
10450 x = op0;
10451 goto cost_logic;
10454 if (GET_CODE (op0) == PLUS)
10456 /* ADDS (and CMN alias). */
10457 x = op0;
10458 goto cost_plus;
10461 if (GET_CODE (op0) == MINUS)
10463 /* SUBS. */
10464 x = op0;
10465 goto cost_minus;
10468 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10469 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10470 && CONST_INT_P (XEXP (op0, 2)))
10472 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10473 Handle it here directly rather than going to cost_logic
10474 since we know the immediate generated for the TST is valid
10475 so we can avoid creating an intermediate rtx for it only
10476 for costing purposes. */
10477 if (speed)
10478 *cost += extra_cost->alu.logical;
10480 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10481 ZERO_EXTRACT, 0, speed);
10482 return true;
10485 if (GET_CODE (op1) == NEG)
10487 /* CMN. */
10488 if (speed)
10489 *cost += extra_cost->alu.arith;
10491 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10492 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10493 return true;
10496 /* CMP.
10498 Compare can freely swap the order of operands, and
10499 canonicalization puts the more complex operation first.
10500 But the integer MINUS logic expects the shift/extend
10501 operation in op1. */
10502 if (! (REG_P (op0)
10503 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10505 op0 = XEXP (x, 1);
10506 op1 = XEXP (x, 0);
10508 goto cost_minus;
10511 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10513 /* FCMP. */
10514 if (speed)
10515 *cost += extra_cost->fp[mode == DFmode].compare;
10517 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10519 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10520 /* FCMP supports constant 0.0 for no extra cost. */
10521 return true;
10523 return false;
10526 if (VECTOR_MODE_P (mode))
10528 /* Vector compare. */
10529 if (speed)
10530 *cost += extra_cost->vect.alu;
10532 if (aarch64_float_const_zero_rtx_p (op1))
10534 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10535 cost. */
10536 return true;
10538 return false;
10540 return false;
10542 case MINUS:
10544 op0 = XEXP (x, 0);
10545 op1 = XEXP (x, 1);
10547 cost_minus:
10548 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10550 /* Detect valid immediates. */
10551 if ((GET_MODE_CLASS (mode) == MODE_INT
10552 || (GET_MODE_CLASS (mode) == MODE_CC
10553 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10554 && CONST_INT_P (op1)
10555 && aarch64_uimm12_shift (INTVAL (op1)))
10557 if (speed)
10558 /* SUB(S) (immediate). */
10559 *cost += extra_cost->alu.arith;
10560 return true;
10563 /* Look for SUB (extended register). */
10564 if (is_a <scalar_int_mode> (mode, &int_mode)
10565 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10567 if (speed)
10568 *cost += extra_cost->alu.extend_arith;
10570 op1 = aarch64_strip_extend (op1, true);
10571 *cost += rtx_cost (op1, VOIDmode,
10572 (enum rtx_code) GET_CODE (op1), 0, speed);
10573 return true;
10576 rtx new_op1 = aarch64_strip_extend (op1, false);
10578 /* Cost this as an FMA-alike operation. */
10579 if ((GET_CODE (new_op1) == MULT
10580 || aarch64_shift_p (GET_CODE (new_op1)))
10581 && code != COMPARE)
10583 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10584 (enum rtx_code) code,
10585 speed);
10586 return true;
10589 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10591 if (speed)
10593 if (VECTOR_MODE_P (mode))
10595 /* Vector SUB. */
10596 *cost += extra_cost->vect.alu;
10598 else if (GET_MODE_CLASS (mode) == MODE_INT)
10600 /* SUB(S). */
10601 *cost += extra_cost->alu.arith;
10603 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10605 /* FSUB. */
10606 *cost += extra_cost->fp[mode == DFmode].addsub;
10609 return true;
10612 case PLUS:
10614 rtx new_op0;
10616 op0 = XEXP (x, 0);
10617 op1 = XEXP (x, 1);
10619 cost_plus:
10620 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10621 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10623 /* CSINC. */
10624 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10625 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10626 return true;
10629 if (GET_MODE_CLASS (mode) == MODE_INT
10630 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10631 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10633 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10635 if (speed)
10636 /* ADD (immediate). */
10637 *cost += extra_cost->alu.arith;
10638 return true;
10641 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10643 /* Look for ADD (extended register). */
10644 if (is_a <scalar_int_mode> (mode, &int_mode)
10645 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10647 if (speed)
10648 *cost += extra_cost->alu.extend_arith;
10650 op0 = aarch64_strip_extend (op0, true);
10651 *cost += rtx_cost (op0, VOIDmode,
10652 (enum rtx_code) GET_CODE (op0), 0, speed);
10653 return true;
10656 /* Strip any extend, leave shifts behind as we will
10657 cost them through mult_cost. */
10658 new_op0 = aarch64_strip_extend (op0, false);
10660 if (GET_CODE (new_op0) == MULT
10661 || aarch64_shift_p (GET_CODE (new_op0)))
10663 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10664 speed);
10665 return true;
10668 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10670 if (speed)
10672 if (VECTOR_MODE_P (mode))
10674 /* Vector ADD. */
10675 *cost += extra_cost->vect.alu;
10677 else if (GET_MODE_CLASS (mode) == MODE_INT)
10679 /* ADD. */
10680 *cost += extra_cost->alu.arith;
10682 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10684 /* FADD. */
10685 *cost += extra_cost->fp[mode == DFmode].addsub;
10688 return true;
10691 case BSWAP:
10692 *cost = COSTS_N_INSNS (1);
10694 if (speed)
10696 if (VECTOR_MODE_P (mode))
10697 *cost += extra_cost->vect.alu;
10698 else
10699 *cost += extra_cost->alu.rev;
10701 return false;
10703 case IOR:
10704 if (aarch_rev16_p (x))
10706 *cost = COSTS_N_INSNS (1);
10708 if (speed)
10710 if (VECTOR_MODE_P (mode))
10711 *cost += extra_cost->vect.alu;
10712 else
10713 *cost += extra_cost->alu.rev;
10715 return true;
10718 if (aarch64_extr_rtx_p (x, &op0, &op1))
10720 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10721 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10722 if (speed)
10723 *cost += extra_cost->alu.shift;
10725 return true;
10727 /* Fall through. */
10728 case XOR:
10729 case AND:
10730 cost_logic:
10731 op0 = XEXP (x, 0);
10732 op1 = XEXP (x, 1);
10734 if (VECTOR_MODE_P (mode))
10736 if (speed)
10737 *cost += extra_cost->vect.alu;
10738 return true;
10741 if (code == AND
10742 && GET_CODE (op0) == MULT
10743 && CONST_INT_P (XEXP (op0, 1))
10744 && CONST_INT_P (op1)
10745 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10746 INTVAL (op1)) != 0)
10748 /* This is a UBFM/SBFM. */
10749 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10750 if (speed)
10751 *cost += extra_cost->alu.bfx;
10752 return true;
10755 if (is_int_mode (mode, &int_mode))
10757 if (CONST_INT_P (op1))
10759 /* We have a mask + shift version of a UBFIZ
10760 i.e. the *andim_ashift<mode>_bfiz pattern. */
10761 if (GET_CODE (op0) == ASHIFT
10762 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10763 XEXP (op0, 1)))
10765 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10766 (enum rtx_code) code, 0, speed);
10767 if (speed)
10768 *cost += extra_cost->alu.bfx;
10770 return true;
10772 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10774 /* We possibly get the immediate for free; this is not
10775 modelled. */
10776 *cost += rtx_cost (op0, int_mode,
10777 (enum rtx_code) code, 0, speed);
10778 if (speed)
10779 *cost += extra_cost->alu.logical;
10781 return true;
10784 else
10786 rtx new_op0 = op0;
10788 /* Handle ORN, EON, or BIC. */
10789 if (GET_CODE (op0) == NOT)
10790 op0 = XEXP (op0, 0);
10792 new_op0 = aarch64_strip_shift (op0);
10794 /* If we had a shift on op0 then this is a logical-shift-
10795 by-register/immediate operation. Otherwise, this is just
10796 a logical operation. */
10797 if (speed)
10799 if (new_op0 != op0)
10801 /* Shift by immediate. */
10802 if (CONST_INT_P (XEXP (op0, 1)))
10803 *cost += extra_cost->alu.log_shift;
10804 else
10805 *cost += extra_cost->alu.log_shift_reg;
10807 else
10808 *cost += extra_cost->alu.logical;
10811 /* In both cases we want to cost both operands. */
10812 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10813 0, speed);
10814 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10815 1, speed);
10817 return true;
10820 return false;
10822 case NOT:
10823 x = XEXP (x, 0);
10824 op0 = aarch64_strip_shift (x);
10826 if (VECTOR_MODE_P (mode))
10828 /* Vector NOT. */
10829 *cost += extra_cost->vect.alu;
10830 return false;
10833 /* MVN-shifted-reg. */
10834 if (op0 != x)
10836 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10838 if (speed)
10839 *cost += extra_cost->alu.log_shift;
10841 return true;
10843 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10844 Handle the second form here, taking care that 'a' above can
10845 be a shift. */
10846 else if (GET_CODE (op0) == XOR)
10848 rtx newop0 = XEXP (op0, 0);
10849 rtx newop1 = XEXP (op0, 1);
10850 rtx op0_stripped = aarch64_strip_shift (newop0);
10852 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10853 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10855 if (speed)
10857 if (op0_stripped != newop0)
10858 *cost += extra_cost->alu.log_shift;
10859 else
10860 *cost += extra_cost->alu.logical;
10863 return true;
10865 /* MVN. */
10866 if (speed)
10867 *cost += extra_cost->alu.logical;
10869 return false;
10871 case ZERO_EXTEND:
10873 op0 = XEXP (x, 0);
10874 /* If a value is written in SImode and then zero-extended to DImode,
10875 the operation will in general be free, as a write to a 'w'
10876 register implicitly zeroes the upper bits of the corresponding 'x'
10877 register. However, if this is
10879 (set (reg) (zero_extend (reg)))
10881 we must cost the explicit register move. */
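/* For example, (set (reg:DI x0) (zero_extend:DI (plus:SI (reg:SI w1)
(reg:SI w2)))) costs no more than the SImode addition itself, because
"add w0, w1, w2" already clears bits 63:32 of x0. */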
10882 if (mode == DImode
10883 && GET_MODE (op0) == SImode
10884 && outer == SET)
10886 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10888 /* If OP_COST is non-zero, then the cost of the zero extend
10889 is effectively the cost of the inner operation. Otherwise
10890 we have a MOV instruction and we take the cost from the MOV
10891 itself. This is true independently of whether we are
10892 optimizing for space or time. */
10893 if (op_cost)
10894 *cost = op_cost;
10896 return true;
10898 else if (MEM_P (op0))
10900 /* All loads can zero extend to any size for free. */
10901 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10902 return true;
10905 op0 = aarch64_extend_bitfield_pattern_p (x);
10906 if (op0)
10908 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10909 if (speed)
10910 *cost += extra_cost->alu.bfx;
10911 return true;
10914 if (speed)
10916 if (VECTOR_MODE_P (mode))
10918 /* UMOV. */
10919 *cost += extra_cost->vect.alu;
10921 else
10923 /* We generate an AND instead of UXTB/UXTH. */
10924 *cost += extra_cost->alu.logical;
10927 return false;
10929 case SIGN_EXTEND:
10930 if (MEM_P (XEXP (x, 0)))
10932 /* LDRSH. */
10933 if (speed)
10935 rtx address = XEXP (XEXP (x, 0), 0);
10936 *cost += extra_cost->ldst.load_sign_extend;
10938 *cost +=
10939 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10940 0, speed));
10942 return true;
10945 op0 = aarch64_extend_bitfield_pattern_p (x);
10946 if (op0)
10948 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10949 if (speed)
10950 *cost += extra_cost->alu.bfx;
10951 return true;
10954 if (speed)
10956 if (VECTOR_MODE_P (mode))
10957 *cost += extra_cost->vect.alu;
10958 else
10959 *cost += extra_cost->alu.extend;
10961 return false;
10963 case ASHIFT:
10964 op0 = XEXP (x, 0);
10965 op1 = XEXP (x, 1);
10967 if (CONST_INT_P (op1))
10969 if (speed)
10971 if (VECTOR_MODE_P (mode))
10973 /* Vector shift (immediate). */
10974 *cost += extra_cost->vect.alu;
10976 else
10978 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10979 aliases. */
10980 *cost += extra_cost->alu.shift;
10984 /* We can incorporate zero/sign extend for free. */
10985 if (GET_CODE (op0) == ZERO_EXTEND
10986 || GET_CODE (op0) == SIGN_EXTEND)
10987 op0 = XEXP (op0, 0);
10989 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10990 return true;
10992 else
10994 if (VECTOR_MODE_P (mode))
10996 if (speed)
10997 /* Vector shift (register). */
10998 *cost += extra_cost->vect.alu;
11000 else
11002 if (speed)
11003 /* LSLV. */
11004 *cost += extra_cost->alu.shift_reg;
11006 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11007 && CONST_INT_P (XEXP (op1, 1))
11008 && known_eq (INTVAL (XEXP (op1, 1)),
11009 GET_MODE_BITSIZE (mode) - 1))
11011 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11012 /* We already demanded XEXP (op1, 0) to be REG_P, so
11013 don't recurse into it. */
11014 return true;
11017 return false; /* All arguments need to be in registers. */
11020 case ROTATE:
11021 case ROTATERT:
11022 case LSHIFTRT:
11023 case ASHIFTRT:
11024 op0 = XEXP (x, 0);
11025 op1 = XEXP (x, 1);
11027 if (CONST_INT_P (op1))
11029 /* ASR (immediate) and friends. */
11030 if (speed)
11032 if (VECTOR_MODE_P (mode))
11033 *cost += extra_cost->vect.alu;
11034 else
11035 *cost += extra_cost->alu.shift;
11038 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11039 return true;
11041 else
11043 if (VECTOR_MODE_P (mode))
11045 if (speed)
11046 /* Vector shift (register). */
11047 *cost += extra_cost->vect.alu;
11049 else
11051 if (speed)
11052 /* ASR (register) and friends. */
11053 *cost += extra_cost->alu.shift_reg;
11055 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11056 && CONST_INT_P (XEXP (op1, 1))
11057 && known_eq (INTVAL (XEXP (op1, 1)),
11058 GET_MODE_BITSIZE (mode) - 1))
11060 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11061 /* We already demanded XEXP (op1, 0) to be REG_P, so
11062 don't recurse into it. */
11063 return true;
11066 return false; /* All arguments need to be in registers. */
11069 case SYMBOL_REF:
11071 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11072 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11074 /* LDR. */
11075 if (speed)
11076 *cost += extra_cost->ldst.load;
11078 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11079 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11081 /* ADRP, followed by ADD. */
11082 *cost += COSTS_N_INSNS (1);
11083 if (speed)
11084 *cost += 2 * extra_cost->alu.arith;
11086 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11087 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11089 /* ADR. */
11090 if (speed)
11091 *cost += extra_cost->alu.arith;
11094 if (flag_pic)
11096 /* One extra load instruction, after accessing the GOT. */
11097 *cost += COSTS_N_INSNS (1);
11098 if (speed)
11099 *cost += extra_cost->ldst.load;
11101 return true;
11103 case HIGH:
11104 case LO_SUM:
11105 /* ADRP/ADD (immediate). */
11106 if (speed)
11107 *cost += extra_cost->alu.arith;
11108 return true;
11110 case ZERO_EXTRACT:
11111 case SIGN_EXTRACT:
11112 /* UBFX/SBFX. */
11113 if (speed)
11115 if (VECTOR_MODE_P (mode))
11116 *cost += extra_cost->vect.alu;
11117 else
11118 *cost += extra_cost->alu.bfx;
11121 /* We can trust that the immediates used will be correct (there
11122 are no by-register forms), so we need only cost op0. */
11123 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11124 return true;
11126 case MULT:
11127 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11128 /* aarch64_rtx_mult_cost always handles recursion to its
11129 operands. */
11130 return true;
11132 case MOD:
11133 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
11134 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
11135 that of an unconditional negate. This case should only ever be reached
11136 through the set_smod_pow2_cheap check in expmed.c. */
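/* For example, x % 4 in SImode expands to a sequence along the lines of
negs w1, w0
and w0, w0, #3
and w1, w1, #3
csneg w0, w0, w1, mi
hence the COSTS_N_INSNS (4) baseline below. */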
11137 if (CONST_INT_P (XEXP (x, 1))
11138 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11139 && (mode == SImode || mode == DImode))
11141 /* We expand to 4 instructions. Reset the baseline. */
11142 *cost = COSTS_N_INSNS (4);
11144 if (speed)
11145 *cost += 2 * extra_cost->alu.logical
11146 + 2 * extra_cost->alu.arith;
11148 return true;
11151 /* Fall-through. */
11152 case UMOD:
11153 if (speed)
11155 /* Slightly prefer UMOD over SMOD. */
11156 if (VECTOR_MODE_P (mode))
11157 *cost += extra_cost->vect.alu;
11158 else if (GET_MODE_CLASS (mode) == MODE_INT)
11159 *cost += (extra_cost->mult[mode == DImode].add
11160 + extra_cost->mult[mode == DImode].idiv
11161 + (code == MOD ? 1 : 0));
11163 return false; /* All arguments need to be in registers. */
11165 case DIV:
11166 case UDIV:
11167 case SQRT:
11168 if (speed)
11170 if (VECTOR_MODE_P (mode))
11171 *cost += extra_cost->vect.alu;
11172 else if (GET_MODE_CLASS (mode) == MODE_INT)
11173 /* There is no integer SQRT, so only DIV and UDIV can get
11174 here. */
11175 *cost += (extra_cost->mult[mode == DImode].idiv
11176 /* Slightly prefer UDIV over SDIV. */
11177 + (code == DIV ? 1 : 0));
11178 else
11179 *cost += extra_cost->fp[mode == DFmode].div;
11181 return false; /* All arguments need to be in registers. */
11183 case IF_THEN_ELSE:
11184 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11185 XEXP (x, 2), cost, speed);
11187 case EQ:
11188 case NE:
11189 case GT:
11190 case GTU:
11191 case LT:
11192 case LTU:
11193 case GE:
11194 case GEU:
11195 case LE:
11196 case LEU:
11198 return false; /* All arguments must be in registers. */
11200 case FMA:
11201 op0 = XEXP (x, 0);
11202 op1 = XEXP (x, 1);
11203 op2 = XEXP (x, 2);
11205 if (speed)
11207 if (VECTOR_MODE_P (mode))
11208 *cost += extra_cost->vect.alu;
11209 else
11210 *cost += extra_cost->fp[mode == DFmode].fma;
11213 /* FMSUB, FNMADD, and FNMSUB are free. */
11214 if (GET_CODE (op0) == NEG)
11215 op0 = XEXP (op0, 0);
11217 if (GET_CODE (op2) == NEG)
11218 op2 = XEXP (op2, 0);
11220 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11221 and the by-element operand as operand 0. */
11222 if (GET_CODE (op1) == NEG)
11223 op1 = XEXP (op1, 0);
11225 /* Catch vector-by-element operations. The by-element operand can
11226 either be (vec_duplicate (vec_select (x))) or just
11227 (vec_select (x)), depending on whether we are multiplying by
11228 a vector or a scalar.
11230 Canonicalization is not very good in these cases: FMA4 will put the
11231 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11232 if (GET_CODE (op0) == VEC_DUPLICATE)
11233 op0 = XEXP (op0, 0);
11234 else if (GET_CODE (op1) == VEC_DUPLICATE)
11235 op1 = XEXP (op1, 0);
11237 if (GET_CODE (op0) == VEC_SELECT)
11238 op0 = XEXP (op0, 0);
11239 else if (GET_CODE (op1) == VEC_SELECT)
11240 op1 = XEXP (op1, 0);
11242 /* If the remaining parameters are not registers,
11243 get the cost to put them into registers. */
11244 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11245 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11246 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11247 return true;
11249 case FLOAT:
11250 case UNSIGNED_FLOAT:
11251 if (speed)
11252 *cost += extra_cost->fp[mode == DFmode].fromint;
11253 return false;
11255 case FLOAT_EXTEND:
11256 if (speed)
11258 if (VECTOR_MODE_P (mode))
11260 /* Vector widening conversion. */
11261 *cost += extra_cost->vect.alu;
11263 else
11264 *cost += extra_cost->fp[mode == DFmode].widen;
11266 return false;
11268 case FLOAT_TRUNCATE:
11269 if (speed)
11271 if (VECTOR_MODE_P (mode))
11273 /* Vector narrowing conversion. */
11274 *cost += extra_cost->vect.alu;
11276 else
11277 *cost += extra_cost->fp[mode == DFmode].narrow;
11279 return false;
11281 case FIX:
11282 case UNSIGNED_FIX:
11283 x = XEXP (x, 0);
11284 /* Strip the rounding part. They will all be implemented
11285 by the fcvt* family of instructions anyway. */
11286 if (GET_CODE (x) == UNSPEC)
11288 unsigned int uns_code = XINT (x, 1);
11290 if (uns_code == UNSPEC_FRINTA
11291 || uns_code == UNSPEC_FRINTM
11292 || uns_code == UNSPEC_FRINTN
11293 || uns_code == UNSPEC_FRINTP
11294 || uns_code == UNSPEC_FRINTZ)
11295 x = XVECEXP (x, 0, 0);
11298 if (speed)
11300 if (VECTOR_MODE_P (mode))
11301 *cost += extra_cost->vect.alu;
11302 else
11303 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11306 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11307 fixed-point fcvt. */
11308 if (GET_CODE (x) == MULT
11309 && ((VECTOR_MODE_P (mode)
11310 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11311 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11313 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11314 0, speed);
11315 return true;
11318 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11319 return true;
11321 case ABS:
11322 if (VECTOR_MODE_P (mode))
11324 /* ABS (vector). */
11325 if (speed)
11326 *cost += extra_cost->vect.alu;
11328 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11330 op0 = XEXP (x, 0);
11332 /* FABD, which is analogous to FADD. */
11333 if (GET_CODE (op0) == MINUS)
11335 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11336 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11337 if (speed)
11338 *cost += extra_cost->fp[mode == DFmode].addsub;
11340 return true;
11342 /* Simple FABS is analogous to FNEG. */
11343 if (speed)
11344 *cost += extra_cost->fp[mode == DFmode].neg;
11346 else
11348 /* Integer ABS will either be split into
11349 two arithmetic instructions, or will be an ABS
11350 (scalar), which we don't model. */
11351 *cost = COSTS_N_INSNS (2);
11352 if (speed)
11353 *cost += 2 * extra_cost->alu.arith;
11355 return false;
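/* For example, fabsf (x - y) (or the equivalent vector form) can be
   implemented as a single FABD, which is why the operands of the MINUS
   are costed directly in the FABD case above.  */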
11357 case SMAX:
11358 case SMIN:
11359 if (speed)
11361 if (VECTOR_MODE_P (mode))
11362 *cost += extra_cost->vect.alu;
11363 else
11365 /* FMAXNM/FMINNM/FMAX/FMIN.
11366 TODO: This may not be accurate for all implementations, but
11367 we do not model this in the cost tables. */
11368 *cost += extra_cost->fp[mode == DFmode].addsub;
11371 return false;
11373 case UNSPEC:
11374 /* The floating point round to integer frint* instructions. */
11375 if (aarch64_frint_unspec_p (XINT (x, 1)))
11377 if (speed)
11378 *cost += extra_cost->fp[mode == DFmode].roundint;
11380 return false;
11383 if (XINT (x, 1) == UNSPEC_RBIT)
11385 if (speed)
11386 *cost += extra_cost->alu.rev;
11388 return false;
11390 break;
11392 case TRUNCATE:
11394 /* Decompose <su>muldi3_highpart. */
11395 if (/* (truncate:DI */
11396 mode == DImode
11397 /* (lshiftrt:TI */
11398 && GET_MODE (XEXP (x, 0)) == TImode
11399 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11400 /* (mult:TI */
11401 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11402 /* (ANY_EXTEND:TI (reg:DI))
11403 (ANY_EXTEND:TI (reg:DI))) */
11404 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11405 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11406 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11407 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11408 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11409 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11410 /* (const_int 64) */
11411 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11412 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11414 /* UMULH/SMULH. */
11415 if (speed)
11416 *cost += extra_cost->mult[mode == DImode].extend;
11417 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11418 mode, MULT, 0, speed);
11419 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11420 mode, MULT, 1, speed);
11421 return true;
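/* For reference, the RTL matched above is roughly what the mid-end produces
   for a 64x64->128-bit high-part multiply written in C as:

     long smulh (long a, long b)
     {
       return (long) (((__int128) a * b) >> 64);
     }

   which maps to a single SMULH (UMULH for the unsigned variant).  */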
11424 /* Fall through. */
11425 default:
11426 break;
11429 if (dump_file
11430 && flag_aarch64_verbose_cost)
11431 fprintf (dump_file,
11432 "\nFailed to cost RTX. Assuming default cost.\n");
11434 return true;
11437 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
11438 calculated for X. This cost is stored in *COST. Returns true
11439 if the total cost of X was calculated. */
11440 static bool
11441 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11442 int param, int *cost, bool speed)
11444 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11446 if (dump_file
11447 && flag_aarch64_verbose_cost)
11449 print_rtl_single (dump_file, x);
11450 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11451 speed ? "Hot" : "Cold",
11452 *cost, result ? "final" : "partial");
11455 return result;
11458 static int
11459 aarch64_register_move_cost (machine_mode mode,
11460 reg_class_t from_i, reg_class_t to_i)
11462 enum reg_class from = (enum reg_class) from_i;
11463 enum reg_class to = (enum reg_class) to_i;
11464 const struct cpu_regmove_cost *regmove_cost
11465 = aarch64_tune_params.regmove_cost;
11467 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11468 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11469 to = GENERAL_REGS;
11471 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11472 from = GENERAL_REGS;
11474 /* Moving between a GPR and the stack costs the same as GP2GP. */
11475 if ((from == GENERAL_REGS && to == STACK_REG)
11476 || (to == GENERAL_REGS && from == STACK_REG))
11477 return regmove_cost->GP2GP;
11479 /* To/From the stack register, we move via the gprs. */
11480 if (to == STACK_REG || from == STACK_REG)
11481 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11482 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11484 if (known_eq (GET_MODE_SIZE (mode), 16))
11486 /* 128-bit operations on general registers require 2 instructions. */
11487 if (from == GENERAL_REGS && to == GENERAL_REGS)
11488 return regmove_cost->GP2GP * 2;
11489 else if (from == GENERAL_REGS)
11490 return regmove_cost->GP2FP * 2;
11491 else if (to == GENERAL_REGS)
11492 return regmove_cost->FP2GP * 2;
11494 /* When AdvSIMD instructions are disabled it is not possible to move
11495 a 128-bit value directly between Q registers. This is handled in
11496 secondary reload. A general register is used as a scratch to move
11497 the upper DI value and the lower DI value is moved directly,
11498 hence the cost is the sum of three moves. */
11499 if (! TARGET_SIMD)
11500 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11502 return regmove_cost->FP2FP;
11505 if (from == GENERAL_REGS && to == GENERAL_REGS)
11506 return regmove_cost->GP2GP;
11507 else if (from == GENERAL_REGS)
11508 return regmove_cost->GP2FP;
11509 else if (to == GENERAL_REGS)
11510 return regmove_cost->FP2GP;
11512 return regmove_cost->FP2FP;
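/* As a worked example: moving a 128-bit (e.g. TImode) value from a general
   register to an FP register is costed as GP2FP * 2, reflecting the two
   64-bit moves required; the individual figures come from the selected
   cpu_regmove_cost table.  */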
11515 static int
11516 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11517 reg_class_t rclass ATTRIBUTE_UNUSED,
11518 bool in ATTRIBUTE_UNUSED)
11520 return aarch64_tune_params.memmov_cost;
11523 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11524 to optimize 1.0/sqrt. */
11526 static bool
11527 use_rsqrt_p (machine_mode mode)
11529 return (!flag_trapping_math
11530 && flag_unsafe_math_optimizations
11531 && ((aarch64_tune_params.approx_modes->recip_sqrt
11532 & AARCH64_APPROX_MODE (mode))
11533 || flag_mrecip_low_precision_sqrt));
11536 /* Function to decide when to use the approximate reciprocal square root
11537 builtin. */
11539 static tree
11540 aarch64_builtin_reciprocal (tree fndecl)
11542 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11544 if (!use_rsqrt_p (mode))
11545 return NULL_TREE;
11546 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11549 /* Emit instruction sequence to compute either the approximate square root
11550 or its approximate reciprocal, depending on the flag RECP, and return
11551 whether the sequence was emitted or not. */
11553 bool
11554 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11556 machine_mode mode = GET_MODE (dst);
11558 if (GET_MODE_INNER (mode) == HFmode)
11560 gcc_assert (!recp);
11561 return false;
11564 if (!recp)
11566 if (!(flag_mlow_precision_sqrt
11567 || (aarch64_tune_params.approx_modes->sqrt
11568 & AARCH64_APPROX_MODE (mode))))
11569 return false;
11571 if (flag_finite_math_only
11572 || flag_trapping_math
11573 || !flag_unsafe_math_optimizations
11574 || optimize_function_for_size_p (cfun))
11575 return false;
11577 else
11578 /* Caller assumes we cannot fail. */
11579 gcc_assert (use_rsqrt_p (mode));
11581 machine_mode mmsk = mode_for_int_vector (mode).require ();
11582 rtx xmsk = gen_reg_rtx (mmsk);
11583 if (!recp)
11584 /* When calculating the approximate square root, compare the
11585 argument with 0.0 and create a mask. */
11586 emit_insn (gen_rtx_SET (xmsk,
11587 gen_rtx_NEG (mmsk,
11588 gen_rtx_EQ (mmsk, src,
11589 CONST0_RTX (mode)))));
11591 /* Estimate the approximate reciprocal square root. */
11592 rtx xdst = gen_reg_rtx (mode);
11593 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11595 /* Iterate over the series twice for SF and thrice for DF. */
11596 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11598 /* Optionally iterate over the series once less for faster performance,
11599 at the expense of some accuracy. */
11600 if ((recp && flag_mrecip_low_precision_sqrt)
11601 || (!recp && flag_mlow_precision_sqrt))
11602 iterations--;
11604 /* Iterate over the series to calculate the approximate reciprocal square
11605 root. */
11606 rtx x1 = gen_reg_rtx (mode);
11607 while (iterations--)
11609 rtx x2 = gen_reg_rtx (mode);
11610 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11612 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11614 if (iterations > 0)
11615 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11618 if (!recp)
11620 /* Qualify the approximate reciprocal square root when the argument is
11621 0.0 by squashing the intermediary result to 0.0. */
11622 rtx xtmp = gen_reg_rtx (mmsk);
11623 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11624 gen_rtx_SUBREG (mmsk, xdst, 0)));
11625 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11627 /* Calculate the approximate square root. */
11628 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11631 /* Finalize the approximation. */
11632 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11634 return true;
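/* A minimal scalar sketch of the refinement step used above, assuming
   FRSQRTE provides an initial estimate X of 1/sqrt(D) and FRSQRTS computes
   (3 - D * X * X) / 2:

     static float
     rsqrt_step (float d, float x)
     {
       return x * ((3.0f - d * x * x) * 0.5f);   // one Newton-Raphson step
     }

   Two such steps for SF (three for DF) refine the estimate; the square root
   itself is then obtained as D * (1/sqrt(D)).  */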
11637 /* Emit the instruction sequence to compute the approximation for the division
11638 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11640 bool
11641 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11643 machine_mode mode = GET_MODE (quo);
11645 if (GET_MODE_INNER (mode) == HFmode)
11646 return false;
11648 bool use_approx_division_p = (flag_mlow_precision_div
11649 || (aarch64_tune_params.approx_modes->division
11650 & AARCH64_APPROX_MODE (mode)));
11652 if (!flag_finite_math_only
11653 || flag_trapping_math
11654 || !flag_unsafe_math_optimizations
11655 || optimize_function_for_size_p (cfun)
11656 || !use_approx_division_p)
11657 return false;
11659 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11660 return false;
11662 /* Estimate the approximate reciprocal. */
11663 rtx xrcp = gen_reg_rtx (mode);
11664 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11666 /* Iterate over the series twice for SF and thrice for DF. */
11667 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11669 /* Optionally iterate over the series once less for faster performance,
11670 at the expense of some accuracy. */
11671 if (flag_mlow_precision_div)
11672 iterations--;
11674 /* Iterate over the series to calculate the approximate reciprocal. */
11675 rtx xtmp = gen_reg_rtx (mode);
11676 while (iterations--)
11678 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11680 if (iterations > 0)
11681 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11684 if (num != CONST1_RTX (mode))
11686 /* As the approximate reciprocal of DEN is already calculated, only
11687 calculate the approximate division when NUM is not 1.0. */
11688 rtx xnum = force_reg (mode, num);
11689 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11692 /* Finalize the approximation. */
11693 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11694 return true;
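/* A minimal scalar sketch of the refinement step used above, assuming
   FRECPE provides an initial estimate X of 1/D and FRECPS computes
   2 - D * X:

     static float
     recip_step (float d, float x)
     {
       return x * (2.0f - d * x);   // one Newton-Raphson step
     }

   After the final step, NUM / DEN is approximated as NUM * (1/DEN).  */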
11697 /* Return the number of instructions that can be issued per cycle. */
11698 static int
11699 aarch64_sched_issue_rate (void)
11701 return aarch64_tune_params.issue_rate;
11704 static int
11705 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11707 int issue_rate = aarch64_sched_issue_rate ();
11709 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11713 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11714 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11715 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11717 static int
11718 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11719 int ready_index)
11721 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11725 /* Vectorizer cost model target hooks. */
11727 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11728 static int
11729 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11730 tree vectype,
11731 int misalign ATTRIBUTE_UNUSED)
11733 unsigned elements;
11734 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11735 bool fp = false;
11737 if (vectype != NULL)
11738 fp = FLOAT_TYPE_P (vectype);
11740 switch (type_of_cost)
11742 case scalar_stmt:
11743 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11745 case scalar_load:
11746 return costs->scalar_load_cost;
11748 case scalar_store:
11749 return costs->scalar_store_cost;
11751 case vector_stmt:
11752 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11754 case vector_load:
11755 return costs->vec_align_load_cost;
11757 case vector_store:
11758 return costs->vec_store_cost;
11760 case vec_to_scalar:
11761 return costs->vec_to_scalar_cost;
11763 case scalar_to_vec:
11764 return costs->scalar_to_vec_cost;
11766 case unaligned_load:
11767 case vector_gather_load:
11768 return costs->vec_unalign_load_cost;
11770 case unaligned_store:
11771 case vector_scatter_store:
11772 return costs->vec_unalign_store_cost;
11774 case cond_branch_taken:
11775 return costs->cond_taken_branch_cost;
11777 case cond_branch_not_taken:
11778 return costs->cond_not_taken_branch_cost;
11780 case vec_perm:
11781 return costs->vec_permute_cost;
11783 case vec_promote_demote:
11784 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11786 case vec_construct:
11787 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11788 return elements / 2 + 1;
11790 default:
11791 gcc_unreachable ();
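/* As a worked example of the vec_construct case: for a V4SI vector,
   elements == 4, so the heuristic above returns 4 / 2 + 1 == 3 cost
   units.  */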
11795 /* Implement targetm.vectorize.add_stmt_cost. */
11796 static unsigned
11797 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11798 struct _stmt_vec_info *stmt_info, int misalign,
11799 enum vect_cost_model_location where)
11801 unsigned *cost = (unsigned *) data;
11802 unsigned retval = 0;
11804 if (flag_vect_cost_model)
11806 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11807 int stmt_cost =
11808 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11810 /* Statements in an inner loop relative to the loop being
11811 vectorized are weighted more heavily. The value here is
11812 arbitrary and could potentially be improved with analysis. */
11813 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11814 count *= 50; /* FIXME */
11816 retval = (unsigned) (count * stmt_cost);
11817 cost[where] += retval;
11820 return retval;
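/* For example, a vector statement with a base cost of 1 that sits in an
   inner loop relative to the loop being vectorized is recorded as
   1 * 50 == 50 units in the vect_body bucket, whereas the same statement
   in the outermost vectorized loop contributes just 1.  */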
11823 static void initialize_aarch64_code_model (struct gcc_options *);
11825 /* Parse the TO_PARSE string and put the architecture struct that it
11826 selects into RES and the architectural features into ISA_FLAGS.
11827 Return an aarch64_parse_opt_result describing the parse result.
11828 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11829 When the TO_PARSE string contains an invalid extension,
11830 a copy of the string is created and stored to INVALID_EXTENSION. */
11832 static enum aarch64_parse_opt_result
11833 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11834 uint64_t *isa_flags, std::string *invalid_extension)
11836 const char *ext;
11837 const struct processor *arch;
11838 size_t len;
11840 ext = strchr (to_parse, '+');
11842 if (ext != NULL)
11843 len = ext - to_parse;
11844 else
11845 len = strlen (to_parse);
11847 if (len == 0)
11848 return AARCH64_PARSE_MISSING_ARG;
11851 /* Loop through the list of supported ARCHes to find a match. */
11852 for (arch = all_architectures; arch->name != NULL; arch++)
11854 if (strlen (arch->name) == len
11855 && strncmp (arch->name, to_parse, len) == 0)
11857 uint64_t isa_temp = arch->flags;
11859 if (ext != NULL)
11861 /* TO_PARSE string contains at least one extension. */
11862 enum aarch64_parse_opt_result ext_res
11863 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11865 if (ext_res != AARCH64_PARSE_OK)
11866 return ext_res;
11868 /* Extension parsing was successful. Confirm the result
11869 arch and ISA flags. */
11870 *res = arch;
11871 *isa_flags = isa_temp;
11872 return AARCH64_PARSE_OK;
11876 /* ARCH name not found in list. */
11877 return AARCH64_PARSE_INVALID_ARG;
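/* For example, "-march=armv8.2-a+sve" is split here into the architecture
   name "armv8.2-a" (LEN stops at the '+') and the extension string "+sve",
   which is handed to aarch64_parse_extension to adjust the ISA flags.  */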
11880 /* Parse the TO_PARSE string and put the result tuning in RES and the
11881 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11882 describing the parse result. If there is an error parsing, RES and
11883 ISA_FLAGS are left unchanged.
11884 When the TO_PARSE string contains an invalid extension,
11885 a copy of the string is created and stored to INVALID_EXTENSION. */
11887 static enum aarch64_parse_opt_result
11888 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11889 uint64_t *isa_flags, std::string *invalid_extension)
11891 const char *ext;
11892 const struct processor *cpu;
11893 size_t len;
11895 ext = strchr (to_parse, '+');
11897 if (ext != NULL)
11898 len = ext - to_parse;
11899 else
11900 len = strlen (to_parse);
11902 if (len == 0)
11903 return AARCH64_PARSE_MISSING_ARG;
11906 /* Loop through the list of supported CPUs to find a match. */
11907 for (cpu = all_cores; cpu->name != NULL; cpu++)
11909 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11911 uint64_t isa_temp = cpu->flags;
11914 if (ext != NULL)
11916 /* TO_PARSE string contains at least one extension. */
11917 enum aarch64_parse_opt_result ext_res
11918 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11920 if (ext_res != AARCH64_PARSE_OK)
11921 return ext_res;
11923 /* Extension parsing was successful. Confirm the result
11924 cpu and ISA flags. */
11925 *res = cpu;
11926 *isa_flags = isa_temp;
11927 return AARCH64_PARSE_OK;
11931 /* CPU name not found in list. */
11932 return AARCH64_PARSE_INVALID_ARG;
11935 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11936 Return an aarch64_parse_opt_result describing the parse result.
11937 If the parsing fails the RES does not change. */
11939 static enum aarch64_parse_opt_result
11940 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11942 const struct processor *cpu;
11944 /* Loop through the list of supported CPUs to find a match. */
11945 for (cpu = all_cores; cpu->name != NULL; cpu++)
11947 if (strcmp (cpu->name, to_parse) == 0)
11949 *res = cpu;
11950 return AARCH64_PARSE_OK;
11954 /* CPU name not found in list. */
11955 return AARCH64_PARSE_INVALID_ARG;
11958 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11959 described in FLAG. If it is, return the index bit for that fusion type.
11960 If not, error (printing OPTION_NAME) and return zero. */
11962 static unsigned int
11963 aarch64_parse_one_option_token (const char *token,
11964 size_t length,
11965 const struct aarch64_flag_desc *flag,
11966 const char *option_name)
11968 for (; flag->name != NULL; flag++)
11970 if (length == strlen (flag->name)
11971 && !strncmp (flag->name, token, length))
11972 return flag->flag;
11975 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11976 return 0;
11979 /* Parse OPTION which is a comma-separated list of flags to enable.
11980 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11981 default state we inherit from the CPU tuning structures. OPTION_NAME
11982 gives the top-level option we are parsing in the -moverride string,
11983 for use in error messages. */
11985 static unsigned int
11986 aarch64_parse_boolean_options (const char *option,
11987 const struct aarch64_flag_desc *flags,
11988 unsigned int initial_state,
11989 const char *option_name)
11991 const char separator = '.';
11992 const char* specs = option;
11993 const char* ntoken = option;
11994 unsigned int found_flags = initial_state;
11996 while ((ntoken = strchr (specs, separator)))
11998 size_t token_length = ntoken - specs;
11999 unsigned token_ops = aarch64_parse_one_option_token (specs,
12000 token_length,
12001 flags,
12002 option_name);
12003 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12004 in the token stream, reset the supported operations. So:
12006 adrp+add.cmp+branch.none.adrp+add
12008 would have the result of turning on only adrp+add fusion. */
12009 if (!token_ops)
12010 found_flags = 0;
12012 found_flags |= token_ops;
12013 specs = ++ntoken;
12016 /* We ended with a trailing separator; the string is ill-formed. */
12017 if (!(*specs))
12019 error ("%s string ill-formed\n", option_name);
12020 return 0;
12023 /* We still have one more token to parse. */
12024 size_t token_length = strlen (specs);
12025 unsigned token_ops = aarch64_parse_one_option_token (specs,
12026 token_length,
12027 flags,
12028 option_name);
12029 if (!token_ops)
12030 found_flags = 0;
12032 found_flags |= token_ops;
12033 return found_flags;
12036 /* Support for overriding instruction fusion. */
12038 static void
12039 aarch64_parse_fuse_string (const char *fuse_string,
12040 struct tune_params *tune)
12042 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12043 aarch64_fusible_pairs,
12044 tune->fusible_ops,
12045 "fuse=");
12048 /* Support for overriding other tuning flags. */
12050 static void
12051 aarch64_parse_tune_string (const char *tune_string,
12052 struct tune_params *tune)
12054 tune->extra_tuning_flags
12055 = aarch64_parse_boolean_options (tune_string,
12056 aarch64_tuning_flags,
12057 tune->extra_tuning_flags,
12058 "tune=");
12061 /* Parse the sve_width tuning -moverride string in TUNE_STRING.
12062 Accept the valid SVE vector widths allowed by
12063 aarch64_sve_vector_bits_enum and use it to override sve_width
12064 in TUNE. */
12066 static void
12067 aarch64_parse_sve_width_string (const char *tune_string,
12068 struct tune_params *tune)
12070 int width = -1;
12072 int n = sscanf (tune_string, "%d", &width);
12073 if (n != 1)
12075 error ("invalid format for sve_width");
12076 return;
12078 switch (width)
12080 case SVE_128:
12081 case SVE_256:
12082 case SVE_512:
12083 case SVE_1024:
12084 case SVE_2048:
12085 break;
12086 default:
12087 error ("invalid sve_width value: %d", width);
12089 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12092 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12093 we understand. If it is, extract the option string and hand it off to
12094 the appropriate function. */
12096 void
12097 aarch64_parse_one_override_token (const char* token,
12098 size_t length,
12099 struct tune_params *tune)
12101 const struct aarch64_tuning_override_function *fn
12102 = aarch64_tuning_override_functions;
12104 const char *option_part = strchr (token, '=');
12105 if (!option_part)
12107 error ("tuning string missing in option (%s)", token);
12108 return;
12111 /* Get the length of the option name. */
12112 length = option_part - token;
12113 /* Skip the '=' to get to the option string. */
12114 option_part++;
12116 for (; fn->name != NULL; fn++)
12118 if (!strncmp (fn->name, token, length))
12120 fn->parse_override (option_part, tune);
12121 return;
12125 error ("unknown tuning option (%s)", token);
12126 return;
12129 /* A checking mechanism for the implementation of the TLS size. */
12131 static void
12132 initialize_aarch64_tls_size (struct gcc_options *opts)
12134 if (aarch64_tls_size == 0)
12135 aarch64_tls_size = 24;
12137 switch (opts->x_aarch64_cmodel_var)
12139 case AARCH64_CMODEL_TINY:
12140 /* Both the default and maximum TLS size allowed under tiny are 1M, which
12141 needs two instructions to address, so we clamp the size to 24. */
12142 if (aarch64_tls_size > 24)
12143 aarch64_tls_size = 24;
12144 break;
12145 case AARCH64_CMODEL_SMALL:
12146 /* The maximum TLS size allowed under small is 4G. */
12147 if (aarch64_tls_size > 32)
12148 aarch64_tls_size = 32;
12149 break;
12150 case AARCH64_CMODEL_LARGE:
12151 /* The maximum TLS size allowed under large is 16E.
12152 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
12153 if (aarch64_tls_size > 48)
12154 aarch64_tls_size = 48;
12155 break;
12156 default:
12157 gcc_unreachable ();
12160 return;
12163 /* Parse STRING looking for options in the format:
12164 string :: option:string
12165 option :: name=substring
12166 name :: {a-z}
12167 substring :: defined by option. */
12169 static void
12170 aarch64_parse_override_string (const char* input_string,
12171 struct tune_params* tune)
12173 const char separator = ':';
12174 size_t string_length = strlen (input_string) + 1;
12175 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12176 char *string = string_root;
12177 strncpy (string, input_string, string_length);
12178 string[string_length - 1] = '\0';
12180 char* ntoken = string;
12182 while ((ntoken = strchr (string, separator)))
12184 size_t token_length = ntoken - string;
12185 /* Make this substring look like a string. */
12186 *ntoken = '\0';
12187 aarch64_parse_one_override_token (string, token_length, tune);
12188 string = ++ntoken;
12191 /* One last option to parse. */
12192 aarch64_parse_one_override_token (string, strlen (string), tune);
12193 free (string_root);
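/* For example (using option names registered in
   aarch64_tuning_override_functions), an override string such as:

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   is split on ':' into the tokens "fuse=adrp+add.cmp+branch" and
   "sve_width=256", which are handled by aarch64_parse_fuse_string and
   aarch64_parse_sve_width_string respectively.  */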
12197 static void
12198 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12200 if (accepted_branch_protection_string)
12202 opts->x_aarch64_branch_protection_string
12203 = xstrdup (accepted_branch_protection_string);
12206 /* PR 70044: We have to be careful about being called multiple times for the
12207 same function. This means all changes should be repeatable. */
12209 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12210 Disable the frame pointer flag so the mid-end will not use a frame
12211 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12212 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12213 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12214 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12215 if (opts->x_flag_omit_frame_pointer == 0)
12216 opts->x_flag_omit_frame_pointer = 2;
12218 /* If not optimizing for size, set the default
12219 alignment to what the target wants. */
12220 if (!opts->x_optimize_size)
12222 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12223 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12224 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12225 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12226 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12227 opts->x_str_align_functions = aarch64_tune_params.function_align;
12230 /* We default to no pc-relative literal loads. */
12232 aarch64_pcrelative_literal_loads = false;
12234 /* If -mpc-relative-literal-loads is set on the command line, this
12235 implies that the user asked for PC relative literal loads. */
12236 if (opts->x_pcrelative_literal_loads == 1)
12237 aarch64_pcrelative_literal_loads = true;
12239 /* In the tiny memory model it makes no sense to disallow PC relative
12240 literal pool loads. */
12241 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12242 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12243 aarch64_pcrelative_literal_loads = true;
12245 /* When enabling the lower precision Newton series for the square root, also
12246 enable it for the reciprocal square root, since the latter is an
12247 intermediary step for the former. */
12248 if (flag_mlow_precision_sqrt)
12249 flag_mrecip_low_precision_sqrt = true;
12252 /* 'Unpack' up the internal tuning structs and update the options
12253 in OPTS. The caller must have set up selected_tune and selected_arch
12254 as all the other target-specific codegen decisions are
12255 derived from them. */
12257 void
12258 aarch64_override_options_internal (struct gcc_options *opts)
12260 aarch64_tune_flags = selected_tune->flags;
12261 aarch64_tune = selected_tune->sched_core;
12262 /* Make a copy of the tuning parameters attached to the core, which
12263 we may later overwrite. */
12264 aarch64_tune_params = *(selected_tune->tune);
12265 aarch64_architecture_version = selected_arch->architecture_version;
12267 if (opts->x_aarch64_override_tune_string)
12268 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12269 &aarch64_tune_params);
12271 /* This target defaults to strict volatile bitfields. */
12272 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12273 opts->x_flag_strict_volatile_bitfields = 1;
12275 if (aarch64_stack_protector_guard == SSP_GLOBAL
12276 && opts->x_aarch64_stack_protector_guard_offset_str)
12278 error ("incompatible options %<-mstack-protector-guard=global%> and "
12279 "%<-mstack-protector-guard-offset=%s%>",
12280 aarch64_stack_protector_guard_offset_str);
12283 if (aarch64_stack_protector_guard == SSP_SYSREG
12284 && !(opts->x_aarch64_stack_protector_guard_offset_str
12285 && opts->x_aarch64_stack_protector_guard_reg_str))
12287 error ("both %<-mstack-protector-guard-offset%> and "
12288 "%<-mstack-protector-guard-reg%> must be used "
12289 "with %<-mstack-protector-guard=sysreg%>");
12292 if (opts->x_aarch64_stack_protector_guard_reg_str)
12294 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12295 error ("specify a system register with a small string length");
12298 if (opts->x_aarch64_stack_protector_guard_offset_str)
12300 char *end;
12301 const char *str = aarch64_stack_protector_guard_offset_str;
12302 errno = 0;
12303 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12304 if (!*str || *end || errno)
12305 error ("%qs is not a valid offset in %qs", str,
12306 "-mstack-protector-guard-offset=");
12307 aarch64_stack_protector_guard_offset = offs;
12310 initialize_aarch64_code_model (opts);
12311 initialize_aarch64_tls_size (opts);
12313 int queue_depth = 0;
12314 switch (aarch64_tune_params.autoprefetcher_model)
12316 case tune_params::AUTOPREFETCHER_OFF:
12317 queue_depth = -1;
12318 break;
12319 case tune_params::AUTOPREFETCHER_WEAK:
12320 queue_depth = 0;
12321 break;
12322 case tune_params::AUTOPREFETCHER_STRONG:
12323 queue_depth = max_insn_queue_index + 1;
12324 break;
12325 default:
12326 gcc_unreachable ();
12329 /* We don't mind passing in global_options_set here as we don't use
12330 the *options_set structs anyway. */
12331 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12332 queue_depth,
12333 opts->x_param_values,
12334 global_options_set.x_param_values);
12336 /* Set up parameters to be used in prefetching algorithm. Do not
12337 override the defaults unless we are tuning for a core we have
12338 researched values for. */
12339 if (aarch64_tune_params.prefetch->num_slots > 0)
12340 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12341 aarch64_tune_params.prefetch->num_slots,
12342 opts->x_param_values,
12343 global_options_set.x_param_values);
12344 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12345 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12346 aarch64_tune_params.prefetch->l1_cache_size,
12347 opts->x_param_values,
12348 global_options_set.x_param_values);
12349 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12350 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12351 aarch64_tune_params.prefetch->l1_cache_line_size,
12352 opts->x_param_values,
12353 global_options_set.x_param_values);
12354 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12355 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12356 aarch64_tune_params.prefetch->l2_cache_size,
12357 opts->x_param_values,
12358 global_options_set.x_param_values);
12359 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12360 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12362 opts->x_param_values,
12363 global_options_set.x_param_values);
12364 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12365 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12366 aarch64_tune_params.prefetch->minimum_stride,
12367 opts->x_param_values,
12368 global_options_set.x_param_values);
12370 /* Use the alternative scheduling-pressure algorithm by default. */
12371 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12372 opts->x_param_values,
12373 global_options_set.x_param_values);
12375 /* If the user hasn't changed it via configure then set the default to 64 KB
12376 for the backend. */
12377 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12378 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12379 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12380 opts->x_param_values,
12381 global_options_set.x_param_values);
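/* The guard size parameter is a power of two, so the default of 16 used
   above corresponds to a 2^16-byte (64 KiB) guard region, and the probing
   interval is forced to match it.  */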
12383 /* Validate the guard size. */
12384 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12386 /* Enforce that the probing interval is the same as the guard size so the
12387 mid-end does the right thing. */
12388 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12389 guard_size,
12390 opts->x_param_values,
12391 global_options_set.x_param_values);
12393 /* The maybe_set calls won't update the value if the user has explicitly set
12394 one, which means we need to validate that the probing interval and the
12395 guard size are equal. */
12396 int probe_interval
12397 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12398 if (guard_size != probe_interval)
12399 error ("stack clash guard size %<%d%> must be equal to probing interval "
12400 "%<%d%>", guard_size, probe_interval);
12402 /* Enable software prefetching at the specified optimization level for
12403 CPUs that have prefetch. Lower the optimization level threshold by 1
12404 when profiling is enabled. */
12405 if (opts->x_flag_prefetch_loop_arrays < 0
12406 && !opts->x_optimize_size
12407 && aarch64_tune_params.prefetch->default_opt_level >= 0
12408 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12409 opts->x_flag_prefetch_loop_arrays = 1;
12411 if (opts->x_aarch64_arch_string == NULL)
12412 opts->x_aarch64_arch_string = selected_arch->name;
12413 if (opts->x_aarch64_cpu_string == NULL)
12414 opts->x_aarch64_cpu_string = selected_cpu->name;
12415 if (opts->x_aarch64_tune_string == NULL)
12416 opts->x_aarch64_tune_string = selected_tune->name;
12418 aarch64_override_options_after_change_1 (opts);
12421 /* Print a hint with a suggestion for a core or architecture name that
12422 most closely resembles what the user passed in STR. ARCH is true if
12423 the user is asking for an architecture name. ARCH is false if the user
12424 is asking for a core name. */
12426 static void
12427 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12429 auto_vec<const char *> candidates;
12430 const struct processor *entry = arch ? all_architectures : all_cores;
12431 for (; entry->name != NULL; entry++)
12432 candidates.safe_push (entry->name);
12434 #ifdef HAVE_LOCAL_CPU_DETECT
12435 /* Add also "native" as possible value. */
12436 if (arch)
12437 candidates.safe_push ("native");
12438 #endif
12440 char *s;
12441 const char *hint = candidates_list_and_hint (str, s, candidates);
12442 if (hint)
12443 inform (input_location, "valid arguments are: %s;"
12444 " did you mean %qs?", s, hint);
12445 else
12446 inform (input_location, "valid arguments are: %s", s);
12448 XDELETEVEC (s);
12451 /* Print a hint with a suggestion for a core name that most closely resembles
12452 what the user passed in STR. */
12454 inline static void
12455 aarch64_print_hint_for_core (const char *str)
12457 aarch64_print_hint_for_core_or_arch (str, false);
12460 /* Print a hint with a suggestion for an architecture name that most closely
12461 resembles what the user passed in STR. */
12463 inline static void
12464 aarch64_print_hint_for_arch (const char *str)
12466 aarch64_print_hint_for_core_or_arch (str, true);
12470 /* Print a hint with a suggestion for an extension name
12471 that most closely resembles what the user passed in STR. */
12473 void
12474 aarch64_print_hint_for_extensions (const std::string &str)
12476 auto_vec<const char *> candidates;
12477 aarch64_get_all_extension_candidates (&candidates);
12478 char *s;
12479 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12480 if (hint)
12481 inform (input_location, "valid arguments are: %s;"
12482 " did you mean %qs?", s, hint);
12483 else
12484 inform (input_location, "valid arguments are: %s", s);
12486 XDELETEVEC (s);
12489 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12490 specified in STR and throw errors if appropriate. Put the results if
12491 they are valid in RES and ISA_FLAGS. Return whether the option is
12492 valid. */
12494 static bool
12495 aarch64_validate_mcpu (const char *str, const struct processor **res,
12496 uint64_t *isa_flags)
12498 std::string invalid_extension;
12499 enum aarch64_parse_opt_result parse_res
12500 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12502 if (parse_res == AARCH64_PARSE_OK)
12503 return true;
12505 switch (parse_res)
12507 case AARCH64_PARSE_MISSING_ARG:
12508 error ("missing cpu name in %<-mcpu=%s%>", str);
12509 break;
12510 case AARCH64_PARSE_INVALID_ARG:
12511 error ("unknown value %qs for %<-mcpu%>", str);
12512 aarch64_print_hint_for_core (str);
12513 break;
12514 case AARCH64_PARSE_INVALID_FEATURE:
12515 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12516 invalid_extension.c_str (), str);
12517 aarch64_print_hint_for_extensions (invalid_extension);
12518 break;
12519 default:
12520 gcc_unreachable ();
12523 return false;
12526 /* Parses CONST_STR for branch protection features specified in
12527 aarch64_branch_protect_types, and sets any global variables required.
12528 Returns the parsing result and assigns LAST_STR to the last processed
12529 token from CONST_STR so that it can be used for error reporting. */
12531 static enum
12532 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12533 char** last_str)
12535 char *str_root = xstrdup (const_str);
12536 char* token_save = NULL;
12537 char *str = strtok_r (str_root, "+", &token_save);
12538 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12539 if (!str)
12540 res = AARCH64_PARSE_MISSING_ARG;
12541 else
12543 char *next_str = strtok_r (NULL, "+", &token_save);
12544 /* Reset the branch protection features to their defaults. */
12545 aarch64_handle_no_branch_protection (NULL, NULL);
12547 while (str && res == AARCH64_PARSE_OK)
12549 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12550 bool found = false;
12551 /* Search for this type. */
12552 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12554 if (strcmp (str, type->name) == 0)
12556 found = true;
12557 res = type->handler (str, next_str);
12558 str = next_str;
12559 next_str = strtok_r (NULL, "+", &token_save);
12561 else
12562 type++;
12564 if (found && res == AARCH64_PARSE_OK)
12566 bool found_subtype = true;
12567 /* Loop through each token until we find one that isn't a
12568 subtype. */
12569 while (found_subtype)
12571 found_subtype = false;
12572 const aarch64_branch_protect_type *subtype = type->subtypes;
12573 /* Search for the subtype. */
12574 while (str && subtype && subtype->name && !found_subtype
12575 && res == AARCH64_PARSE_OK)
12577 if (strcmp (str, subtype->name) == 0)
12579 found_subtype = true;
12580 res = subtype->handler (str, next_str);
12581 str = next_str;
12582 next_str = strtok_r (NULL, "+", &token_save);
12584 else
12585 subtype++;
12589 else if (!found)
12590 res = AARCH64_PARSE_INVALID_ARG;
12593 /* Copy the last processed token into the argument to pass it back.
12594 Used by option and attribute validation to print the offending token. */
12595 if (last_str)
12597 if (str) strcpy (*last_str, str);
12598 else *last_str = NULL;
12600 if (res == AARCH64_PARSE_OK)
12602 /* If needed, alloc the accepted string then copy in const_str.
12603 Used by override_option_after_change_1. */
12604 if (!accepted_branch_protection_string)
12605 accepted_branch_protection_string = (char *) xmalloc (
12606 BRANCH_PROTECT_STR_MAX
12607 + 1);
12608 strncpy (accepted_branch_protection_string, const_str,
12609 BRANCH_PROTECT_STR_MAX + 1);
12610 /* Forcibly null-terminate. */
12611 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12613 return res;
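/* For example, "-mbranch-protection=pac-ret+leaf+bti" is tokenized on '+'
   into "pac-ret", "leaf" and "bti": "pac-ret" matches a top-level type,
   "leaf" is then accepted as one of its subtypes, and "bti" matches another
   top-level type.  */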
12616 static bool
12617 aarch64_validate_mbranch_protection (const char *const_str)
12619 char *str = (char *) xmalloc (strlen (const_str) + 1);
12620 enum aarch64_parse_opt_result res =
12621 aarch64_parse_branch_protection (const_str, &str);
12622 if (res == AARCH64_PARSE_INVALID_ARG)
12623 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12624 else if (res == AARCH64_PARSE_MISSING_ARG)
12625 error ("missing argument for %<-mbranch-protection=%>");
12626 free (str);
12627 return res == AARCH64_PARSE_OK;
12630 /* Validate a command-line -march option. Parse the arch and extensions
12631 (if any) specified in STR and throw errors if appropriate. Put the
12632 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12633 option is valid. */
12635 static bool
12636 aarch64_validate_march (const char *str, const struct processor **res,
12637 uint64_t *isa_flags)
12639 std::string invalid_extension;
12640 enum aarch64_parse_opt_result parse_res
12641 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12643 if (parse_res == AARCH64_PARSE_OK)
12644 return true;
12646 switch (parse_res)
12648 case AARCH64_PARSE_MISSING_ARG:
12649 error ("missing arch name in %<-march=%s%>", str);
12650 break;
12651 case AARCH64_PARSE_INVALID_ARG:
12652 error ("unknown value %qs for %<-march%>", str);
12653 aarch64_print_hint_for_arch (str);
12654 break;
12655 case AARCH64_PARSE_INVALID_FEATURE:
12656 error ("invalid feature modifier %qs in %<-march=%s%>",
12657 invalid_extension.c_str (), str);
12658 aarch64_print_hint_for_extensions (invalid_extension);
12659 break;
12660 default:
12661 gcc_unreachable ();
12664 return false;
12667 /* Validate a command-line -mtune option. Parse the cpu
12668 specified in STR and throw errors if appropriate. Put the
12669 result, if it is valid, in RES. Return whether the option is
12670 valid. */
12672 static bool
12673 aarch64_validate_mtune (const char *str, const struct processor **res)
12675 enum aarch64_parse_opt_result parse_res
12676 = aarch64_parse_tune (str, res);
12678 if (parse_res == AARCH64_PARSE_OK)
12679 return true;
12681 switch (parse_res)
12683 case AARCH64_PARSE_MISSING_ARG:
12684 error ("missing cpu name in %<-mtune=%s%>", str);
12685 break;
12686 case AARCH64_PARSE_INVALID_ARG:
12687 error ("unknown value %qs for %<-mtune%>", str);
12688 aarch64_print_hint_for_core (str);
12689 break;
12690 default:
12691 gcc_unreachable ();
12693 return false;
12696 /* Return the CPU corresponding to the enum CPU.
12697 If it doesn't specify a cpu, return the default. */
12699 static const struct processor *
12700 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12702 if (cpu != aarch64_none)
12703 return &all_cores[cpu];
12705 /* The & 0x3f is to extract the bottom 6 bits that encode the
12706 default cpu as selected by the --with-cpu GCC configure option
12707 in config.gcc.
12708 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12709 flags mechanism should be reworked to make it more sane. */
12710 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12713 /* Return the architecture corresponding to the enum ARCH.
12714 If it doesn't specify a valid architecture, return the default. */
12716 static const struct processor *
12717 aarch64_get_arch (enum aarch64_arch arch)
12719 if (arch != aarch64_no_arch)
12720 return &all_architectures[arch];
12722 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12724 return &all_architectures[cpu->arch];
12727 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12729 static poly_uint16
12730 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12732 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12733 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12734 deciding which .md file patterns to use and when deciding whether
12735 something is a legitimate address or constant. */
12736 if (value == SVE_SCALABLE || value == SVE_128)
12737 return poly_uint16 (2, 2);
12738 else
12739 return (int) value / 64;
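/* For example, -msve-vector-bits=256 yields 256 / 64 == 4 (four 64-bit
   granules per vector), whereas SVE_SCALABLE and SVE_128 both produce the
   runtime-variable poly_uint16 (2, 2), so the rest of the compiler treats
   the vector length as unknown at compile time.  */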
12742 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12743 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12744 tuning structs. In particular it must set selected_tune and
12745 aarch64_isa_flags that define the available ISA features and tuning
12746 decisions. It must also set selected_arch as this will be used to
12747 output the .arch asm tags for each function. */
12749 static void
12750 aarch64_override_options (void)
12752 uint64_t cpu_isa = 0;
12753 uint64_t arch_isa = 0;
12754 aarch64_isa_flags = 0;
12756 bool valid_cpu = true;
12757 bool valid_tune = true;
12758 bool valid_arch = true;
12760 selected_cpu = NULL;
12761 selected_arch = NULL;
12762 selected_tune = NULL;
12764 if (aarch64_branch_protection_string)
12765 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12767 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12768 If either of -march or -mtune is given, they override their
12769 respective component of -mcpu. */
12770 if (aarch64_cpu_string)
12771 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12772 &cpu_isa);
12774 if (aarch64_arch_string)
12775 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12776 &arch_isa);
12778 if (aarch64_tune_string)
12779 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12781 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12782 SUBTARGET_OVERRIDE_OPTIONS;
12783 #endif
12785 /* If the user did not specify a processor, choose the default
12786 one for them. This will be the CPU set during configuration using
12787 --with-cpu, otherwise it is "generic". */
12788 if (!selected_cpu)
12790 if (selected_arch)
12792 selected_cpu = &all_cores[selected_arch->ident];
12793 aarch64_isa_flags = arch_isa;
12794 explicit_arch = selected_arch->arch;
12796 else
12798 /* Get default configure-time CPU. */
12799 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12800 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12803 if (selected_tune)
12804 explicit_tune_core = selected_tune->ident;
12806 /* If both -mcpu and -march are specified check that they are architecturally
12807 compatible, warn if they're not and prefer the -march ISA flags. */
12808 else if (selected_arch)
12810 if (selected_arch->arch != selected_cpu->arch)
12812 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12813 all_architectures[selected_cpu->arch].name,
12814 selected_arch->name);
12816 aarch64_isa_flags = arch_isa;
12817 explicit_arch = selected_arch->arch;
12818 explicit_tune_core = selected_tune ? selected_tune->ident
12819 : selected_cpu->ident;
12821 else
12823 /* -mcpu but no -march. */
12824 aarch64_isa_flags = cpu_isa;
12825 explicit_tune_core = selected_tune ? selected_tune->ident
12826 : selected_cpu->ident;
12827 gcc_assert (selected_cpu);
12828 selected_arch = &all_architectures[selected_cpu->arch];
12829 explicit_arch = selected_arch->arch;
12832 /* Set the arch as well, as we will need it when outputting
12833 the .arch directive in assembly. */
12834 if (!selected_arch)
12836 gcc_assert (selected_cpu);
12837 selected_arch = &all_architectures[selected_cpu->arch];
12840 if (!selected_tune)
12841 selected_tune = selected_cpu;
12843 if (aarch64_enable_bti == 2)
12845 #ifdef TARGET_ENABLE_BTI
12846 aarch64_enable_bti = 1;
12847 #else
12848 aarch64_enable_bti = 0;
12849 #endif
12852 /* Return address signing is currently not supported for ILP32 targets. For
12853 LP64 targets use the configured option in the absence of a command-line
12854 option for -mbranch-protection. */
12855 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12857 #ifdef TARGET_ENABLE_PAC_RET
12858 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12859 #else
12860 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12861 #endif
12864 #ifndef HAVE_AS_MABI_OPTION
12865 /* The compiler may have been configured with 2.23.* binutils, which does
12866 not have support for ILP32. */
12867 if (TARGET_ILP32)
12868 error ("assembler does not support %<-mabi=ilp32%>");
12869 #endif
12871 /* Convert -msve-vector-bits to a VG count. */
12872 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12874 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12875 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12877 /* Make sure we properly set up the explicit options. */
12878 if ((aarch64_cpu_string && valid_cpu)
12879 || (aarch64_tune_string && valid_tune))
12880 gcc_assert (explicit_tune_core != aarch64_none);
12882 if ((aarch64_cpu_string && valid_cpu)
12883 || (aarch64_arch_string && valid_arch))
12884 gcc_assert (explicit_arch != aarch64_no_arch);
12886 /* The pass to insert speculation tracking runs before
12887 shrink-wrapping and the latter does not know how to update the
12888 tracking status. So disable it in this case. */
12889 if (aarch64_track_speculation)
12890 flag_shrink_wrap = 0;
12892 aarch64_override_options_internal (&global_options);
12894 /* Save these options as the default ones in case we push and pop them later
12895 while processing functions with potential target attributes. */
12896 target_option_default_node = target_option_current_node
12897 = build_target_option_node (&global_options);
12900 /* Implement targetm.override_options_after_change. */
12902 static void
12903 aarch64_override_options_after_change (void)
12905 aarch64_override_options_after_change_1 (&global_options);
12908 static struct machine_function *
12909 aarch64_init_machine_status (void)
12911 struct machine_function *machine;
12912 machine = ggc_cleared_alloc<machine_function> ();
12913 return machine;
12916 void
12917 aarch64_init_expanders (void)
12919 init_machine_status = aarch64_init_machine_status;
12922 /* A checking mechanism for the implementation of the various code models. */
12923 static void
12924 initialize_aarch64_code_model (struct gcc_options *opts)
12926 if (opts->x_flag_pic)
12928 switch (opts->x_aarch64_cmodel_var)
12930 case AARCH64_CMODEL_TINY:
12931 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12932 break;
12933 case AARCH64_CMODEL_SMALL:
12934 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12935 aarch64_cmodel = (flag_pic == 2
12936 ? AARCH64_CMODEL_SMALL_PIC
12937 : AARCH64_CMODEL_SMALL_SPIC);
12938 #else
12939 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12940 #endif
12941 break;
12942 case AARCH64_CMODEL_LARGE:
12943 sorry ("code model %qs with %<-f%s%>", "large",
12944 opts->x_flag_pic > 1 ? "PIC" : "pic");
12945 break;
12946 default:
12947 gcc_unreachable ();
12950 else
12951 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12954 /* Implement TARGET_OPTION_SAVE. */
12956 static void
12957 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12959 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12960 ptr->x_aarch64_branch_protection_string
12961 = opts->x_aarch64_branch_protection_string;
12964 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12965 using the information saved in PTR. */
12967 static void
12968 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12970 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12971 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12972 opts->x_explicit_arch = ptr->x_explicit_arch;
12973 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12974 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12975 opts->x_aarch64_branch_protection_string
12976 = ptr->x_aarch64_branch_protection_string;
12977 if (opts->x_aarch64_branch_protection_string)
12979 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12980 NULL);
12983 aarch64_override_options_internal (opts);
12986 /* Implement TARGET_OPTION_PRINT. */
12988 static void
12989 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12991 const struct processor *cpu
12992 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12993 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12994 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12995 std::string extension
12996 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12998 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12999 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13000 arch->name, extension.c_str ());
13003 static GTY(()) tree aarch64_previous_fndecl;
13005 void
13006 aarch64_reset_previous_fndecl (void)
13008 aarch64_previous_fndecl = NULL;
13011 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13012 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13013 make sure optab availability predicates are recomputed when necessary. */
13015 void
13016 aarch64_save_restore_target_globals (tree new_tree)
13018 if (TREE_TARGET_GLOBALS (new_tree))
13019 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13020 else if (new_tree == target_option_default_node)
13021 restore_target_globals (&default_target_globals);
13022 else
13023 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13026 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13027 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13028 of the function, if such exists. This function may be called multiple
13029 times on a single function so use aarch64_previous_fndecl to avoid
13030 setting up identical state. */
13032 static void
13033 aarch64_set_current_function (tree fndecl)
13035 if (!fndecl || fndecl == aarch64_previous_fndecl)
13036 return;
13038 tree old_tree = (aarch64_previous_fndecl
13039 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13040 : NULL_TREE);
13042 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13044 /* If current function has no attributes but the previous one did,
13045 use the default node. */
13046 if (!new_tree && old_tree)
13047 new_tree = target_option_default_node;
13049 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13050 the default have been handled by aarch64_save_restore_target_globals from
13051 aarch64_pragma_target_parse. */
13052 if (old_tree == new_tree)
13053 return;
13055 aarch64_previous_fndecl = fndecl;
13057 /* First set the target options. */
13058 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13060 aarch64_save_restore_target_globals (new_tree);
13063 /* Enum describing the various ways we can handle attributes.
13064 In many cases we can reuse the generic option handling machinery. */
13066 enum aarch64_attr_opt_type
13068 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13069 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13070 aarch64_attr_enum, /* Attribute sets an enum variable. */
13071 aarch64_attr_custom /* Attribute requires a custom handling function. */
13074 /* All the information needed to handle a target attribute.
13075 NAME is the name of the attribute.
13076 ATTR_TYPE specifies the type of behavior of the attribute as described
13077 in the definition of enum aarch64_attr_opt_type.
13078 ALLOW_NEG is true if the attribute supports a "no-" form.
13079 HANDLER is the function that takes the attribute string as an argument.
13080 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13081 OPT_NUM is the enum specifying the option that the attribute modifies.
13082 This is needed for attributes that mirror the behavior of a command-line
13083 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13084 aarch64_attr_enum. */
13086 struct aarch64_attribute_info
13088 const char *name;
13089 enum aarch64_attr_opt_type attr_type;
13090 bool allow_neg;
13091 bool (*handler) (const char *);
13092 enum opt_code opt_num;
13095 /* Handle the ARCH_STR argument to the arch= target attribute. */
13097 static bool
13098 aarch64_handle_attr_arch (const char *str)
13100 const struct processor *tmp_arch = NULL;
13101 std::string invalid_extension;
13102 enum aarch64_parse_opt_result parse_res
13103 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13105 if (parse_res == AARCH64_PARSE_OK)
13107 gcc_assert (tmp_arch);
13108 selected_arch = tmp_arch;
13109 explicit_arch = selected_arch->arch;
13110 return true;
13113 switch (parse_res)
13115 case AARCH64_PARSE_MISSING_ARG:
13116 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13117 break;
13118 case AARCH64_PARSE_INVALID_ARG:
13119 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13120 aarch64_print_hint_for_arch (str);
13121 break;
13122 case AARCH64_PARSE_INVALID_FEATURE:
13123 error ("invalid feature modifier %s of value (\"%s\") in "
13124 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13125 aarch64_print_hint_for_extensions (invalid_extension);
13126 break;
13127 default:
13128 gcc_unreachable ();
13131 return false;
13134 /* Handle the argument CPU_STR to the cpu= target attribute. */
13136 static bool
13137 aarch64_handle_attr_cpu (const char *str)
13139 const struct processor *tmp_cpu = NULL;
13140 std::string invalid_extension;
13141 enum aarch64_parse_opt_result parse_res
13142 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13144 if (parse_res == AARCH64_PARSE_OK)
13146 gcc_assert (tmp_cpu);
13147 selected_tune = tmp_cpu;
13148 explicit_tune_core = selected_tune->ident;
13150 selected_arch = &all_architectures[tmp_cpu->arch];
13151 explicit_arch = selected_arch->arch;
13152 return true;
13155 switch (parse_res)
13157 case AARCH64_PARSE_MISSING_ARG:
13158 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13159 break;
13160 case AARCH64_PARSE_INVALID_ARG:
13161 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13162 aarch64_print_hint_for_core (str);
13163 break;
13164 case AARCH64_PARSE_INVALID_FEATURE:
13165 error ("invalid feature modifier %s of value (\"%s\") in "
13166 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13167 aarch64_print_hint_for_extensions (invalid_extension);
13168 break;
13169 default:
13170 gcc_unreachable ();
13173 return false;
13176 /* Handle the argument STR to the branch-protection= attribute. */
13178 static bool
13179 aarch64_handle_attr_branch_protection (const char* str)
13181 char *err_str = (char *) xmalloc (strlen (str) + 1);
13182 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13183 &err_str);
13184 bool success = false;
13185 switch (res)
13187 case AARCH64_PARSE_MISSING_ARG:
13188 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13189 " attribute");
13190 break;
13191 case AARCH64_PARSE_INVALID_ARG:
13192 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13193 "=\")%> pragma or attribute", err_str);
13194 break;
13195 case AARCH64_PARSE_OK:
13196 success = true;
13197 /* Fall through. */
13198 case AARCH64_PARSE_INVALID_FEATURE:
13199 break;
13200 default:
13201 gcc_unreachable ();
13203 free (err_str);
13204 return success;
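/* Illustrative usage (hypothetical function name, not part of this file):
   the handler above accepts the same strings as -mbranch-protection, e.g.

     __attribute__ ((target ("branch-protection=standard")))
     void sensitive_fn (void);

   An unrecognised protection type is reported via the error above and the
   whole attribute is rejected.  */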
13207 /* Handle the argument STR to the tune= target attribute. */
13209 static bool
13210 aarch64_handle_attr_tune (const char *str)
13212 const struct processor *tmp_tune = NULL;
13213 enum aarch64_parse_opt_result parse_res
13214 = aarch64_parse_tune (str, &tmp_tune);
13216 if (parse_res == AARCH64_PARSE_OK)
13218 gcc_assert (tmp_tune);
13219 selected_tune = tmp_tune;
13220 explicit_tune_core = selected_tune->ident;
13221 return true;
13224 switch (parse_res)
13226 case AARCH64_PARSE_INVALID_ARG:
13227 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13228 aarch64_print_hint_for_core (str);
13229 break;
13230 default:
13231 gcc_unreachable ();
13234 return false;
13237 /* Parse an architecture extensions target attribute string specified in STR.
13238 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13239 if successful. Update aarch64_isa_flags to reflect the ISA features
13240 modified. */
13242 static bool
13243 aarch64_handle_attr_isa_flags (char *str)
13245 enum aarch64_parse_opt_result parse_res;
13246 uint64_t isa_flags = aarch64_isa_flags;
13248 /* We allow "+nothing" at the beginning to clear out all architectural
13249 features if the user wants to handpick specific features. */
13250 if (strncmp ("+nothing", str, 8) == 0)
13252 isa_flags = 0;
13253 str += 8;
13256 std::string invalid_extension;
13257 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13259 if (parse_res == AARCH64_PARSE_OK)
13261 aarch64_isa_flags = isa_flags;
13262 return true;
13265 switch (parse_res)
13267 case AARCH64_PARSE_MISSING_ARG:
13268 error ("missing value in %<target()%> pragma or attribute");
13269 break;
13271 case AARCH64_PARSE_INVALID_FEATURE:
13272 error ("invalid feature modifier %s of value (\"%s\") in "
13273 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13274 break;
13276 default:
13277 gcc_unreachable ();
13280 return false;
13283 /* The target attributes that we support. On top of these we also support just
13284 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13285 handled explicitly in aarch64_process_one_target_attr. */
13287 static const struct aarch64_attribute_info aarch64_attributes[] =
13289 { "general-regs-only", aarch64_attr_mask, false, NULL,
13290 OPT_mgeneral_regs_only },
13291 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13292 OPT_mfix_cortex_a53_835769 },
13293 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13294 OPT_mfix_cortex_a53_843419 },
13295 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13296 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13297 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13298 OPT_momit_leaf_frame_pointer },
13299 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13300 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13301 OPT_march_ },
13302 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13303 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13304 OPT_mtune_ },
13305 { "branch-protection", aarch64_attr_custom, false,
13306 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13307 { "sign-return-address", aarch64_attr_enum, false, NULL,
13308 OPT_msign_return_address_ },
13309 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
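/* Illustrative usage (hypothetical code, not part of this file): entries in
   the table above correspond to attribute strings such as

     __attribute__ ((target ("arch=armv8-a+crc")))           // aarch64_handle_attr_arch
     __attribute__ ((target ("tune=cortex-a72")))            // aarch64_handle_attr_tune
     __attribute__ ((target ("no-omit-leaf-frame-pointer"))) // negated boolean entry

   Bare ISA strings such as "+crc" bypass the table and are handled directly
   in aarch64_process_one_target_attr below.  */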
13312 /* Parse ARG_STR which contains the definition of one target attribute.
13313 Show appropriate errors if any or return true if the attribute is valid. */
13315 static bool
13316 aarch64_process_one_target_attr (char *arg_str)
13318 bool invert = false;
13320 size_t len = strlen (arg_str);
13322 if (len == 0)
13324 error ("malformed %<target()%> pragma or attribute");
13325 return false;
13328 char *str_to_check = (char *) alloca (len + 1);
13329 strcpy (str_to_check, arg_str);
13331 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13332 It is easier to detect and handle it explicitly here rather than going
13333 through the machinery for the rest of the target attributes in this
13334 function. */
13335 if (*str_to_check == '+')
13336 return aarch64_handle_attr_isa_flags (str_to_check);
13338 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13340 invert = true;
13341 str_to_check += 3;
13343 char *arg = strchr (str_to_check, '=');
13345 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13346 and point ARG to "foo". */
13347 if (arg)
13349 *arg = '\0';
13350 arg++;
13352 const struct aarch64_attribute_info *p_attr;
13353 bool found = false;
13354 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13356 /* If the names don't match up, or the user has given an argument
13357 to an attribute that doesn't accept one, or didn't give an argument
13358 to an attribute that expects one, fail to match. */
13359 if (strcmp (str_to_check, p_attr->name) != 0)
13360 continue;
13362 found = true;
13363 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13364 || p_attr->attr_type == aarch64_attr_enum;
13366 if (attr_need_arg_p ^ (arg != NULL))
13368 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13369 return false;
13372 /* If the name matches but the attribute does not allow "no-" versions
13373 then we can't match. */
13374 if (invert && !p_attr->allow_neg)
13376 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13377 return false;
13380 switch (p_attr->attr_type)
13382 /* Has a custom handler registered.
13383 For example, cpu=, arch=, tune=. */
13384 case aarch64_attr_custom:
13385 gcc_assert (p_attr->handler);
13386 if (!p_attr->handler (arg))
13387 return false;
13388 break;
13390 /* Either set or unset a boolean option. */
13391 case aarch64_attr_bool:
13393 struct cl_decoded_option decoded;
13395 generate_option (p_attr->opt_num, NULL, !invert,
13396 CL_TARGET, &decoded);
13397 aarch64_handle_option (&global_options, &global_options_set,
13398 &decoded, input_location);
13399 break;
13401 /* Set or unset a bit in the target_flags. aarch64_handle_option
13402 should know what mask to apply given the option number. */
13403 case aarch64_attr_mask:
13405 struct cl_decoded_option decoded;
13406 /* We only need to specify the option number.
13407 aarch64_handle_option will know which mask to apply. */
13408 decoded.opt_index = p_attr->opt_num;
13409 decoded.value = !invert;
13410 aarch64_handle_option (&global_options, &global_options_set,
13411 &decoded, input_location);
13412 break;
13414 /* Use the option setting machinery to set an option to an enum. */
13415 case aarch64_attr_enum:
13417 gcc_assert (arg);
13418 bool valid;
13419 int value;
13420 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13421 &value, CL_TARGET);
13422 if (valid)
13424 set_option (&global_options, NULL, p_attr->opt_num, value,
13425 NULL, DK_UNSPECIFIED, input_location,
13426 global_dc);
13428 else
13430 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13432 break;
13434 default:
13435 gcc_unreachable ();
13439 /* If we reached here we either have found an attribute and validated
13440 it or didn't match any. If we matched an attribute but its arguments
13441 were malformed we will have returned false already. */
13442 return found;
13445 /* Count how many times the character C appears in
13446 NULL-terminated string STR. */
13448 static unsigned int
13449 num_occurences_in_str (char c, char *str)
13451 unsigned int res = 0;
13452 while (*str != '\0')
13454 if (*str == c)
13455 res++;
13457 str++;
13460 return res;
13463 /* Parse the tree in ARGS that contains the target attribute information
13464 and update the global target options space. */
13466 bool
13467 aarch64_process_target_attr (tree args)
13469 if (TREE_CODE (args) == TREE_LIST)
13473 tree head = TREE_VALUE (args);
13474 if (head)
13476 if (!aarch64_process_target_attr (head))
13477 return false;
13479 args = TREE_CHAIN (args);
13480 } while (args);
13482 return true;
13485 if (TREE_CODE (args) != STRING_CST)
13487 error ("attribute %<target%> argument not a string");
13488 return false;
13491 size_t len = strlen (TREE_STRING_POINTER (args));
13492 char *str_to_check = (char *) alloca (len + 1);
13493 strcpy (str_to_check, TREE_STRING_POINTER (args));
13495 if (len == 0)
13497 error ("malformed %<target()%> pragma or attribute");
13498 return false;
13501 /* Used to catch empty strings between commas, e.g.
13502 attribute ((target ("attr1,,attr2"))). */
13503 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13505 /* Handle multiple target attributes separated by ','. */
13506 char *token = strtok_r (str_to_check, ",", &str_to_check);
13508 unsigned int num_attrs = 0;
13509 while (token)
13511 num_attrs++;
13512 if (!aarch64_process_one_target_attr (token))
13514 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13515 return false;
13518 token = strtok_r (NULL, ",", &str_to_check);
13521 if (num_attrs != num_commas + 1)
13523 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13524 return false;
13527 return true;
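/* Illustrative examples (hypothetical): the loop above accepts a
   comma-separated list such as

     __attribute__ ((target ("cpu=cortex-a57,strict-align")))

   whereas an empty token, as in target ("strict-align,,cmodel=small"),
   leaves NUM_ATTRS smaller than NUM_COMMAS + 1 and the whole string is
   rejected as malformed.  */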
13530 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13531 process attribute ((target ("..."))). */
13533 static bool
13534 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13536 struct cl_target_option cur_target;
13537 bool ret;
13538 tree old_optimize;
13539 tree new_target, new_optimize;
13540 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13542 /* If what we're processing is the current pragma string then the
13543 target option node is already stored in target_option_current_node
13544 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13545 having to re-parse the string. This is especially useful to keep
13546 arm_neon.h compile times down since that header contains a lot
13547 of intrinsics enclosed in pragmas. */
13548 if (!existing_target && args == current_target_pragma)
13550 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13551 return true;
13553 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13555 old_optimize = build_optimization_node (&global_options);
13556 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13558 /* If the function changed the optimization levels as well as setting
13559 target options, start with the optimizations specified. */
13560 if (func_optimize && func_optimize != old_optimize)
13561 cl_optimization_restore (&global_options,
13562 TREE_OPTIMIZATION (func_optimize));
13564 /* Save the current target options to restore at the end. */
13565 cl_target_option_save (&cur_target, &global_options);
13567 /* If fndecl already has some target attributes applied to it, unpack
13568 them so that we add this attribute on top of them, rather than
13569 overwriting them. */
13570 if (existing_target)
13572 struct cl_target_option *existing_options
13573 = TREE_TARGET_OPTION (existing_target);
13575 if (existing_options)
13576 cl_target_option_restore (&global_options, existing_options);
13578 else
13579 cl_target_option_restore (&global_options,
13580 TREE_TARGET_OPTION (target_option_current_node));
13582 ret = aarch64_process_target_attr (args);
13584 /* Set up any additional state. */
13585 if (ret)
13587 aarch64_override_options_internal (&global_options);
13588 /* Initialize SIMD builtins if we haven't already.
13589 Set current_target_pragma to NULL for the duration so that
13590 the builtin initialization code doesn't try to tag the functions
13591 being built with the attributes specified by any current pragma, thus
13592 going into an infinite recursion. */
13593 if (TARGET_SIMD)
13595 tree saved_current_target_pragma = current_target_pragma;
13596 current_target_pragma = NULL;
13597 aarch64_init_simd_builtins ();
13598 current_target_pragma = saved_current_target_pragma;
13600 new_target = build_target_option_node (&global_options);
13602 else
13603 new_target = NULL;
13605 new_optimize = build_optimization_node (&global_options);
13607 if (fndecl && ret)
13609 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13611 if (old_optimize != new_optimize)
13612 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13615 cl_target_option_restore (&global_options, &cur_target);
13617 if (old_optimize != new_optimize)
13618 cl_optimization_restore (&global_options,
13619 TREE_OPTIMIZATION (old_optimize));
13620 return ret;
13623 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13624 tri-bool options (yes, no, don't care) and the default value is
13625 DEF, determine whether to reject inlining. */
13627 static bool
13628 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13629 int dont_care, int def)
13631 /* If the callee doesn't care, always allow inlining. */
13632 if (callee == dont_care)
13633 return true;
13635 /* If the caller doesn't care, always allow inlining. */
13636 if (caller == dont_care)
13637 return true;
13639 /* Otherwise, allow inlining if either the callee and caller values
13640 agree, or if the callee is using the default value. */
13641 return (callee == caller || callee == def);
13644 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13645 to inline CALLEE into CALLER based on target-specific info.
13646 Make sure that the caller and callee have compatible architectural
13647 features. Then go through the other possible target attributes
13648 and see if they can block inlining. Try not to reject always_inline
13649 callees unless they are incompatible architecturally. */
13651 static bool
13652 aarch64_can_inline_p (tree caller, tree callee)
13654 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13655 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13657 struct cl_target_option *caller_opts
13658 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13659 : target_option_default_node);
13661 struct cl_target_option *callee_opts
13662 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13663 : target_option_default_node);
13665 /* Callee's ISA flags should be a subset of the caller's. */
13666 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13667 != callee_opts->x_aarch64_isa_flags)
13668 return false;
13670 /* Allow inlining of non-strict-aligned functions into strict-aligned
13671 ones. */
13672 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13673 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13674 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13675 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13676 return false;
13678 bool always_inline = lookup_attribute ("always_inline",
13679 DECL_ATTRIBUTES (callee));
13681 /* If the architectural features match up and the callee is always_inline
13682 then the other attributes don't matter. */
13683 if (always_inline)
13684 return true;
13686 if (caller_opts->x_aarch64_cmodel_var
13687 != callee_opts->x_aarch64_cmodel_var)
13688 return false;
13690 if (caller_opts->x_aarch64_tls_dialect
13691 != callee_opts->x_aarch64_tls_dialect)
13692 return false;
13694 /* Honour explicit requests to workaround errata. */
13695 if (!aarch64_tribools_ok_for_inlining_p (
13696 caller_opts->x_aarch64_fix_a53_err835769,
13697 callee_opts->x_aarch64_fix_a53_err835769,
13698 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13699 return false;
13701 if (!aarch64_tribools_ok_for_inlining_p (
13702 caller_opts->x_aarch64_fix_a53_err843419,
13703 callee_opts->x_aarch64_fix_a53_err843419,
13704 2, TARGET_FIX_ERR_A53_843419))
13705 return false;
13707 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13708 caller and callee and they don't match up, reject inlining. */
13709 if (!aarch64_tribools_ok_for_inlining_p (
13710 caller_opts->x_flag_omit_leaf_frame_pointer,
13711 callee_opts->x_flag_omit_leaf_frame_pointer,
13712 2, 1))
13713 return false;
13715 /* If the callee has specific tuning overrides, respect them. */
13716 if (callee_opts->x_aarch64_override_tune_string != NULL
13717 && caller_opts->x_aarch64_override_tune_string == NULL)
13718 return false;
13720 /* If the user specified tuning override strings for the
13721 caller and callee and they don't match up, reject inlining.
13722 We just do a string compare here, we don't analyze the meaning
13723 of the string, as it would be too costly for little gain. */
13724 if (callee_opts->x_aarch64_override_tune_string
13725 && caller_opts->x_aarch64_override_tune_string
13726 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13727 caller_opts->x_aarch64_override_tune_string) != 0))
13728 return false;
13730 return true;
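/* Illustrative example (hypothetical declarations, not from this file) of
   the ISA-subset rule checked above: a callee whose ISA flags are a subset
   of the caller's can be inlined, but not the reverse.

     __attribute__ ((target ("+sve"))) void caller_sve (void);
     void callee_base (void);     // baseline ISA: may be inlined into caller_sve
     __attribute__ ((target ("+sve"))) void callee_sve (void);
                                  // rejected when the caller lacks SVE  */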
13733 /* Return true if SYMBOL_REF X binds locally. */
13735 static bool
13736 aarch64_symbol_binds_local_p (const_rtx x)
13738 return (SYMBOL_REF_DECL (x)
13739 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13740 : SYMBOL_REF_LOCAL_P (x));
13743 /* Return true if SYMBOL_REF X is thread-local.  */
13744 static bool
13745 aarch64_tls_symbol_p (rtx x)
13747 if (! TARGET_HAVE_TLS)
13748 return false;
13750 if (GET_CODE (x) != SYMBOL_REF)
13751 return false;
13753 return SYMBOL_REF_TLS_MODEL (x) != 0;
13756 /* Classify a TLS symbol into one of the TLS kinds. */
13757 enum aarch64_symbol_type
13758 aarch64_classify_tls_symbol (rtx x)
13760 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13762 switch (tls_kind)
13764 case TLS_MODEL_GLOBAL_DYNAMIC:
13765 case TLS_MODEL_LOCAL_DYNAMIC:
13766 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13768 case TLS_MODEL_INITIAL_EXEC:
13769 switch (aarch64_cmodel)
13771 case AARCH64_CMODEL_TINY:
13772 case AARCH64_CMODEL_TINY_PIC:
13773 return SYMBOL_TINY_TLSIE;
13774 default:
13775 return SYMBOL_SMALL_TLSIE;
13778 case TLS_MODEL_LOCAL_EXEC:
13779 if (aarch64_tls_size == 12)
13780 return SYMBOL_TLSLE12;
13781 else if (aarch64_tls_size == 24)
13782 return SYMBOL_TLSLE24;
13783 else if (aarch64_tls_size == 32)
13784 return SYMBOL_TLSLE32;
13785 else if (aarch64_tls_size == 48)
13786 return SYMBOL_TLSLE48;
13787 else
13788 gcc_unreachable ();
13790 case TLS_MODEL_EMULATED:
13791 case TLS_MODEL_NONE:
13792 return SYMBOL_FORCE_TO_MEM;
13794 default:
13795 gcc_unreachable ();
13799 /* Return the correct method for accessing X + OFFSET, where X is either
13800 a SYMBOL_REF or LABEL_REF. */
13802 enum aarch64_symbol_type
13803 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13805 if (GET_CODE (x) == LABEL_REF)
13807 switch (aarch64_cmodel)
13809 case AARCH64_CMODEL_LARGE:
13810 return SYMBOL_FORCE_TO_MEM;
13812 case AARCH64_CMODEL_TINY_PIC:
13813 case AARCH64_CMODEL_TINY:
13814 return SYMBOL_TINY_ABSOLUTE;
13816 case AARCH64_CMODEL_SMALL_SPIC:
13817 case AARCH64_CMODEL_SMALL_PIC:
13818 case AARCH64_CMODEL_SMALL:
13819 return SYMBOL_SMALL_ABSOLUTE;
13821 default:
13822 gcc_unreachable ();
13826 if (GET_CODE (x) == SYMBOL_REF)
13828 if (aarch64_tls_symbol_p (x))
13829 return aarch64_classify_tls_symbol (x);
13831 switch (aarch64_cmodel)
13833 case AARCH64_CMODEL_TINY:
13834 /* When we retrieve symbol + offset address, we have to make sure
13835 the offset does not cause overflow of the final address. But
13836 we have no way of knowing the address of symbol at compile time
13837 so we can't accurately say if the distance between the PC and
13838 symbol + offset is outside the addressable range of +/-1M in the
13839 TINY code model.  So we rely on images not being greater than 1M,
13840 cap the offset at 1M, and load anything beyond that using an
13841 alternative mechanism.  Furthermore, if the symbol is a weak
13842 reference to something that isn't known to resolve to a symbol
13843 in this module, then force it to memory. */
13844 if ((SYMBOL_REF_WEAK (x)
13845 && !aarch64_symbol_binds_local_p (x))
13846 || !IN_RANGE (offset, -1048575, 1048575))
13847 return SYMBOL_FORCE_TO_MEM;
13848 return SYMBOL_TINY_ABSOLUTE;
13850 case AARCH64_CMODEL_SMALL:
13851 /* Same reasoning as the tiny code model, but the offset cap here is
13852 4G. */
13853 if ((SYMBOL_REF_WEAK (x)
13854 && !aarch64_symbol_binds_local_p (x))
13855 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13856 HOST_WIDE_INT_C (4294967264)))
13857 return SYMBOL_FORCE_TO_MEM;
13858 return SYMBOL_SMALL_ABSOLUTE;
13860 case AARCH64_CMODEL_TINY_PIC:
13861 if (!aarch64_symbol_binds_local_p (x))
13862 return SYMBOL_TINY_GOT;
13863 return SYMBOL_TINY_ABSOLUTE;
13865 case AARCH64_CMODEL_SMALL_SPIC:
13866 case AARCH64_CMODEL_SMALL_PIC:
13867 if (!aarch64_symbol_binds_local_p (x))
13868 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13869 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13870 return SYMBOL_SMALL_ABSOLUTE;
13872 case AARCH64_CMODEL_LARGE:
13873 /* This is alright even in PIC code as the constant
13874 pool reference is always PC relative and within
13875 the same translation unit. */
13876 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13877 return SYMBOL_SMALL_ABSOLUTE;
13878 else
13879 return SYMBOL_FORCE_TO_MEM;
13881 default:
13882 gcc_unreachable ();
13886 /* By default push everything into the constant pool. */
13887 return SYMBOL_FORCE_TO_MEM;
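/* Worked example (illustrative): under -mcmodel=tiny, a reference such as
   "array + 0x200000" (a 2M offset) fails the +/-1M range check above and is
   classified as SYMBOL_FORCE_TO_MEM, while "array + 0x1000" remains
   SYMBOL_TINY_ABSOLUTE, assuming "array" is not a weak reference and binds
   locally.  */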
13890 bool
13891 aarch64_constant_address_p (rtx x)
13893 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13896 bool
13897 aarch64_legitimate_pic_operand_p (rtx x)
13899 if (GET_CODE (x) == SYMBOL_REF
13900 || (GET_CODE (x) == CONST
13901 && GET_CODE (XEXP (x, 0)) == PLUS
13902 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13903 return false;
13905 return true;
13908 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13909 that should be rematerialized rather than spilled. */
13911 static bool
13912 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13914 /* Support CSE and rematerialization of common constants. */
13915 if (CONST_INT_P (x)
13916 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13917 || GET_CODE (x) == CONST_VECTOR)
13918 return true;
13920 /* Do not allow vector struct mode constants for Advanced SIMD.
13921 We could support 0 and -1 easily, but they need support in
13922 aarch64-simd.md. */
13923 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13924 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13925 return false;
13927 /* Only accept variable-length vector constants if they can be
13928 handled directly.
13930 ??? It would be possible to handle rematerialization of other
13931 constants via secondary reloads. */
13932 if (vec_flags & VEC_ANY_SVE)
13933 return aarch64_simd_valid_immediate (x, NULL);
13935 if (GET_CODE (x) == HIGH)
13936 x = XEXP (x, 0);
13938 /* Accept polynomial constants that can be calculated by using the
13939 destination of a move as the sole temporary. Constants that
13940 require a second temporary cannot be rematerialized (they can't be
13941 forced to memory and also aren't legitimate constants). */
13942 poly_int64 offset;
13943 if (poly_int_rtx_p (x, &offset))
13944 return aarch64_offset_temporaries (false, offset) <= 1;
13946 /* If an offset is being added to something else, we need to allow the
13947 base to be moved into the destination register, meaning that there
13948 are no free temporaries for the offset. */
13949 x = strip_offset (x, &offset);
13950 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13951 return false;
13953 /* Do not allow const (plus (anchor_symbol, const_int)). */
13954 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13955 return false;
13957 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13958 so spilling them is better than rematerialization. */
13959 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13960 return true;
13962 /* Label references are always constant. */
13963 if (GET_CODE (x) == LABEL_REF)
13964 return true;
13966 return false;
13969 static rtx
13970 aarch64_load_tp (rtx target)
13972 if (!target
13973 || GET_MODE (target) != Pmode
13974 || !register_operand (target, Pmode))
13975 target = gen_reg_rtx (Pmode);
13977 /* Can return in any reg. */
13978 emit_insn (gen_aarch64_load_tp_hard (target));
13979 return target;
13982 /* On AAPCS systems, this is the "struct __va_list". */
13983 static GTY(()) tree va_list_type;
13985 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13986 Return the type to use as __builtin_va_list.
13988 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13990 struct __va_list
13992 void *__stack;
13993 void *__gr_top;
13994 void *__vr_top;
13995 int __gr_offs;
13996 int __vr_offs;
13997 }; */
13999 static tree
14000 aarch64_build_builtin_va_list (void)
14002 tree va_list_name;
14003 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14005 /* Create the type. */
14006 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14007 /* Give it the required name. */
14008 va_list_name = build_decl (BUILTINS_LOCATION,
14009 TYPE_DECL,
14010 get_identifier ("__va_list"),
14011 va_list_type);
14012 DECL_ARTIFICIAL (va_list_name) = 1;
14013 TYPE_NAME (va_list_type) = va_list_name;
14014 TYPE_STUB_DECL (va_list_type) = va_list_name;
14016 /* Create the fields. */
14017 f_stack = build_decl (BUILTINS_LOCATION,
14018 FIELD_DECL, get_identifier ("__stack"),
14019 ptr_type_node);
14020 f_grtop = build_decl (BUILTINS_LOCATION,
14021 FIELD_DECL, get_identifier ("__gr_top"),
14022 ptr_type_node);
14023 f_vrtop = build_decl (BUILTINS_LOCATION,
14024 FIELD_DECL, get_identifier ("__vr_top"),
14025 ptr_type_node);
14026 f_groff = build_decl (BUILTINS_LOCATION,
14027 FIELD_DECL, get_identifier ("__gr_offs"),
14028 integer_type_node);
14029 f_vroff = build_decl (BUILTINS_LOCATION,
14030 FIELD_DECL, get_identifier ("__vr_offs"),
14031 integer_type_node);
14033 /* Tell the tree-stdarg pass about our internal offset fields.
14034 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14035 purposes, to identify whether the code is updating the va_list internal
14036 offset fields in an irregular way. */
14037 va_list_gpr_counter_field = f_groff;
14038 va_list_fpr_counter_field = f_vroff;
14040 DECL_ARTIFICIAL (f_stack) = 1;
14041 DECL_ARTIFICIAL (f_grtop) = 1;
14042 DECL_ARTIFICIAL (f_vrtop) = 1;
14043 DECL_ARTIFICIAL (f_groff) = 1;
14044 DECL_ARTIFICIAL (f_vroff) = 1;
14046 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14047 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14048 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14049 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14050 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14052 TYPE_FIELDS (va_list_type) = f_stack;
14053 DECL_CHAIN (f_stack) = f_grtop;
14054 DECL_CHAIN (f_grtop) = f_vrtop;
14055 DECL_CHAIN (f_vrtop) = f_groff;
14056 DECL_CHAIN (f_groff) = f_vroff;
14058 /* Compute its layout. */
14059 layout_type (va_list_type);
14061 return va_list_type;
14064 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14065 static void
14066 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14068 const CUMULATIVE_ARGS *cum;
14069 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14070 tree stack, grtop, vrtop, groff, vroff;
14071 tree t;
14072 int gr_save_area_size = cfun->va_list_gpr_size;
14073 int vr_save_area_size = cfun->va_list_fpr_size;
14074 int vr_offset;
14076 cum = &crtl->args.info;
14077 if (cfun->va_list_gpr_size)
14078 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14079 cfun->va_list_gpr_size);
14080 if (cfun->va_list_fpr_size)
14081 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14082 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14084 if (!TARGET_FLOAT)
14086 gcc_assert (cum->aapcs_nvrn == 0);
14087 vr_save_area_size = 0;
14090 f_stack = TYPE_FIELDS (va_list_type_node);
14091 f_grtop = DECL_CHAIN (f_stack);
14092 f_vrtop = DECL_CHAIN (f_grtop);
14093 f_groff = DECL_CHAIN (f_vrtop);
14094 f_vroff = DECL_CHAIN (f_groff);
14096 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14097 NULL_TREE);
14098 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14099 NULL_TREE);
14100 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14101 NULL_TREE);
14102 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14103 NULL_TREE);
14104 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14105 NULL_TREE);
14107 /* Emit code to initialize STACK, which points to the next varargs stack
14108 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14109 by named arguments. STACK is 8-byte aligned. */
14110 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14111 if (cum->aapcs_stack_size > 0)
14112 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14113 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14114 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14116 /* Emit code to initialize GRTOP, the top of the GR save area.
14117 virtual_incoming_args_rtx should have been 16 byte aligned. */
14118 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14119 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14120 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14122 /* Emit code to initialize VRTOP, the top of the VR save area.
14123 This address is gr_save_area_size bytes below GRTOP, rounded
14124 down to the next 16-byte boundary. */
14125 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14126 vr_offset = ROUND_UP (gr_save_area_size,
14127 STACK_BOUNDARY / BITS_PER_UNIT);
14129 if (vr_offset)
14130 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14131 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14132 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14134 /* Emit code to initialize GROFF, the offset from GRTOP of the
14135 next GPR argument. */
14136 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14137 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14138 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14140 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14141 of the next VR argument. */
14142 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14143 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14144 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
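/* Worked example (illustrative, assuming the tree-stdarg pass has not
   limited the save-area sizes): for "void f (int n, ...)" the only named
   argument occupies x0, so aapcs_ncrn == 1 and aapcs_nvrn == 0.  The code
   above then sets

     gr_save_area_size = (8 - 1) * 8 = 56 bytes
     vr_save_area_size = 8 * 16      = 128 bytes
     __stack  = incoming argument pointer
     __gr_top = incoming argument pointer
     __vr_top = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128.  */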
14147 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14149 static tree
14150 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14151 gimple_seq *post_p ATTRIBUTE_UNUSED)
14153 tree addr;
14154 bool indirect_p;
14155 bool is_ha; /* is HFA or HVA. */
14156 bool dw_align; /* double-word align. */
14157 machine_mode ag_mode = VOIDmode;
14158 int nregs;
14159 machine_mode mode;
14161 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14162 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14163 HOST_WIDE_INT size, rsize, adjust, align;
14164 tree t, u, cond1, cond2;
14166 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14167 if (indirect_p)
14168 type = build_pointer_type (type);
14170 mode = TYPE_MODE (type);
14172 f_stack = TYPE_FIELDS (va_list_type_node);
14173 f_grtop = DECL_CHAIN (f_stack);
14174 f_vrtop = DECL_CHAIN (f_grtop);
14175 f_groff = DECL_CHAIN (f_vrtop);
14176 f_vroff = DECL_CHAIN (f_groff);
14178 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14179 f_stack, NULL_TREE);
14180 size = int_size_in_bytes (type);
14182 bool abi_break;
14183 align
14184 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14186 dw_align = false;
14187 adjust = 0;
14188 if (aarch64_vfp_is_call_or_return_candidate (mode,
14189 type,
14190 &ag_mode,
14191 &nregs,
14192 &is_ha))
14194 /* No frontends can create types with variable-sized modes, so we
14195 shouldn't be asked to pass or return them. */
14196 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14198 /* TYPE passed in fp/simd registers. */
14199 if (!TARGET_FLOAT)
14200 aarch64_err_no_fpadvsimd (mode);
14202 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14203 unshare_expr (valist), f_vrtop, NULL_TREE);
14204 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14205 unshare_expr (valist), f_vroff, NULL_TREE);
14207 rsize = nregs * UNITS_PER_VREG;
14209 if (is_ha)
14211 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14212 adjust = UNITS_PER_VREG - ag_size;
14214 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14215 && size < UNITS_PER_VREG)
14217 adjust = UNITS_PER_VREG - size;
14220 else
14222 /* TYPE passed in general registers. */
14223 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14224 unshare_expr (valist), f_grtop, NULL_TREE);
14225 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14226 unshare_expr (valist), f_groff, NULL_TREE);
14227 rsize = ROUND_UP (size, UNITS_PER_WORD);
14228 nregs = rsize / UNITS_PER_WORD;
14230 if (align > 8)
14232 if (abi_break && warn_psabi)
14233 inform (input_location, "parameter passing for argument of type "
14234 "%qT changed in GCC 9.1", type);
14235 dw_align = true;
14238 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14239 && size < UNITS_PER_WORD)
14241 adjust = UNITS_PER_WORD - size;
14245 /* Get a local temporary for the field value. */
14246 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14248 /* Emit code to branch if off >= 0. */
14249 t = build2 (GE_EXPR, boolean_type_node, off,
14250 build_int_cst (TREE_TYPE (off), 0));
14251 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14253 if (dw_align)
14255 /* Emit: offs = (offs + 15) & -16. */
14256 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14257 build_int_cst (TREE_TYPE (off), 15));
14258 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14259 build_int_cst (TREE_TYPE (off), -16));
14260 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14262 else
14263 roundup = NULL;
14265 /* Update ap.__[g|v]r_offs */
14266 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14267 build_int_cst (TREE_TYPE (off), rsize));
14268 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14270 /* String up. */
14271 if (roundup)
14272 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14274 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14275 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14276 build_int_cst (TREE_TYPE (f_off), 0));
14277 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14279 /* String up: make sure the assignment happens before the use. */
14280 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14281 COND_EXPR_ELSE (cond1) = t;
14283 /* Prepare the trees handling the argument that is passed on the stack;
14284 the top-level node is stored in ON_STACK. */
14285 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14286 if (align > 8)
14288 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14289 t = fold_build_pointer_plus_hwi (arg, 15);
14290 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14291 build_int_cst (TREE_TYPE (t), -16));
14292 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14294 else
14295 roundup = NULL;
14296 /* Advance ap.__stack */
14297 t = fold_build_pointer_plus_hwi (arg, size + 7);
14298 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14299 build_int_cst (TREE_TYPE (t), -8));
14300 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14301 /* String up roundup and advance. */
14302 if (roundup)
14303 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14304 /* String up with arg */
14305 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14306 /* Big-endianness related address adjustment. */
14307 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14308 && size < UNITS_PER_WORD)
14310 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14311 size_int (UNITS_PER_WORD - size));
14312 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14315 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14316 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14318 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14319 t = off;
14320 if (adjust)
14321 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14322 build_int_cst (TREE_TYPE (off), adjust));
14324 t = fold_convert (sizetype, t);
14325 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14327 if (is_ha)
14329 /* type ha; // treat as "struct {ftype field[n];}"
14330 ... [computing offs]
14331 for (i = 0; i < nregs; ++i, offs += 16)
14332 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14333 return ha; */
14334 int i;
14335 tree tmp_ha, field_t, field_ptr_t;
14337 /* Declare a local variable. */
14338 tmp_ha = create_tmp_var_raw (type, "ha");
14339 gimple_add_tmp_var (tmp_ha);
14341 /* Establish the base type. */
14342 switch (ag_mode)
14344 case E_SFmode:
14345 field_t = float_type_node;
14346 field_ptr_t = float_ptr_type_node;
14347 break;
14348 case E_DFmode:
14349 field_t = double_type_node;
14350 field_ptr_t = double_ptr_type_node;
14351 break;
14352 case E_TFmode:
14353 field_t = long_double_type_node;
14354 field_ptr_t = long_double_ptr_type_node;
14355 break;
14356 case E_HFmode:
14357 field_t = aarch64_fp16_type_node;
14358 field_ptr_t = aarch64_fp16_ptr_type_node;
14359 break;
14360 case E_V2SImode:
14361 case E_V4SImode:
14363 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14364 field_t = build_vector_type_for_mode (innertype, ag_mode);
14365 field_ptr_t = build_pointer_type (field_t);
14367 break;
14368 default:
14369 gcc_assert (0);
14372 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14373 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14374 addr = t;
14375 t = fold_convert (field_ptr_t, addr);
14376 t = build2 (MODIFY_EXPR, field_t,
14377 build1 (INDIRECT_REF, field_t, tmp_ha),
14378 build1 (INDIRECT_REF, field_t, t));
14380 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14381 for (i = 1; i < nregs; ++i)
14383 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14384 u = fold_convert (field_ptr_t, addr);
14385 u = build2 (MODIFY_EXPR, field_t,
14386 build2 (MEM_REF, field_t, tmp_ha,
14387 build_int_cst (field_ptr_t,
14388 (i *
14389 int_size_in_bytes (field_t)))),
14390 build1 (INDIRECT_REF, field_t, u));
14391 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14394 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14395 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14398 COND_EXPR_ELSE (cond2) = t;
14399 addr = fold_convert (build_pointer_type (type), cond1);
14400 addr = build_va_arg_indirect_ref (addr);
14402 if (indirect_p)
14403 addr = build_va_arg_indirect_ref (addr);
14405 return addr;
14408 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14410 static void
14411 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14412 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14413 int no_rtl)
14415 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14416 CUMULATIVE_ARGS local_cum;
14417 int gr_saved = cfun->va_list_gpr_size;
14418 int vr_saved = cfun->va_list_fpr_size;
14420 /* The caller has advanced CUM up to, but not beyond, the last named
14421 argument. Advance a local copy of CUM past the last "real" named
14422 argument, to find out how many registers are left over. */
14423 local_cum = *cum;
14424 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
14426 /* Find out how many registers we need to save.
14427 Honor the tree-stdarg analysis results. */
14428 if (cfun->va_list_gpr_size)
14429 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14430 cfun->va_list_gpr_size / UNITS_PER_WORD);
14431 if (cfun->va_list_fpr_size)
14432 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14433 cfun->va_list_fpr_size / UNITS_PER_VREG);
14435 if (!TARGET_FLOAT)
14437 gcc_assert (local_cum.aapcs_nvrn == 0);
14438 vr_saved = 0;
14441 if (!no_rtl)
14443 if (gr_saved > 0)
14445 rtx ptr, mem;
14447 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14448 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14449 - gr_saved * UNITS_PER_WORD);
14450 mem = gen_frame_mem (BLKmode, ptr);
14451 set_mem_alias_set (mem, get_varargs_alias_set ());
14453 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14454 mem, gr_saved);
14456 if (vr_saved > 0)
14458 /* We can't use move_block_from_reg, because it will use
14459 the wrong mode, storing D regs only. */
14460 machine_mode mode = TImode;
14461 int off, i, vr_start;
14463 /* Set OFF to the offset from virtual_incoming_args_rtx of
14464 the first vector register. The VR save area lies below
14465 the GR one, and is aligned to 16 bytes. */
14466 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14467 STACK_BOUNDARY / BITS_PER_UNIT);
14468 off -= vr_saved * UNITS_PER_VREG;
14470 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14471 for (i = 0; i < vr_saved; ++i)
14473 rtx ptr, mem;
14475 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14476 mem = gen_frame_mem (mode, ptr);
14477 set_mem_alias_set (mem, get_varargs_alias_set ());
14478 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14479 off += UNITS_PER_VREG;
14484 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14485 any complication of having crtl->args.pretend_args_size changed. */
14486 cfun->machine->frame.saved_varargs_size
14487 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14488 STACK_BOUNDARY / BITS_PER_UNIT)
14489 + vr_saved * UNITS_PER_VREG);
14492 static void
14493 aarch64_conditional_register_usage (void)
14495 int i;
14496 if (!TARGET_FLOAT)
14498 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14500 fixed_regs[i] = 1;
14501 call_used_regs[i] = 1;
14504 if (!TARGET_SVE)
14505 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14507 fixed_regs[i] = 1;
14508 call_used_regs[i] = 1;
14511 /* When tracking speculation, we need a couple of call-clobbered registers
14512 to track the speculation state. It would be nice to just use
14513 IP0 and IP1, but currently there are numerous places that just
14514 assume these registers are free for other uses (eg pointer
14515 authentication). */
14516 if (aarch64_track_speculation)
14518 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14519 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14520 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14521 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14525 /* Walk down the type tree of TYPE counting consecutive base elements.
14526 If *MODEP is VOIDmode, then set it to the first valid floating point
14527 type. If a non-floating point type is found, or if a floating point
14528 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14529 otherwise return the count in the sub-tree. */
14530 static int
14531 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14533 machine_mode mode;
14534 HOST_WIDE_INT size;
14536 switch (TREE_CODE (type))
14538 case REAL_TYPE:
14539 mode = TYPE_MODE (type);
14540 if (mode != DFmode && mode != SFmode
14541 && mode != TFmode && mode != HFmode)
14542 return -1;
14544 if (*modep == VOIDmode)
14545 *modep = mode;
14547 if (*modep == mode)
14548 return 1;
14550 break;
14552 case COMPLEX_TYPE:
14553 mode = TYPE_MODE (TREE_TYPE (type));
14554 if (mode != DFmode && mode != SFmode
14555 && mode != TFmode && mode != HFmode)
14556 return -1;
14558 if (*modep == VOIDmode)
14559 *modep = mode;
14561 if (*modep == mode)
14562 return 2;
14564 break;
14566 case VECTOR_TYPE:
14567 /* Use V2SImode and V4SImode as representatives of all 64-bit
14568 and 128-bit vector types. */
14569 size = int_size_in_bytes (type);
14570 switch (size)
14572 case 8:
14573 mode = V2SImode;
14574 break;
14575 case 16:
14576 mode = V4SImode;
14577 break;
14578 default:
14579 return -1;
14582 if (*modep == VOIDmode)
14583 *modep = mode;
14585 /* Vector modes are considered to be opaque: two vectors are
14586 equivalent for the purposes of being homogeneous aggregates
14587 if they are the same size. */
14588 if (*modep == mode)
14589 return 1;
14591 break;
14593 case ARRAY_TYPE:
14595 int count;
14596 tree index = TYPE_DOMAIN (type);
14598 /* Can't handle incomplete types nor sizes that are not
14599 fixed. */
14600 if (!COMPLETE_TYPE_P (type)
14601 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14602 return -1;
14604 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14605 if (count == -1
14606 || !index
14607 || !TYPE_MAX_VALUE (index)
14608 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14609 || !TYPE_MIN_VALUE (index)
14610 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14611 || count < 0)
14612 return -1;
14614 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14615 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14617 /* There must be no padding. */
14618 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14619 count * GET_MODE_BITSIZE (*modep)))
14620 return -1;
14622 return count;
14625 case RECORD_TYPE:
14627 int count = 0;
14628 int sub_count;
14629 tree field;
14631 /* Can't handle incomplete types nor sizes that are not
14632 fixed. */
14633 if (!COMPLETE_TYPE_P (type)
14634 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14635 return -1;
14637 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14639 if (TREE_CODE (field) != FIELD_DECL)
14640 continue;
14642 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14643 if (sub_count < 0)
14644 return -1;
14645 count += sub_count;
14648 /* There must be no padding. */
14649 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14650 count * GET_MODE_BITSIZE (*modep)))
14651 return -1;
14653 return count;
14656 case UNION_TYPE:
14657 case QUAL_UNION_TYPE:
14659 /* These aren't very interesting except in a degenerate case. */
14660 int count = 0;
14661 int sub_count;
14662 tree field;
14664 /* Can't handle incomplete types nor sizes that are not
14665 fixed. */
14666 if (!COMPLETE_TYPE_P (type)
14667 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14668 return -1;
14670 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14672 if (TREE_CODE (field) != FIELD_DECL)
14673 continue;
14675 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14676 if (sub_count < 0)
14677 return -1;
14678 count = count > sub_count ? count : sub_count;
14681 /* There must be no padding. */
14682 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14683 count * GET_MODE_BITSIZE (*modep)))
14684 return -1;
14686 return count;
14689 default:
14690 break;
14693 return -1;
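/* Illustrative examples (hypothetical types, not from this file) of what
   the walk above computes:

     struct hfa { double x, y, z; };     // count 3, *modep == DFmode  (an HFA)
     struct hva { int32x4_t a, b; };     // two 128-bit vectors: count 2,
                                         // *modep == V4SImode  (an HVA)
     struct mix { float f; double d; };  // element modes differ: returns -1

   (int32x4_t here stands for any 128-bit vector type, e.g. from
   arm_neon.h.)  */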
14696 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14697 type as described in AAPCS64 \S 4.1.2.
14699 See the comment above aarch64_composite_type_p for the notes on MODE. */
14701 static bool
14702 aarch64_short_vector_p (const_tree type,
14703 machine_mode mode)
14705 poly_int64 size = -1;
14707 if (type && TREE_CODE (type) == VECTOR_TYPE)
14708 size = int_size_in_bytes (type);
14709 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14710 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14711 size = GET_MODE_SIZE (mode);
14713 return known_eq (size, 8) || known_eq (size, 16);
14716 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14717 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14718 array types. The C99 floating-point complex types are also considered
14719 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14720 types, which are GCC extensions and out of the scope of AAPCS64, are
14721 treated as composite types here as well.
14723 Note that MODE itself is not sufficient in determining whether a type
14724 is such a composite type or not. This is because
14725 stor-layout.c:compute_record_mode may have already changed the MODE
14726 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14727 structure with only one field may have its MODE set to the mode of the
14728 field. Also an integer mode whose size matches the size of the
14729 RECORD_TYPE type may be used to substitute the original mode
14730 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14731 solely relied on. */
14733 static bool
14734 aarch64_composite_type_p (const_tree type,
14735 machine_mode mode)
14737 if (aarch64_short_vector_p (type, mode))
14738 return false;
14740 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14741 return true;
14743 if (mode == BLKmode
14744 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14745 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14746 return true;
14748 return false;
14751 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14752 shall be passed or returned in simd/fp register(s) (providing these
14753 parameter passing registers are available).
14755 Upon successful return, *COUNT returns the number of needed registers,
14756 *BASE_MODE returns the mode of the individual register and when IS_HAF
14757 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14758 floating-point aggregate or a homogeneous short-vector aggregate. */
14760 static bool
14761 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14762 const_tree type,
14763 machine_mode *base_mode,
14764 int *count,
14765 bool *is_ha)
14767 machine_mode new_mode = VOIDmode;
14768 bool composite_p = aarch64_composite_type_p (type, mode);
14770 if (is_ha != NULL) *is_ha = false;
14772 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14773 || aarch64_short_vector_p (type, mode))
14775 *count = 1;
14776 new_mode = mode;
14778 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14780 if (is_ha != NULL) *is_ha = true;
14781 *count = 2;
14782 new_mode = GET_MODE_INNER (mode);
14784 else if (type && composite_p)
14786 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14788 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14790 if (is_ha != NULL) *is_ha = true;
14791 *count = ag_count;
14793 else
14794 return false;
14796 else
14797 return false;
14799 *base_mode = new_mode;
14800 return true;
14803 /* Implement TARGET_STRUCT_VALUE_RTX. */
14805 static rtx
14806 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14807 int incoming ATTRIBUTE_UNUSED)
14809 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14812 /* Implements target hook vector_mode_supported_p. */
14813 static bool
14814 aarch64_vector_mode_supported_p (machine_mode mode)
14816 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14817 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14820 /* Return the full-width SVE vector mode for element mode MODE, if one
14821 exists. */
14822 opt_machine_mode
14823 aarch64_full_sve_mode (scalar_mode mode)
14825 switch (mode)
14827 case E_DFmode:
14828 return VNx2DFmode;
14829 case E_SFmode:
14830 return VNx4SFmode;
14831 case E_HFmode:
14832 return VNx8HFmode;
14833 case E_DImode:
14834 return VNx2DImode;
14835 case E_SImode:
14836 return VNx4SImode;
14837 case E_HImode:
14838 return VNx8HImode;
14839 case E_QImode:
14840 return VNx16QImode;
14841 default:
14842 return opt_machine_mode ();
14846 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14847 if it exists. */
14848 opt_machine_mode
14849 aarch64_vq_mode (scalar_mode mode)
14851 switch (mode)
14853 case E_DFmode:
14854 return V2DFmode;
14855 case E_SFmode:
14856 return V4SFmode;
14857 case E_HFmode:
14858 return V8HFmode;
14859 case E_SImode:
14860 return V4SImode;
14861 case E_HImode:
14862 return V8HImode;
14863 case E_QImode:
14864 return V16QImode;
14865 case E_DImode:
14866 return V2DImode;
14867 default:
14868 return opt_machine_mode ();
14872 /* Return appropriate SIMD container
14873 for MODE within a vector of WIDTH bits. */
14874 static machine_mode
14875 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14877 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14878 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14880 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14881 if (TARGET_SIMD)
14883 if (known_eq (width, 128))
14884 return aarch64_vq_mode (mode).else_mode (word_mode);
14885 else
14886 switch (mode)
14888 case E_SFmode:
14889 return V2SFmode;
14890 case E_HFmode:
14891 return V4HFmode;
14892 case E_SImode:
14893 return V2SImode;
14894 case E_HImode:
14895 return V4HImode;
14896 case E_QImode:
14897 return V8QImode;
14898 default:
14899 break;
14902 return word_mode;
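/* Illustrative mappings implied by the code above: with TARGET_SVE and
   WIDTH == BITS_PER_SVE_VECTOR, SFmode maps to VNx4SFmode; with
   WIDTH == 128, SFmode -> V4SFmode and HImode -> V8HImode via
   aarch64_vq_mode; with WIDTH == 64, SFmode -> V2SFmode and
   QImode -> V8QImode.  Anything unhandled falls back to word_mode.  */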
14905 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14906 static machine_mode
14907 aarch64_preferred_simd_mode (scalar_mode mode)
14909 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14910 return aarch64_simd_container_mode (mode, bits);
14913 /* Return a list of possible vector sizes for the vectorizer
14914 to iterate over. */
14915 static void
14916 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14918 if (TARGET_SVE)
14919 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14920 sizes->safe_push (16);
14921 sizes->safe_push (8);
14924 /* Implement TARGET_MANGLE_TYPE. */
14926 static const char *
14927 aarch64_mangle_type (const_tree type)
14929 /* The AArch64 ABI documents say that "__va_list" has to be
14930 mangled as if it is in the "std" namespace. */
14931 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14932 return "St9__va_list";
14934 /* Half-precision float. */
14935 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14936 return "Dh";
14938 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14939 builtin types. */
14940 if (TYPE_NAME (type) != NULL)
14941 return aarch64_mangle_builtin_type (type);
14943 /* Use the default mangling. */
14944 return NULL;
14947 /* Find the first rtx_insn before insn that will generate an assembly
14948 instruction. */
14950 static rtx_insn *
14951 aarch64_prev_real_insn (rtx_insn *insn)
14953 if (!insn)
14954 return NULL;
14958 insn = prev_real_insn (insn);
14960 while (insn && recog_memoized (insn) < 0);
14962 return insn;
14965 static bool
14966 is_madd_op (enum attr_type t1)
14968 unsigned int i;
14969 /* A number of these may be AArch32 only. */
14970 enum attr_type mlatypes[] = {
14971 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14972 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14973 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14976 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14978 if (t1 == mlatypes[i])
14979 return true;
14982 return false;
14985 /* Check if there is a register dependency between a load and the insn
14986 for which we hold recog_data. */
14988 static bool
14989 dep_between_memop_and_curr (rtx memop)
14991 rtx load_reg;
14992 int opno;
14994 gcc_assert (GET_CODE (memop) == SET);
14996 if (!REG_P (SET_DEST (memop)))
14997 return false;
14999 load_reg = SET_DEST (memop);
15000 for (opno = 1; opno < recog_data.n_operands; opno++)
15002 rtx operand = recog_data.operand[opno];
15003 if (REG_P (operand)
15004 && reg_overlap_mentioned_p (load_reg, operand))
15005 return true;
15008 return false;
15012 /* When working around the Cortex-A53 erratum 835769,
15013 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15014 instruction and has a preceding memory instruction such that a NOP
15015 should be inserted between them. */
15017 bool
15018 aarch64_madd_needs_nop (rtx_insn* insn)
15020 enum attr_type attr_type;
15021 rtx_insn *prev;
15022 rtx body;
15024 if (!TARGET_FIX_ERR_A53_835769)
15025 return false;
15027 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15028 return false;
15030 attr_type = get_attr_type (insn);
15031 if (!is_madd_op (attr_type))
15032 return false;
15034 prev = aarch64_prev_real_insn (insn);
15035 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15036 Restore recog state to INSN to avoid state corruption. */
15037 extract_constrain_insn_cached (insn);
15039 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15040 return false;
15042 body = single_set (prev);
15044 /* If the previous insn is a memory op and there is no dependency between
15045 it and the DImode madd, emit a NOP between them. If body is NULL then we
15046 have a complex memory operation, probably a load/store pair.
15047 Be conservative for now and emit a NOP. */
15048 if (GET_MODE (recog_data.operand[0]) == DImode
15049 && (!body || !dep_between_memop_and_curr (body)))
15050 return true;
15052 return false;
15057 /* Implement FINAL_PRESCAN_INSN. */
15059 void
15060 aarch64_final_prescan_insn (rtx_insn *insn)
15062 if (aarch64_madd_needs_nop (insn))
15063 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
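/* Illustrative sketch of the workaround (assumed example):
     ldr  x1, [x2]
     madd x0, x3, x4, x5
   is emitted as
     ldr  x1, [x2]
     nop  // between mem op and mult-accumulate
     madd x0, x3, x4, x5
   because the multiply-accumulate is 64-bit and has no register dependency
   on the preceding memory operation, which is the condition checked above.  */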
15067 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15068 instruction. */
15070 bool
15071 aarch64_sve_index_immediate_p (rtx base_or_step)
15073 return (CONST_INT_P (base_or_step)
15074 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15077 /* Return true if X is a valid immediate for the SVE ADD and SUB
15078 instructions. Negate X first if NEGATE_P is true. */
15080 bool
15081 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15083 rtx elt;
15085 if (!const_vec_duplicate_p (x, &elt)
15086 || !CONST_INT_P (elt))
15087 return false;
15089 HOST_WIDE_INT val = INTVAL (elt);
15090 if (negate_p)
15091 val = -val;
15092 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15094 if (val & 0xff)
15095 return IN_RANGE (val, 0, 0xff);
15096 return IN_RANGE (val, 0, 0xff00);
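/* Worked examples (illustrative): a duplicate of 3 is accepted (plain 8-bit
   immediate); a duplicate of 0x1100 has a zero low byte and equals 0x11 << 8,
   so it is accepted via the shifted form; a duplicate of 0x101 is rejected
   because its low byte is nonzero and the value does not fit in 8 bits.  */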
15099 /* Return true if X is a valid immediate operand for an SVE logical
15100 instruction such as AND. */
15102 bool
15103 aarch64_sve_bitmask_immediate_p (rtx x)
15105 rtx elt;
15107 return (const_vec_duplicate_p (x, &elt)
15108 && CONST_INT_P (elt)
15109 && aarch64_bitmask_imm (INTVAL (elt),
15110 GET_MODE_INNER (GET_MODE (x))));
15113 /* Return true if X is a valid immediate for the SVE DUP and CPY
15114 instructions. */
15116 bool
15117 aarch64_sve_dup_immediate_p (rtx x)
15119 rtx elt;
15121 if (!const_vec_duplicate_p (x, &elt)
15122 || !CONST_INT_P (elt))
15123 return false;
15125 HOST_WIDE_INT val = INTVAL (elt);
15126 if (val & 0xff)
15127 return IN_RANGE (val, -0x80, 0x7f);
15128 return IN_RANGE (val, -0x8000, 0x7f00);
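/* Worked examples (illustrative): -128 and 127 are accepted directly;
   0x7f00 has a zero low byte and is accepted as 0x7f shifted left by 8;
   0x180 is rejected because its low byte is nonzero and the value lies
   outside [-0x80, 0x7f].  */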
15131 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15132 SIGNED_P says whether the operand is signed rather than unsigned. */
15134 bool
15135 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15137 rtx elt;
15139 return (const_vec_duplicate_p (x, &elt)
15140 && CONST_INT_P (elt)
15141 && (signed_p
15142 ? IN_RANGE (INTVAL (elt), -16, 15)
15143 : IN_RANGE (INTVAL (elt), 0, 127)));
15146 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15147 instruction. Negate X first if NEGATE_P is true. */
15149 bool
15150 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15152 rtx elt;
15153 REAL_VALUE_TYPE r;
15155 if (!const_vec_duplicate_p (x, &elt)
15156 || GET_CODE (elt) != CONST_DOUBLE)
15157 return false;
15159 r = *CONST_DOUBLE_REAL_VALUE (elt);
15161 if (negate_p)
15162 r = real_value_negate (&r);
15164 if (real_equal (&r, &dconst1))
15165 return true;
15166 if (real_equal (&r, &dconsthalf))
15167 return true;
15168 return false;
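/* Illustrative note: only +0.5 and +1.0 are accepted here, matching the
   limited immediate range of the SVE FADD/FSUB forms; with NEGATE_P the
   caller can also accept -0.5 and -1.0, since they negate to the valid
   values.  */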
15171 /* Return true if X is a valid immediate operand for an SVE FMUL
15172 instruction. */
15174 bool
15175 aarch64_sve_float_mul_immediate_p (rtx x)
15177 rtx elt;
15179 /* GCC will never generate a multiply with an immediate of 2, so there is no
15180 point testing for it (even though it is a valid constant). */
15181 return (const_vec_duplicate_p (x, &elt)
15182 && GET_CODE (elt) == CONST_DOUBLE
15183 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
15186 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15187 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15188 is nonnull, use it to describe valid immediates. */
15189 static bool
15190 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15191 simd_immediate_info *info,
15192 enum simd_immediate_check which,
15193 simd_immediate_info::insn_type insn)
15195 /* Try a 4-byte immediate with LSL. */
15196 for (unsigned int shift = 0; shift < 32; shift += 8)
15197 if ((val32 & (0xff << shift)) == val32)
15199 if (info)
15200 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15201 simd_immediate_info::LSL, shift);
15202 return true;
15205 /* Try a 2-byte immediate with LSL. */
15206 unsigned int imm16 = val32 & 0xffff;
15207 if (imm16 == (val32 >> 16))
15208 for (unsigned int shift = 0; shift < 16; shift += 8)
15209 if ((imm16 & (0xff << shift)) == imm16)
15211 if (info)
15212 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15213 simd_immediate_info::LSL, shift);
15214 return true;
15217 /* Try a 4-byte immediate with MSL, except for cases that MVN
15218 can handle. */
15219 if (which == AARCH64_CHECK_MOV)
15220 for (unsigned int shift = 8; shift < 24; shift += 8)
15222 unsigned int low = (1 << shift) - 1;
15223 if (((val32 & (0xff << shift)) | low) == val32)
15225 if (info)
15226 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15227 simd_immediate_info::MSL, shift);
15228 return true;
15232 return false;
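/* Worked examples (illustrative): VAL32 == 0x00ab0000 matches the 4-byte
   form as 0xab with LSL #16; VAL32 == 0x00c300c3 has equal 16-bit halves
   and matches the 2-byte form as 0xc3 with LSL #0; VAL32 == 0x0012ffff
   matches the MSL form as 0x12 with MSL #16 when WHICH is
   AARCH64_CHECK_MOV.  */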
15235 /* Return true if replicating VAL64 is a valid immediate for the
15236 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15237 use it to describe valid immediates. */
15238 static bool
15239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15240 simd_immediate_info *info,
15241 enum simd_immediate_check which)
15243 unsigned int val32 = val64 & 0xffffffff;
15244 unsigned int val16 = val64 & 0xffff;
15245 unsigned int val8 = val64 & 0xff;
15247 if (val32 == (val64 >> 32))
15249 if ((which & AARCH64_CHECK_ORR) != 0
15250 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15251 simd_immediate_info::MOV))
15252 return true;
15254 if ((which & AARCH64_CHECK_BIC) != 0
15255 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15256 simd_immediate_info::MVN))
15257 return true;
15259 /* Try using a replicated byte. */
15260 if (which == AARCH64_CHECK_MOV
15261 && val16 == (val32 >> 16)
15262 && val8 == (val16 >> 8))
15264 if (info)
15265 *info = simd_immediate_info (QImode, val8);
15266 return true;
15270 /* Try using a bit-to-bytemask. */
15271 if (which == AARCH64_CHECK_MOV)
15273 unsigned int i;
15274 for (i = 0; i < 64; i += 8)
15276 unsigned char byte = (val64 >> i) & 0xff;
15277 if (byte != 0 && byte != 0xff)
15278 break;
15280 if (i == 64)
15282 if (info)
15283 *info = simd_immediate_info (DImode, val64);
15284 return true;
15287 return false;
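/* Worked example (illustrative): for VAL64 == 0x00ff0000ff0000ff the two
   32-bit halves differ, so the MOV/MVN cases above do not apply, but every
   byte is either 0 or 0xff, so the bit-to-bytemask case accepts it as a
   DImode immediate.  */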
15290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15291 instruction. If INFO is nonnull, use it to describe valid immediates. */
15293 static bool
15294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15295 simd_immediate_info *info)
15297 scalar_int_mode mode = DImode;
15298 unsigned int val32 = val64 & 0xffffffff;
15299 if (val32 == (val64 >> 32))
15301 mode = SImode;
15302 unsigned int val16 = val32 & 0xffff;
15303 if (val16 == (val32 >> 16))
15305 mode = HImode;
15306 unsigned int val8 = val16 & 0xff;
15307 if (val8 == (val16 >> 8))
15308 mode = QImode;
15311 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15312 if (IN_RANGE (val, -0x80, 0x7f))
15314 /* DUP with no shift. */
15315 if (info)
15316 *info = simd_immediate_info (mode, val);
15317 return true;
15319 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15321 /* DUP with LSL #8. */
15322 if (info)
15323 *info = simd_immediate_info (mode, val);
15324 return true;
15326 if (aarch64_bitmask_imm (val64, mode))
15328 /* DUPM. */
15329 if (info)
15330 *info = simd_immediate_info (mode, val);
15331 return true;
15333 return false;
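/* Worked examples (illustrative): VAL64 == 0x0001000100010001 repeats every
   16 bits, so it is treated as HImode 1 and accepted as a plain DUP;
   VAL64 == 0xff00ff00ff00ff00 truncates to -256 in HImode and is accepted
   via DUP with LSL #8; a repeating 0x0ff0 pattern fails both DUP ranges but
   is a valid bitmask immediate, so it is accepted as a DUPM.  */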
15336 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15337 it to describe valid immediates. */
15339 static bool
15340 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15342 if (x == CONST0_RTX (GET_MODE (x)))
15344 if (info)
15345 *info = simd_immediate_info (DImode, 0);
15346 return true;
15349 /* Analyze the value as a VNx16BImode. This should be relatively
15350 efficient, since rtx_vector_builder has enough built-in capacity
15351 to store all VLA predicate constants without needing the heap. */
15352 rtx_vector_builder builder;
15353 if (!aarch64_get_sve_pred_bits (builder, x))
15354 return false;
15356 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15357 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15359 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15360 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15361 if (pattern != AARCH64_NUM_SVPATTERNS)
15363 if (info)
15365 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15366 *info = simd_immediate_info (int_mode, pattern);
15368 return true;
15371 return false;
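/* Illustrative examples (assumed): an all-zero predicate constant is handled
   by the CONST0 case above (a PFALSE, represented here as a zero immediate);
   a VNx4BI constant whose first two elements are set and whose remaining
   elements are clear corresponds to a PTRUE with the VL2 pattern on a
   32-bit-element predicate, provided aarch64_svpattern_for_vl recognises
   that vector length.  */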
15374 /* Return true if OP is a valid SIMD immediate for the operation
15375 described by WHICH. If INFO is nonnull, use it to describe valid
15376 immediates. */
15377 bool
15378 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15379 enum simd_immediate_check which)
15381 machine_mode mode = GET_MODE (op);
15382 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15383 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15384 return false;
15386 if (vec_flags & VEC_SVE_PRED)
15387 return aarch64_sve_pred_valid_immediate (op, info);
15389 scalar_mode elt_mode = GET_MODE_INNER (mode);
15390 rtx base, step;
15391 unsigned int n_elts;
15392 if (GET_CODE (op) == CONST_VECTOR
15393 && CONST_VECTOR_DUPLICATE_P (op))
15394 n_elts = CONST_VECTOR_NPATTERNS (op);
15395 else if ((vec_flags & VEC_SVE_DATA)
15396 && const_vec_series_p (op, &base, &step))
15398 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15399 if (!aarch64_sve_index_immediate_p (base)
15400 || !aarch64_sve_index_immediate_p (step))
15401 return false;
15403 if (info)
15404 *info = simd_immediate_info (elt_mode, base, step);
15405 return true;
15407 else if (GET_CODE (op) == CONST_VECTOR
15408 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15409 /* N_ELTS set above. */;
15410 else
15411 return false;
15413 scalar_float_mode elt_float_mode;
15414 if (n_elts == 1
15415 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15417 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15418 if (aarch64_float_const_zero_rtx_p (elt)
15419 || aarch64_float_const_representable_p (elt))
15421 if (info)
15422 *info = simd_immediate_info (elt_float_mode, elt);
15423 return true;
15427 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15428 if (elt_size > 8)
15429 return false;
15431 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15433 /* Expand the vector constant out into a byte vector, with the least
15434 significant byte of the register first. */
15435 auto_vec<unsigned char, 16> bytes;
15436 bytes.reserve (n_elts * elt_size);
15437 for (unsigned int i = 0; i < n_elts; i++)
15439 /* The vector is provided in gcc endian-neutral fashion.
15440 For aarch64_be Advanced SIMD, it must be laid out in the vector
15441 register in reverse order. */
15442 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15443 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15445 if (elt_mode != elt_int_mode)
15446 elt = gen_lowpart (elt_int_mode, elt);
15448 if (!CONST_INT_P (elt))
15449 return false;
15451 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15452 for (unsigned int byte = 0; byte < elt_size; byte++)
15454 bytes.quick_push (elt_val & 0xff);
15455 elt_val >>= BITS_PER_UNIT;
15459 /* The immediate must repeat every eight bytes. */
15460 unsigned int nbytes = bytes.length ();
15461 for (unsigned i = 8; i < nbytes; ++i)
15462 if (bytes[i] != bytes[i - 8])
15463 return false;
15465 /* Get the repeating 8-byte value as an integer. No endian correction
15466 is needed here because bytes is already in lsb-first order. */
15467 unsigned HOST_WIDE_INT val64 = 0;
15468 for (unsigned int i = 0; i < 8; i++)
15469 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15470 << (i * BITS_PER_UNIT));
15472 if (vec_flags & VEC_SVE_DATA)
15473 return aarch64_sve_valid_immediate (val64, info);
15474 else
15475 return aarch64_advsimd_valid_immediate (val64, info, which);
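/* Illustrative example (assumed): an SVE VNx4SI constant of the form
   { 0, 2, 4, 6, ... } is a VEC_SERIES with base 0 and step 2; both values
   lie in [-16, 15], so the series case above accepts it and describes it
   as an INDEX immediate rather than falling through to the byte-replication
   checks.  */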
15478 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15479 has a step in the range of INDEX. Return the index expression if so,
15480 otherwise return null. */
15482 aarch64_check_zero_based_sve_index_immediate (rtx x)
15484 rtx base, step;
15485 if (const_vec_series_p (x, &base, &step)
15486 && base == const0_rtx
15487 && aarch64_sve_index_immediate_p (step))
15488 return step;
15489 return NULL_RTX;
15492 /* Check whether immediate shift constants are within range. */
15493 bool
15494 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15496 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15497 if (left)
15498 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15499 else
15500 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15503 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15504 operation of width WIDTH at bit position POS. */
15507 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15509 gcc_assert (CONST_INT_P (width));
15510 gcc_assert (CONST_INT_P (pos));
15512 unsigned HOST_WIDE_INT mask
15513 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15514 return GEN_INT (mask << UINTVAL (pos));
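/* Worked example (illustrative): WIDTH == 8 and POS == 16 give the mask
   ((1 << 8) - 1) << 16 == 0x00ff0000, i.e. exactly the byte being
   extracted.  */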
15517 bool
15518 aarch64_mov_operand_p (rtx x, machine_mode mode)
15520 if (GET_CODE (x) == HIGH
15521 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15522 return true;
15524 if (CONST_INT_P (x))
15525 return true;
15527 if (VECTOR_MODE_P (GET_MODE (x)))
15529 /* Require predicate constants to be VNx16BI before RA, so that we
15530 force everything to have a canonical form. */
15531 if (!lra_in_progress
15532 && !reload_completed
15533 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15534 && GET_MODE (x) != VNx16BImode)
15535 return false;
15537 return aarch64_simd_valid_immediate (x, NULL);
15540 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15541 return true;
15543 if (aarch64_sve_cnt_immediate_p (x))
15544 return true;
15546 return aarch64_classify_symbolic_expression (x)
15547 == SYMBOL_TINY_ABSOLUTE;
15550 /* Return a const_int vector of VAL. */
15552 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15554 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15555 return gen_const_vec_duplicate (mode, c);
15558 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15560 bool
15561 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15563 machine_mode vmode;
15565 vmode = aarch64_simd_container_mode (mode, 64);
15566 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15567 return aarch64_simd_valid_immediate (op_v, NULL);
15570 /* Construct and return a PARALLEL RTX vector with elements numbering the
15571 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15572 the vector - from the perspective of the architecture. This does not
15573 line up with GCC's perspective on lane numbers, so we end up with
15574 different masks depending on our target endian-ness. The diagram
15575 below may help. We must draw the distinction when building masks
15576 which select one half of the vector. An instruction selecting
15577 architectural low-lanes for a big-endian target, must be described using
15578 a mask selecting GCC high-lanes.
15580 Big-Endian Little-Endian
15582 GCC 0 1 2 3 3 2 1 0
15583 | x | x | x | x | | x | x | x | x |
15584 Architecture 3 2 1 0 3 2 1 0
15586 Low Mask: { 2, 3 } { 0, 1 }
15587 High Mask: { 0, 1 } { 2, 3 }
15589 MODE is the mode of the vector and NUNITS is the number of units in it. */
15592 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15594 rtvec v = rtvec_alloc (nunits / 2);
15595 int high_base = nunits / 2;
15596 int low_base = 0;
15597 int base;
15598 rtx t1;
15599 int i;
15601 if (BYTES_BIG_ENDIAN)
15602 base = high ? low_base : high_base;
15603 else
15604 base = high ? high_base : low_base;
15606 for (i = 0; i < nunits / 2; i++)
15607 RTVEC_ELT (v, i) = GEN_INT (base + i);
15609 t1 = gen_rtx_PARALLEL (mode, v);
15610 return t1;
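/* Worked example (illustrative): for V4SImode with HIGH true this returns
   (parallel [2 3]) on little-endian but (parallel [0 1]) on big-endian,
   matching the diagram above.  */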
15613 /* Check OP for validity as a PARALLEL RTX vector with elements
15614 numbering either the high (HIGH == TRUE) or low (HIGH == FALSE) lanes,
15615 from the perspective of the architecture. See the diagram above
15616 aarch64_simd_vect_par_cnst_half for more details. */
15618 bool
15619 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15620 bool high)
15622 int nelts;
15623 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15624 return false;
15626 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15627 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15628 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15629 int i = 0;
15631 if (count_op != count_ideal)
15632 return false;
15634 for (i = 0; i < count_ideal; i++)
15636 rtx elt_op = XVECEXP (op, 0, i);
15637 rtx elt_ideal = XVECEXP (ideal, 0, i);
15639 if (!CONST_INT_P (elt_op)
15640 || INTVAL (elt_ideal) != INTVAL (elt_op))
15641 return false;
15643 return true;
15646 /* Return a PARALLEL containing NELTS elements, with element I equal
15647 to BASE + I * STEP. */
15650 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15652 rtvec vec = rtvec_alloc (nelts);
15653 for (unsigned int i = 0; i < nelts; ++i)
15654 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15655 return gen_rtx_PARALLEL (VOIDmode, vec);
15658 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15659 series with step STEP. */
15661 bool
15662 aarch64_stepped_int_parallel_p (rtx op, int step)
15664 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15665 return false;
15667 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15668 for (int i = 1; i < XVECLEN (op, 0); ++i)
15669 if (!CONST_INT_P (XVECEXP (op, 0, i))
15670 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15671 return false;
15673 return true;
15676 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15677 HIGH (exclusive). */
15678 void
15679 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15680 const_tree exp)
15682 HOST_WIDE_INT lane;
15683 gcc_assert (CONST_INT_P (operand));
15684 lane = INTVAL (operand);
15686 if (lane < low || lane >= high)
15688 if (exp)
15689 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15690 else
15691 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15695 /* Perform endian correction on lane number N, which indexes a vector
15696 of mode MODE, and return the result as an SImode rtx. */
15699 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15701 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15704 /* Return TRUE if OP is a valid vector addressing mode. */
15706 bool
15707 aarch64_simd_mem_operand_p (rtx op)
15709 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15710 || REG_P (XEXP (op, 0)));
15713 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15715 bool
15716 aarch64_sve_ld1r_operand_p (rtx op)
15718 struct aarch64_address_info addr;
15719 scalar_mode mode;
15721 return (MEM_P (op)
15722 && is_a <scalar_mode> (GET_MODE (op), &mode)
15723 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15724 && addr.type == ADDRESS_REG_IMM
15725 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
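/* Illustrative note: the offset accepted here is a 6-bit unsigned immediate
   scaled by the element size, so for 32-bit elements the valid offsets are
   0, 4, 8, ..., 252.  */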
15728 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15729 bool
15730 aarch64_sve_ld1rq_operand_p (rtx op)
15732 struct aarch64_address_info addr;
15733 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15734 if (!MEM_P (op)
15735 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15736 return false;
15738 if (addr.type == ADDRESS_REG_IMM)
15739 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15741 if (addr.type == ADDRESS_REG_REG)
15742 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15744 return false;
15747 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15748 The conditions for STR are the same. */
15749 bool
15750 aarch64_sve_ldr_operand_p (rtx op)
15752 struct aarch64_address_info addr;
15754 return (MEM_P (op)
15755 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15756 false, ADDR_QUERY_ANY)
15757 && addr.type == ADDRESS_REG_IMM);
15760 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15761 We need to be able to access the individual pieces, so the range
15762 is different from LD[234] and ST[234]. */
15763 bool
15764 aarch64_sve_struct_memory_operand_p (rtx op)
15766 if (!MEM_P (op))
15767 return false;
15769 machine_mode mode = GET_MODE (op);
15770 struct aarch64_address_info addr;
15771 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15772 ADDR_QUERY_ANY)
15773 || addr.type != ADDRESS_REG_IMM)
15774 return false;
15776 poly_int64 first = addr.const_offset;
15777 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15778 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15779 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15782 /* Emit a register copy from operand to operand, taking care not to
15783 early-clobber source registers in the process.
15785 COUNT is the number of components into which the copy needs to be
15786 decomposed. */
15787 void
15788 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15789 unsigned int count)
15791 unsigned int i;
15792 int rdest = REGNO (operands[0]);
15793 int rsrc = REGNO (operands[1]);
15795 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15796 || rdest < rsrc)
15797 for (i = 0; i < count; i++)
15798 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15799 gen_rtx_REG (mode, rsrc + i));
15800 else
15801 for (i = 0; i < count; i++)
15802 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15803 gen_rtx_REG (mode, rsrc + count - i - 1));
15806 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15807 one of VSTRUCT modes: OI, CI, or XI. */
15809 aarch64_simd_attr_length_rglist (machine_mode mode)
15811 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15812 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15815 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15816 alignment of a vector to 128 bits. SVE predicates have an alignment of
15817 16 bits. */
15818 static HOST_WIDE_INT
15819 aarch64_simd_vector_alignment (const_tree type)
15821 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15822 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15823 be set for non-predicate vectors of booleans. Modes are the most
15824 direct way we have of identifying real SVE predicate types. */
15825 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15826 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15829 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15830 static poly_uint64
15831 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15833 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15835 /* If the length of the vector is fixed, try to align to that length,
15836 otherwise don't try to align at all. */
15837 HOST_WIDE_INT result;
15838 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15839 result = TYPE_ALIGN (TREE_TYPE (type));
15840 return result;
15842 return TYPE_ALIGN (type);
15845 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15846 static bool
15847 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15849 if (is_packed)
15850 return false;
15852 /* For fixed-length vectors, check that the vectorizer will aim for
15853 full-vector alignment. This isn't true for generic GCC vectors
15854 that are wider than the ABI maximum of 128 bits. */
15855 poly_uint64 preferred_alignment =
15856 aarch64_vectorize_preferred_vector_alignment (type);
15857 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15858 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15859 preferred_alignment))
15860 return false;
15862 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15863 return true;
15866 /* Return true if the vector misalignment factor is supported by the
15867 target. */
15868 static bool
15869 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15870 const_tree type, int misalignment,
15871 bool is_packed)
15873 if (TARGET_SIMD && STRICT_ALIGNMENT)
15875 /* Return if movmisalign pattern is not supported for this mode. */
15876 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15877 return false;
15879 /* Misalignment factor is unknown at compile time. */
15880 if (misalignment == -1)
15881 return false;
15883 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15884 is_packed);
15887 /* If VALS is a vector constant that can be loaded into a register
15888 using DUP, generate instructions to do so and return an RTX to
15889 assign to the register. Otherwise return NULL_RTX. */
15890 static rtx
15891 aarch64_simd_dup_constant (rtx vals)
15893 machine_mode mode = GET_MODE (vals);
15894 machine_mode inner_mode = GET_MODE_INNER (mode);
15895 rtx x;
15897 if (!const_vec_duplicate_p (vals, &x))
15898 return NULL_RTX;
15900 /* We can load this constant by using DUP and a constant in a
15901 single ARM register. This will be cheaper than a vector
15902 load. */
15903 x = copy_to_mode_reg (inner_mode, x);
15904 return gen_vec_duplicate (mode, x);
15908 /* Generate code to load VALS, which is a PARALLEL containing only
15909 constants (for vec_init) or CONST_VECTOR, efficiently into a
15910 register. Returns an RTX to copy into the register, or NULL_RTX
15911 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15912 static rtx
15913 aarch64_simd_make_constant (rtx vals)
15915 machine_mode mode = GET_MODE (vals);
15916 rtx const_dup;
15917 rtx const_vec = NULL_RTX;
15918 int n_const = 0;
15919 int i;
15921 if (GET_CODE (vals) == CONST_VECTOR)
15922 const_vec = vals;
15923 else if (GET_CODE (vals) == PARALLEL)
15925 /* A CONST_VECTOR must contain only CONST_INTs and
15926 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15927 Only store valid constants in a CONST_VECTOR. */
15928 int n_elts = XVECLEN (vals, 0);
15929 for (i = 0; i < n_elts; ++i)
15931 rtx x = XVECEXP (vals, 0, i);
15932 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15933 n_const++;
15935 if (n_const == n_elts)
15936 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15938 else
15939 gcc_unreachable ();
15941 if (const_vec != NULL_RTX
15942 && aarch64_simd_valid_immediate (const_vec, NULL))
15943 /* Load using MOVI/MVNI. */
15944 return const_vec;
15945 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15946 /* Loaded using DUP. */
15947 return const_dup;
15948 else if (const_vec != NULL_RTX)
15949 /* Load from constant pool. We cannot take advantage of single-cycle
15950 LD1 because we need a PC-relative addressing mode. */
15951 return const_vec;
15952 else
15953 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15954 We cannot construct an initializer. */
15955 return NULL_RTX;
15958 /* Expand a vector initialisation sequence, such that TARGET is
15959 initialised to contain VALS. */
15961 void
15962 aarch64_expand_vector_init (rtx target, rtx vals)
15964 machine_mode mode = GET_MODE (target);
15965 scalar_mode inner_mode = GET_MODE_INNER (mode);
15966 /* The number of vector elements. */
15967 int n_elts = XVECLEN (vals, 0);
15968 /* The number of vector elements which are not constant. */
15969 int n_var = 0;
15970 rtx any_const = NULL_RTX;
15971 /* The first element of vals. */
15972 rtx v0 = XVECEXP (vals, 0, 0);
15973 bool all_same = true;
15975 /* This is a special vec_init<M><N> where N is not an element mode but a
15976 vector mode with half the elements of M. We expect to find two entries
15977 of mode N in VALS and we must put their concatenation into TARGET. */
15978 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15980 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15981 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15982 rtx lo = XVECEXP (vals, 0, 0);
15983 rtx hi = XVECEXP (vals, 0, 1);
15984 machine_mode narrow_mode = GET_MODE (lo);
15985 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15986 gcc_assert (narrow_mode == GET_MODE (hi));
15988 /* When we want to concatenate a half-width vector with zeroes we can
15989 use the aarch64_combinez[_be] patterns. Just make sure that the
15990 zeroes are in the right half. */
15991 if (BYTES_BIG_ENDIAN
15992 && aarch64_simd_imm_zero (lo, narrow_mode)
15993 && general_operand (hi, narrow_mode))
15994 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15995 else if (!BYTES_BIG_ENDIAN
15996 && aarch64_simd_imm_zero (hi, narrow_mode)
15997 && general_operand (lo, narrow_mode))
15998 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15999 else
16001 /* Else create the two half-width registers and combine them. */
16002 if (!REG_P (lo))
16003 lo = force_reg (GET_MODE (lo), lo);
16004 if (!REG_P (hi))
16005 hi = force_reg (GET_MODE (hi), hi);
16007 if (BYTES_BIG_ENDIAN)
16008 std::swap (lo, hi);
16009 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16011 return;
16014 /* Count the number of variable elements to initialise. */
16015 for (int i = 0; i < n_elts; ++i)
16017 rtx x = XVECEXP (vals, 0, i);
16018 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16019 ++n_var;
16020 else
16021 any_const = x;
16023 all_same &= rtx_equal_p (x, v0);
16026 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16027 how best to handle this. */
16028 if (n_var == 0)
16030 rtx constant = aarch64_simd_make_constant (vals);
16031 if (constant != NULL_RTX)
16033 emit_move_insn (target, constant);
16034 return;
16038 /* Splat a single non-constant element if we can. */
16039 if (all_same)
16041 rtx x = copy_to_mode_reg (inner_mode, v0);
16042 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16043 return;
16046 enum insn_code icode = optab_handler (vec_set_optab, mode);
16047 gcc_assert (icode != CODE_FOR_nothing);
16049 /* If there are only variable elements, try to optimize
16050 the insertion using dup for the most common element
16051 followed by insertions. */
16053 /* The algorithm will fill matches[*][0] with the earliest matching element,
16054 and matches[X][1] with the count of duplicate elements (if X is the
16055 earliest element which has duplicates). */
16057 if (n_var == n_elts && n_elts <= 16)
16059 int matches[16][2] = {0};
16060 for (int i = 0; i < n_elts; i++)
16062 for (int j = 0; j <= i; j++)
16064 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16066 matches[i][0] = j;
16067 matches[j][1]++;
16068 break;
16072 int maxelement = 0;
16073 int maxv = 0;
16074 for (int i = 0; i < n_elts; i++)
16075 if (matches[i][1] > maxv)
16077 maxelement = i;
16078 maxv = matches[i][1];
16081 /* Create a duplicate of the most common element, unless all elements
16082 are equally useless to us, in which case just immediately set the
16083 vector register using the first element. */
16085 if (maxv == 1)
16087 /* For vectors of two 64-bit elements, we can do even better. */
16088 if (n_elts == 2
16089 && (inner_mode == E_DImode
16090 || inner_mode == E_DFmode))
16093 rtx x0 = XVECEXP (vals, 0, 0);
16094 rtx x1 = XVECEXP (vals, 0, 1);
16095 /* Combine can pick up this case, but handling it directly
16096 here leaves clearer RTL.
16098 This is load_pair_lanes<mode>, and also gives us a clean-up
16099 for store_pair_lanes<mode>. */
16100 if (memory_operand (x0, inner_mode)
16101 && memory_operand (x1, inner_mode)
16102 && !STRICT_ALIGNMENT
16103 && rtx_equal_p (XEXP (x1, 0),
16104 plus_constant (Pmode,
16105 XEXP (x0, 0),
16106 GET_MODE_SIZE (inner_mode))))
16108 rtx t;
16109 if (inner_mode == DFmode)
16110 t = gen_load_pair_lanesdf (target, x0, x1);
16111 else
16112 t = gen_load_pair_lanesdi (target, x0, x1);
16113 emit_insn (t);
16114 return;
16117 /* The subreg-move sequence below will move into lane zero of the
16118 vector register. For big-endian we want that position to hold
16119 the last element of VALS. */
16120 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16121 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16122 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16124 else
16126 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16127 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16130 /* Insert the rest. */
16131 for (int i = 0; i < n_elts; i++)
16133 rtx x = XVECEXP (vals, 0, i);
16134 if (matches[i][0] == maxelement)
16135 continue;
16136 x = copy_to_mode_reg (inner_mode, x);
16137 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16139 return;
16142 /* Initialise a vector which is part-variable. We want to first try
16143 to build those lanes which are constant in the most efficient way we
16144 can. */
16145 if (n_var != n_elts)
16147 rtx copy = copy_rtx (vals);
16149 /* Load constant part of vector. We really don't care what goes into the
16150 parts we will overwrite, but we're more likely to be able to load the
16151 constant efficiently if it has fewer, larger, repeating parts
16152 (see aarch64_simd_valid_immediate). */
16153 for (int i = 0; i < n_elts; i++)
16155 rtx x = XVECEXP (vals, 0, i);
16156 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16157 continue;
16158 rtx subst = any_const;
16159 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16161 /* Look in the copied vector, as more elements are const. */
16162 rtx test = XVECEXP (copy, 0, i ^ bit);
16163 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16165 subst = test;
16166 break;
16169 XVECEXP (copy, 0, i) = subst;
16171 aarch64_expand_vector_init (target, copy);
16174 /* Insert the variable lanes directly. */
16175 for (int i = 0; i < n_elts; i++)
16177 rtx x = XVECEXP (vals, 0, i);
16178 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16179 continue;
16180 x = copy_to_mode_reg (inner_mode, x);
16181 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
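/* Illustrative examples (assumed): initialising V4SI from { a, b, a, a },
   where every element is in a register, duplicates the most common element
   with DUP and then inserts b into lane 1; initialising from { x, 1, 2, 3 }
   first loads the constant vector { 2, 1, 2, 3 } (the variable lane is
   filled from a nearby constant lane so the constant has fewer distinct
   parts) and then inserts x into lane 0.  */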
16185 /* Emit RTL corresponding to:
16186 insr TARGET, ELEM. */
16188 static void
16189 emit_insr (rtx target, rtx elem)
16191 machine_mode mode = GET_MODE (target);
16192 scalar_mode elem_mode = GET_MODE_INNER (mode);
16193 elem = force_reg (elem_mode, elem);
16195 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16196 gcc_assert (icode != CODE_FOR_nothing);
16197 emit_insn (GEN_FCN (icode) (target, target, elem));
16200 /* Subroutine of aarch64_sve_expand_vector_init for handling
16201 trailing constants.
16202 This function works as follows:
16203 (a) Create a new vector consisting of trailing constants.
16204 (b) Initialize TARGET with the constant vector using emit_move_insn.
16205 (c) Insert remaining elements in TARGET using insr.
16206 NELTS is the total number of elements in the original vector,
16207 while NELTS_REQD is the number of elements that are actually
16208 significant.
16210 ??? The heuristic used is to do the above only if the number of constants
16211 is at least half the total number of elements. May need fine tuning. */
16213 static bool
16214 aarch64_sve_expand_vector_init_handle_trailing_constants
16215 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16217 machine_mode mode = GET_MODE (target);
16218 scalar_mode elem_mode = GET_MODE_INNER (mode);
16219 int n_trailing_constants = 0;
16221 for (int i = nelts_reqd - 1;
16222 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16223 i--)
16224 n_trailing_constants++;
16226 if (n_trailing_constants >= nelts_reqd / 2)
16228 rtx_vector_builder v (mode, 1, nelts);
16229 for (int i = 0; i < nelts; i++)
16230 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16231 rtx const_vec = v.build ();
16232 emit_move_insn (target, const_vec);
16234 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16235 emit_insr (target, builder.elt (i));
16237 return true;
16240 return false;
16243 /* Subroutine of aarch64_sve_expand_vector_init.
16244 Works as follows:
16245 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16246 (b) Skip trailing elements from BUILDER, which are the same as
16247 element NELTS_REQD - 1.
16248 (c) Insert earlier elements in reverse order in TARGET using insr. */
16250 static void
16251 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16252 const rtx_vector_builder &builder,
16253 int nelts_reqd)
16255 machine_mode mode = GET_MODE (target);
16256 scalar_mode elem_mode = GET_MODE_INNER (mode);
16258 struct expand_operand ops[2];
16259 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16260 gcc_assert (icode != CODE_FOR_nothing);
16262 create_output_operand (&ops[0], target, mode);
16263 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16264 expand_insn (icode, 2, ops);
16266 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16267 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16268 emit_insr (target, builder.elt (i));
16271 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16272 when all trailing elements of builder are same.
16273 This works as follows:
16274 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16275 (b) Insert remaining elements in TARGET using insr.
16277 ??? The heuristic used is to do the above if the number of identical
16278 trailing elements is at least 3/4 of the total number of elements,
16279 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
16281 static bool
16282 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16283 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16285 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16286 if (ndups >= (3 * nelts_reqd) / 4)
16288 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16289 nelts_reqd - ndups + 1);
16290 return true;
16293 return false;
16296 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16297 of elements in BUILDER.
16299 The function tries to initialize TARGET from BUILDER if it fits one
16300 of the special cases outlined below.
16302 Failing that, the function divides BUILDER into two sub-vectors:
16303 v_even = even elements of BUILDER;
16304 v_odd = odd elements of BUILDER;
16306 and recursively calls itself with v_even and v_odd.
16308 if (recursive call succeeded for v_even or v_odd)
16309 TARGET = zip (v_even, v_odd)
16311 The function returns true if it managed to build TARGET from BUILDER
16312 with one of the special cases, false otherwise.
16314 Example: {a, 1, b, 2, c, 3, d, 4}
16316 The vector gets divided into:
16317 v_even = {a, b, c, d}
16318 v_odd = {1, 2, 3, 4}
16320 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16321 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16323 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16324 4 elements, so we construct tmp1 from v_even using insr:
16325 tmp1 = dup(d)
16326 insr tmp1, c
16327 insr tmp1, b
16328 insr tmp1, a
16330 And finally:
16331 TARGET = zip (tmp1, tmp2)
16332 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16334 static bool
16335 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16336 int nelts, int nelts_reqd)
16338 machine_mode mode = GET_MODE (target);
16340 /* Case 1: Vector contains trailing constants. */
16342 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16343 (target, builder, nelts, nelts_reqd))
16344 return true;
16346 /* Case 2: Vector contains leading constants. */
16348 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16349 for (int i = 0; i < nelts_reqd; i++)
16350 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16351 rev_builder.finalize ();
16353 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16354 (target, rev_builder, nelts, nelts_reqd))
16356 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16357 return true;
16360 /* Case 3: Vector contains trailing same element. */
16362 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16363 (target, builder, nelts_reqd))
16364 return true;
16366 /* Case 4: Vector contains leading same element. */
16368 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16369 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16371 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16372 return true;
16375 /* Avoid recursing below 4 elements.
16376 ??? The threshold 4 may need fine-tuning. */
16378 if (nelts_reqd <= 4)
16379 return false;
16381 rtx_vector_builder v_even (mode, 1, nelts);
16382 rtx_vector_builder v_odd (mode, 1, nelts);
16384 for (int i = 0; i < nelts * 2; i += 2)
16386 v_even.quick_push (builder.elt (i));
16387 v_odd.quick_push (builder.elt (i + 1));
16390 v_even.finalize ();
16391 v_odd.finalize ();
16393 rtx tmp1 = gen_reg_rtx (mode);
16394 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16395 nelts, nelts_reqd / 2);
16397 rtx tmp2 = gen_reg_rtx (mode);
16398 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16399 nelts, nelts_reqd / 2);
16401 if (!did_even_p && !did_odd_p)
16402 return false;
16404 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16405 special cases and zip v_even, v_odd. */
16407 if (!did_even_p)
16408 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16410 if (!did_odd_p)
16411 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16413 rtvec v = gen_rtvec (2, tmp1, tmp2);
16414 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16415 return true;
16418 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16420 void
16421 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16423 machine_mode mode = GET_MODE (target);
16424 int nelts = XVECLEN (vals, 0);
16426 rtx_vector_builder v (mode, 1, nelts);
16427 for (int i = 0; i < nelts; i++)
16428 v.quick_push (XVECEXP (vals, 0, i));
16429 v.finalize ();
16431 /* If neither sub-vector of v could be initialized specially,
16432 then use INSR to insert all elements from v into TARGET.
16433 ??? This might not be optimal for vectors with large
16434 initializers like 16-element or above.
16435 For nelts < 4, it probably isn't useful to handle specially. */
16437 if (nelts < 4
16438 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16439 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16442 static unsigned HOST_WIDE_INT
16443 aarch64_shift_truncation_mask (machine_mode mode)
16445 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16446 return 0;
16447 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16450 /* Select a format to encode pointers in exception handling data. */
16452 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16454 int type;
16455 switch (aarch64_cmodel)
16457 case AARCH64_CMODEL_TINY:
16458 case AARCH64_CMODEL_TINY_PIC:
16459 case AARCH64_CMODEL_SMALL:
16460 case AARCH64_CMODEL_SMALL_PIC:
16461 case AARCH64_CMODEL_SMALL_SPIC:
16462 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16463 for everything. */
16464 type = DW_EH_PE_sdata4;
16465 break;
16466 default:
16467 /* No assumptions here. 8-byte relocs required. */
16468 type = DW_EH_PE_sdata8;
16469 break;
16471 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16474 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16476 static void
16477 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16479 if (aarch64_simd_decl_p (decl))
16481 fprintf (stream, "\t.variant_pcs\t");
16482 assemble_name (stream, name);
16483 fprintf (stream, "\n");
16487 /* The last .arch and .tune assembly strings that we printed. */
16488 static std::string aarch64_last_printed_arch_string;
16489 static std::string aarch64_last_printed_tune_string;
16491 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16492 by the function fndecl. */
16494 void
16495 aarch64_declare_function_name (FILE *stream, const char* name,
16496 tree fndecl)
16498 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16500 struct cl_target_option *targ_options;
16501 if (target_parts)
16502 targ_options = TREE_TARGET_OPTION (target_parts);
16503 else
16504 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16505 gcc_assert (targ_options);
16507 const struct processor *this_arch
16508 = aarch64_get_arch (targ_options->x_explicit_arch);
16510 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16511 std::string extension
16512 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16513 this_arch->flags);
16514 /* Only update the assembler .arch string if it is distinct from the last
16515 such string we printed. */
16516 std::string to_print = this_arch->name + extension;
16517 if (to_print != aarch64_last_printed_arch_string)
16519 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16520 aarch64_last_printed_arch_string = to_print;
16523 /* Print the cpu name we're tuning for in the comments, might be
16524 useful to readers of the generated asm. Do it only when it changes
16525 from function to function and verbose assembly is requested. */
16526 const struct processor *this_tune
16527 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16529 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16531 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16532 this_tune->name);
16533 aarch64_last_printed_tune_string = this_tune->name;
16536 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16538 /* Don't forget the type directive for ELF. */
16539 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16540 ASM_OUTPUT_LABEL (stream, name);
16543 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16545 void
16546 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16548 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16549 const char *value = IDENTIFIER_POINTER (target);
16550 aarch64_asm_output_variant_pcs (stream, decl, name);
16551 ASM_OUTPUT_DEF (stream, name, value);
16554 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16555 function symbol references. */
16557 void
16558 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16560 default_elf_asm_output_external (stream, decl, name);
16561 aarch64_asm_output_variant_pcs (stream, decl, name);
16564 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16565 Used to output the .cfi_b_key_frame directive when signing the current
16566 function with the B key. */
16568 void
16569 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16571 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16572 && aarch64_ra_sign_key == AARCH64_KEY_B)
16573 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16576 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16578 static void
16579 aarch64_start_file (void)
16581 struct cl_target_option *default_options
16582 = TREE_TARGET_OPTION (target_option_default_node);
16584 const struct processor *default_arch
16585 = aarch64_get_arch (default_options->x_explicit_arch);
16586 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16587 std::string extension
16588 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16589 default_arch->flags);
16591 aarch64_last_printed_arch_string = default_arch->name + extension;
16592 aarch64_last_printed_tune_string = "";
16593 asm_fprintf (asm_out_file, "\t.arch %s\n",
16594 aarch64_last_printed_arch_string.c_str ());
16596 default_file_start ();
16599 /* Emit load exclusive. */
16601 static void
16602 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16603 rtx mem, rtx model_rtx)
16605 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16608 /* Emit store exclusive. */
16610 static void
16611 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16612 rtx mem, rtx rval, rtx model_rtx)
16614 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16617 /* Mark the previous jump instruction as unlikely. */
16619 static void
16620 aarch64_emit_unlikely_jump (rtx insn)
16622 rtx_insn *jump = emit_jump_insn (insn);
16623 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16626 /* Expand a compare and swap pattern. */
16628 void
16629 aarch64_expand_compare_and_swap (rtx operands[])
16631 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16632 machine_mode mode, r_mode;
16634 bval = operands[0];
16635 rval = operands[1];
16636 mem = operands[2];
16637 oldval = operands[3];
16638 newval = operands[4];
16639 is_weak = operands[5];
16640 mod_s = operands[6];
16641 mod_f = operands[7];
16642 mode = GET_MODE (mem);
16644 /* Normally the succ memory model must be stronger than fail, but in the
16645 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16646 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16647 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16648 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16649 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16651 r_mode = mode;
16652 if (mode == QImode || mode == HImode)
16654 r_mode = SImode;
16655 rval = gen_reg_rtx (r_mode);
16658 if (TARGET_LSE)
16660 /* The CAS insn requires oldval and rval overlap, but we need to
16661 have a copy of oldval saved across the operation to tell if
16662 the operation is successful. */
16663 if (reg_overlap_mentioned_p (rval, oldval))
16664 rval = copy_to_mode_reg (r_mode, oldval);
16665 else
16666 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16668 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16669 newval, mod_s));
16670 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16672 else
16674 /* The oldval predicate varies by mode. Test it and force to reg. */
16675 insn_code code = code_for_aarch64_compare_and_swap (mode);
16676 if (!insn_data[code].operand[2].predicate (oldval, mode))
16677 oldval = force_reg (mode, oldval);
16679 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16680 is_weak, mod_s, mod_f));
16681 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16684 if (r_mode != mode)
16685 rval = gen_lowpart (mode, rval);
16686 emit_move_insn (operands[1], rval);
16688 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16689 emit_insn (gen_rtx_SET (bval, x));
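/* Illustrative note: with TARGET_LSE the expansion above emits a single
   CAS-family instruction via gen_aarch64_compare_and_swap_lse; without LSE
   the generic pattern is later split into a load-exclusive/store-exclusive
   loop by aarch64_split_compare_and_swap below.  */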
16692 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16693 sequence implementing an atomic operation. */
16695 static void
16696 aarch64_emit_post_barrier (enum memmodel model)
16698 const enum memmodel base_model = memmodel_base (model);
16700 if (is_mm_sync (model)
16701 && (base_model == MEMMODEL_ACQUIRE
16702 || base_model == MEMMODEL_ACQ_REL
16703 || base_model == MEMMODEL_SEQ_CST))
16705 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16709 /* Split a compare and swap pattern. */
16711 void
16712 aarch64_split_compare_and_swap (rtx operands[])
16714 rtx rval, mem, oldval, newval, scratch;
16715 machine_mode mode;
16716 bool is_weak;
16717 rtx_code_label *label1, *label2;
16718 rtx x, cond;
16719 enum memmodel model;
16720 rtx model_rtx;
16722 rval = operands[0];
16723 mem = operands[1];
16724 oldval = operands[2];
16725 newval = operands[3];
16726 is_weak = (operands[4] != const0_rtx);
16727 model_rtx = operands[5];
16728 scratch = operands[7];
16729 mode = GET_MODE (mem);
16730 model = memmodel_from_int (INTVAL (model_rtx));
16732 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16733 loop:
16734 .label1:
16735 LD[A]XR rval, [mem]
16736 CBNZ rval, .label2
16737 ST[L]XR scratch, newval, [mem]
16738 CBNZ scratch, .label1
16739 .label2:
16740 CMP rval, 0. */
16741 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16743 label1 = NULL;
16744 if (!is_weak)
16746 label1 = gen_label_rtx ();
16747 emit_label (label1);
16749 label2 = gen_label_rtx ();
16751 /* The initial load can be relaxed for a __sync operation since a final
16752 barrier will be emitted to stop code hoisting. */
16753 if (is_mm_sync (model))
16754 aarch64_emit_load_exclusive (mode, rval, mem,
16755 GEN_INT (MEMMODEL_RELAXED));
16756 else
16757 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16759 if (strong_zero_p)
16761 if (aarch64_track_speculation)
16763 /* Emit an explicit compare instruction, so that we can correctly
16764 track the condition codes. */
16765 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16766 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16768 else
16769 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16771 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16772 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16773 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16775 else
16777 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16778 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16779 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16780 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16781 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16784 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16786 if (!is_weak)
16788 if (aarch64_track_speculation)
16790 /* Emit an explicit compare instruction, so that we can correctly
16791 track the condition codes. */
16792 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16793 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16795 else
16796 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16798 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16799 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16800 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16802 else
16804 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16805 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16806 emit_insn (gen_rtx_SET (cond, x));
16809 emit_label (label2);
16810 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16811 to set the condition flags. If this is not used it will be removed by
16812 later passes. */
16813 if (strong_zero_p)
16815 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16816 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16817 emit_insn (gen_rtx_SET (cond, x));
16819 /* Emit any final barrier needed for a __sync operation. */
16820 if (is_mm_sync (model))
16821 aarch64_emit_post_barrier (model);
16824 /* Split an atomic operation. */
16826 void
16827 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16828 rtx value, rtx model_rtx, rtx cond)
16830 machine_mode mode = GET_MODE (mem);
16831 machine_mode wmode = (mode == DImode ? DImode : SImode);
16832 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16833 const bool is_sync = is_mm_sync (model);
16834 rtx_code_label *label;
16835 rtx x;
16837 /* Split the atomic operation into a sequence. */
16838 label = gen_label_rtx ();
16839 emit_label (label);
16841 if (new_out)
16842 new_out = gen_lowpart (wmode, new_out);
16843 if (old_out)
16844 old_out = gen_lowpart (wmode, old_out);
16845 else
16846 old_out = new_out;
16847 value = simplify_gen_subreg (wmode, value, mode, 0);
16849 /* The initial load can be relaxed for a __sync operation since a final
16850 barrier will be emitted to stop code hoisting. */
16851 if (is_sync)
16852 aarch64_emit_load_exclusive (mode, old_out, mem,
16853 GEN_INT (MEMMODEL_RELAXED));
16854 else
16855 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16857 switch (code)
16859 case SET:
16860 new_out = value;
16861 break;
16863 case NOT:
16864 x = gen_rtx_AND (wmode, old_out, value);
16865 emit_insn (gen_rtx_SET (new_out, x));
16866 x = gen_rtx_NOT (wmode, new_out);
16867 emit_insn (gen_rtx_SET (new_out, x));
16868 break;
16870 case MINUS:
16871 if (CONST_INT_P (value))
16873 value = GEN_INT (-INTVAL (value));
16874 code = PLUS;
16876 /* Fall through. */
16878 default:
16879 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16880 emit_insn (gen_rtx_SET (new_out, x));
16881 break;
16884 aarch64_emit_store_exclusive (mode, cond, mem,
16885 gen_lowpart (mode, new_out), model_rtx);
16887 if (aarch64_track_speculation)
16889 /* Emit an explicit compare instruction, so that we can correctly
16890 track the condition codes. */
16891 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16892 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16894 else
16895 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16897 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16898 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16899 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16901 /* Emit any final barrier needed for a __sync operation. */
16902 if (is_sync)
16903 aarch64_emit_post_barrier (model);
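/* As an illustration of aarch64_split_atomic_op, an atomic add expands to
   roughly the following loop (a sketch only; the exact instructions depend
   on CODE, the mode and MODEL):

       .label:
	 LD[A]XR	old, [mem]
	 ADD		new, old, value
	 ST[L]XR	cond, new, [mem]
	 CBNZ		cond, .label  */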
16906 static void
16907 aarch64_init_libfuncs (void)
16909 /* Half-precision float operations. The compiler handles all operations
16910 with NULL libfuncs by converting to SFmode. */
16912 /* Conversions. */
16913 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16914 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16916 /* Arithmetic. */
16917 set_optab_libfunc (add_optab, HFmode, NULL);
16918 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16919 set_optab_libfunc (smul_optab, HFmode, NULL);
16920 set_optab_libfunc (neg_optab, HFmode, NULL);
16921 set_optab_libfunc (sub_optab, HFmode, NULL);
16923 /* Comparisons. */
16924 set_optab_libfunc (eq_optab, HFmode, NULL);
16925 set_optab_libfunc (ne_optab, HFmode, NULL);
16926 set_optab_libfunc (lt_optab, HFmode, NULL);
16927 set_optab_libfunc (le_optab, HFmode, NULL);
16928 set_optab_libfunc (ge_optab, HFmode, NULL);
16929 set_optab_libfunc (gt_optab, HFmode, NULL);
16930 set_optab_libfunc (unord_optab, HFmode, NULL);
16933 /* Target hook for c_mode_for_suffix. */
16934 static machine_mode
16935 aarch64_c_mode_for_suffix (char suffix)
16937 if (suffix == 'q')
16938 return TFmode;
16940 return VOIDmode;
16943 /* We can only represent floating point constants which will fit in
16944 "quarter-precision" values. These values are characterised by
16945 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
16948 (-1)^s * (n/16) * 2^r
16950 Where:
16951 's' is the sign bit.
16952 'n' is an integer in the range 16 <= n <= 31.
16953 'r' is an integer in the range -3 <= r <= 4. */
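/* For example, 1.0 is (-1)^0 * (16/16) * 2^0 (n = 16, r = 0), -1.5 is
   (-1)^1 * (24/16) * 2^0, the smallest representable positive value is
   (16/16) * 2^-3 = 0.125 and the largest is (31/16) * 2^4 = 31.0.  */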
16955 /* Return true iff X can be represented by a quarter-precision
16956 floating point immediate operand.  Note, we cannot represent 0.0. */
16957 bool
16958 aarch64_float_const_representable_p (rtx x)
16960 /* This represents our current view of how many bits
16961 make up the mantissa. */
16962 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16963 int exponent;
16964 unsigned HOST_WIDE_INT mantissa, mask;
16965 REAL_VALUE_TYPE r, m;
16966 bool fail;
16968 if (!CONST_DOUBLE_P (x))
16969 return false;
16971 if (GET_MODE (x) == VOIDmode
16972 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16973 return false;
16975 r = *CONST_DOUBLE_REAL_VALUE (x);
16977 /* We cannot represent infinities, NaNs or +/-zero. We won't
16978 know if we have +zero until we analyse the mantissa, but we
16979 can reject the other invalid values. */
16980 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16981 || REAL_VALUE_MINUS_ZERO (r))
16982 return false;
16984 /* Extract exponent. */
16985 r = real_value_abs (&r);
16986 exponent = REAL_EXP (&r);
16988 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16989 highest (sign) bit, with a fixed binary point at bit point_pos.
16990 The low HOST_WIDE_INT of W below holds the low part of the mantissa, the high one the high part.
16991 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16992 bits for the mantissa, this can fail (low bits will be lost). */
16993 real_ldexp (&m, &r, point_pos - exponent);
16994 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16996 /* If the low part of the mantissa has bits set we cannot represent
16997 the value. */
16998 if (w.ulow () != 0)
16999 return false;
17000 /* We have rejected the lower HOST_WIDE_INT, so update our
17001 understanding of how many bits lie in the mantissa and
17002 look only at the high HOST_WIDE_INT. */
17003 mantissa = w.elt (1);
17004 point_pos -= HOST_BITS_PER_WIDE_INT;
17006 /* We can only represent values with a mantissa of the form 1.xxxx. */
17007 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17008 if ((mantissa & mask) != 0)
17009 return false;
17011 /* Having filtered unrepresentable values, we may now remove all
17012 but the highest 5 bits. */
17013 mantissa >>= point_pos - 5;
17015 /* We cannot represent the value 0.0, so reject it. This is handled
17016 elsewhere. */
17017 if (mantissa == 0)
17018 return false;
17020 /* Then, as bit 4 is always set, we can mask it off, leaving
17021 the mantissa in the range [0, 15]. */
17022 mantissa &= ~(1 << 4);
17023 gcc_assert (mantissa <= 15);
17025 /* GCC internally does not use IEEE754-like encoding (where normalized
17026 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17027 Our mantissa values are shifted 4 places to the left relative to
17028 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17029 by 5 places to correct for GCC's representation. */
17030 exponent = 5 - exponent;
17032 return (exponent >= 0 && exponent <= 7);
17035 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17036 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17037 output MOVI/MVNI, ORR or BIC immediate. */
17038 char*
17039 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17040 enum simd_immediate_check which)
17042 bool is_valid;
17043 static char templ[40];
17044 const char *mnemonic;
17045 const char *shift_op;
17046 unsigned int lane_count = 0;
17047 char element_char;
17049 struct simd_immediate_info info;
17051 /* This will return true to show const_vector is legal for use as either
17052 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17053 It will also update INFO to show how the immediate should be generated.
17054 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17055 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17056 gcc_assert (is_valid);
17058 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17059 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17061 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17063 gcc_assert (info.insn == simd_immediate_info::MOV
17064 && info.u.mov.shift == 0);
17065 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17066 move immediate path. */
17067 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17068 info.u.mov.value = GEN_INT (0);
17069 else
17071 const unsigned int buf_size = 20;
17072 char float_buf[buf_size] = {'\0'};
17073 real_to_decimal_for_mode (float_buf,
17074 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17075 buf_size, buf_size, 1, info.elt_mode);
17077 if (lane_count == 1)
17078 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17079 else
17080 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17081 lane_count, element_char, float_buf);
17082 return templ;
17086 gcc_assert (CONST_INT_P (info.u.mov.value));
17088 if (which == AARCH64_CHECK_MOV)
17090 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17091 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17092 ? "msl" : "lsl");
17093 if (lane_count == 1)
17094 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17095 mnemonic, UINTVAL (info.u.mov.value));
17096 else if (info.u.mov.shift)
17097 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17098 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17099 element_char, UINTVAL (info.u.mov.value), shift_op,
17100 info.u.mov.shift);
17101 else
17102 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17103 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17104 element_char, UINTVAL (info.u.mov.value));
17106 else
17108 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17109 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17110 if (info.u.mov.shift)
17111 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17112 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17113 element_char, UINTVAL (info.u.mov.value), "lsl",
17114 info.u.mov.shift);
17115 else
17116 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17117 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17118 element_char, UINTVAL (info.u.mov.value));
17120 return templ;
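/* Illustrative outputs of the above (the exact strings depend on the
   immediate and on WHICH): "movi v0.4s, 0x2a", "mvni v0.8h, 0x1, lsl 8",
   "orr v0.4s, #255, lsl #8" and "bic v0.2s, #240".  */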
17123 char*
17124 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17127 /* If a floating point number was passed and we desire to use it in an
17128 integer mode do the conversion to integer. */
17129 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17131 unsigned HOST_WIDE_INT ival;
17132 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17133 gcc_unreachable ();
17134 immediate = gen_int_mode (ival, mode);
17137 machine_mode vmode;
17138 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
17139 use a 128-bit vector mode. */
17140 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17142 vmode = aarch64_simd_container_mode (mode, width);
17143 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17144 return aarch64_output_simd_mov_immediate (v_op, width);
17147 /* Return the output string to use for moving immediate CONST_VECTOR
17148 into an SVE register. */
17150 char *
17151 aarch64_output_sve_mov_immediate (rtx const_vector)
17153 static char templ[40];
17154 struct simd_immediate_info info;
17155 char element_char;
17157 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17158 gcc_assert (is_valid);
17160 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17162 machine_mode vec_mode = GET_MODE (const_vector);
17163 if (aarch64_sve_pred_mode_p (vec_mode))
17165 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17166 if (info.insn == simd_immediate_info::MOV)
17168 gcc_assert (info.u.mov.value == const0_rtx);
17169 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17171 else
17173 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17174 unsigned int total_bytes;
17175 if (info.u.pattern == AARCH64_SV_ALL
17176 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17177 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17178 total_bytes / GET_MODE_SIZE (info.elt_mode));
17179 else
17180 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17181 svpattern_token (info.u.pattern));
17183 return buf;
17186 if (info.insn == simd_immediate_info::INDEX)
17188 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17189 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17190 element_char, INTVAL (info.u.index.base),
17191 INTVAL (info.u.index.step));
17192 return templ;
17195 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17197 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17198 info.u.mov.value = GEN_INT (0);
17199 else
17201 const int buf_size = 20;
17202 char float_buf[buf_size] = {};
17203 real_to_decimal_for_mode (float_buf,
17204 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17205 buf_size, buf_size, 1, info.elt_mode);
17207 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17208 element_char, float_buf);
17209 return templ;
17213 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17214 element_char, INTVAL (info.u.mov.value));
17215 return templ;
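/* Illustrative outputs of the above: "pfalse p0.b" for an all-false
   predicate, "ptrue p0.s, vl4" for an all-true predicate when the vector
   length is a compile-time constant (128-bit SVE), "ptrue p0.h, all"
   otherwise, "index z0.s, #0, #1" for a linear series, "mov z0.b, #1" for
   a duplicated integer, and an FMOV whose constant text is produced by
   real_to_decimal_for_mode for a duplicated float.  */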
17218 /* Split operands into moves from op[1] + op[2] into op[0]. */
17220 void
17221 aarch64_split_combinev16qi (rtx operands[3])
17223 unsigned int dest = REGNO (operands[0]);
17224 unsigned int src1 = REGNO (operands[1]);
17225 unsigned int src2 = REGNO (operands[2]);
17226 machine_mode halfmode = GET_MODE (operands[1]);
17227 unsigned int halfregs = REG_NREGS (operands[1]);
17228 rtx destlo, desthi;
17230 gcc_assert (halfmode == V16QImode);
17232 if (src1 == dest && src2 == dest + halfregs)
17234 /* No-op move. Can't split to nothing; emit something. */
17235 emit_note (NOTE_INSN_DELETED);
17236 return;
17239 /* Preserve register attributes for variable tracking. */
17240 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17241 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17242 GET_MODE_SIZE (halfmode));
17244 /* Special case of reversed high/low parts. */
17245 if (reg_overlap_mentioned_p (operands[2], destlo)
17246 && reg_overlap_mentioned_p (operands[1], desthi))
17248 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17249 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17250 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17252 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17254 /* Try to avoid unnecessary moves if part of the result
17255 is in the right place already. */
17256 if (src1 != dest)
17257 emit_move_insn (destlo, operands[1]);
17258 if (src2 != dest + halfregs)
17259 emit_move_insn (desthi, operands[2]);
17261 else
17263 if (src2 != dest + halfregs)
17264 emit_move_insn (desthi, operands[2]);
17265 if (src1 != dest)
17266 emit_move_insn (destlo, operands[1]);
17270 /* vec_perm support. */
17272 struct expand_vec_perm_d
17274 rtx target, op0, op1;
17275 vec_perm_indices perm;
17276 machine_mode vmode;
17277 unsigned int vec_flags;
17278 bool one_vector_p;
17279 bool testing_p;
17282 /* Generate a variable permutation. */
17284 static void
17285 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17287 machine_mode vmode = GET_MODE (target);
17288 bool one_vector_p = rtx_equal_p (op0, op1);
17290 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17291 gcc_checking_assert (GET_MODE (op0) == vmode);
17292 gcc_checking_assert (GET_MODE (op1) == vmode);
17293 gcc_checking_assert (GET_MODE (sel) == vmode);
17294 gcc_checking_assert (TARGET_SIMD);
17296 if (one_vector_p)
17298 if (vmode == V8QImode)
17300 /* Expand the argument to a V16QI mode by duplicating it. */
17301 rtx pair = gen_reg_rtx (V16QImode);
17302 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17303 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17305 else
17307 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17310 else
17312 rtx pair;
17314 if (vmode == V8QImode)
17316 pair = gen_reg_rtx (V16QImode);
17317 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17318 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17320 else
17322 pair = gen_reg_rtx (OImode);
17323 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17324 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17329 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17330 NELT is the number of elements in the vector. */
17332 void
17333 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17334 unsigned int nelt)
17336 machine_mode vmode = GET_MODE (target);
17337 bool one_vector_p = rtx_equal_p (op0, op1);
17338 rtx mask;
17340 /* The TBL instruction does not use a modulo index, so we must take care
17341 of that ourselves. */
17342 mask = aarch64_simd_gen_const_vector_dup (vmode,
17343 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17344 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17346 /* For big-endian, we also need to reverse the index within the vector
17347 (but not which vector). */
17348 if (BYTES_BIG_ENDIAN)
17350 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17351 if (!one_vector_p)
17352 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17353 sel = expand_simple_binop (vmode, XOR, sel, mask,
17354 NULL, 0, OPTAB_LIB_WIDEN);
17356 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
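/* Worked example of the masking above: for V8QI with a single input vector
   an index of 11 is ANDed with 7 to give 3, while with two input vectors it
   is ANDed with 15.  On big-endian targets the extra XOR with nelt - 1 then
   reverses the position of the element within each input vector, without
   changing which vector is selected.  */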
17359 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17361 static void
17362 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17364 emit_insn (gen_rtx_SET (target,
17365 gen_rtx_UNSPEC (GET_MODE (target),
17366 gen_rtvec (2, op0, op1), code)));
17369 /* Expand an SVE vec_perm with the given operands. */
17371 void
17372 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17374 machine_mode data_mode = GET_MODE (target);
17375 machine_mode sel_mode = GET_MODE (sel);
17376 /* Enforced by the pattern condition. */
17377 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17379 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17380 size of the two value vectors, i.e. the upper bits of the indices
17381 are effectively ignored. SVE TBL instead produces 0 for any
17382 out-of-range indices, so we need to modulo all the vec_perm indices
17383 to ensure they are all in range. */
17384 rtx sel_reg = force_reg (sel_mode, sel);
17386 /* Check if the sel only references the first values vector. */
17387 if (GET_CODE (sel) == CONST_VECTOR
17388 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17390 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17391 return;
17394 /* Check if the two values vectors are the same. */
17395 if (rtx_equal_p (op0, op1))
17397 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17398 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17399 NULL, 0, OPTAB_DIRECT);
17400 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17401 return;
17404 /* Run TBL on each value vector and combine the results. */
17406 rtx res0 = gen_reg_rtx (data_mode);
17407 rtx res1 = gen_reg_rtx (data_mode);
17408 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17409 if (GET_CODE (sel) != CONST_VECTOR
17410 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17412 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17413 2 * nunits - 1);
17414 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17415 NULL, 0, OPTAB_DIRECT);
17417 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17418 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17419 NULL, 0, OPTAB_DIRECT);
17420 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17421 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17422 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17423 else
17424 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
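/* Worked example of the general case above, for 4 elements per vector and
   SEL = { 1, 4, 6, 3 }: the first TBL uses { 1, 4, 6, 3 } on OP0 and gives
   { op0[1], 0, 0, op0[3] }, because 4 and 6 are out of range; adding
   -nunits gives { -3, 0, 2, -1 }, so the second TBL on OP1 gives
   { 0, op1[0], op1[2], 0 }; ORing the two results yields the permutation
   { op0[1], op1[0], op1[2], op0[3] }.  */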
17427 /* Recognize patterns suitable for the TRN instructions. */
17428 static bool
17429 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17431 HOST_WIDE_INT odd;
17432 poly_uint64 nelt = d->perm.length ();
17433 rtx out, in0, in1, x;
17434 machine_mode vmode = d->vmode;
17436 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17437 return false;
17439 /* Note that these are little-endian tests.
17440 We correct for big-endian later. */
17441 if (!d->perm[0].is_constant (&odd)
17442 || (odd != 0 && odd != 1)
17443 || !d->perm.series_p (0, 2, odd, 2)
17444 || !d->perm.series_p (1, 2, nelt + odd, 2))
17445 return false;
17447 /* Success! */
17448 if (d->testing_p)
17449 return true;
17451 in0 = d->op0;
17452 in1 = d->op1;
17453 /* We don't need a big-endian lane correction for SVE; see the comment
17454 at the head of aarch64-sve.md for details. */
17455 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17457 x = in0, in0 = in1, in1 = x;
17458 odd = !odd;
17460 out = d->target;
17462 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17463 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17464 return true;
17467 /* Recognize patterns suitable for the UZP instructions. */
17468 static bool
17469 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17471 HOST_WIDE_INT odd;
17472 rtx out, in0, in1, x;
17473 machine_mode vmode = d->vmode;
17475 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17476 return false;
17478 /* Note that these are little-endian tests.
17479 We correct for big-endian later. */
17480 if (!d->perm[0].is_constant (&odd)
17481 || (odd != 0 && odd != 1)
17482 || !d->perm.series_p (0, 1, odd, 2))
17483 return false;
17485 /* Success! */
17486 if (d->testing_p)
17487 return true;
17489 in0 = d->op0;
17490 in1 = d->op1;
17491 /* We don't need a big-endian lane correction for SVE; see the comment
17492 at the head of aarch64-sve.md for details. */
17493 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17495 x = in0, in0 = in1, in1 = x;
17496 odd = !odd;
17498 out = d->target;
17500 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17501 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17502 return true;
17505 /* Recognize patterns suitable for the ZIP instructions. */
17506 static bool
17507 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17509 unsigned int high;
17510 poly_uint64 nelt = d->perm.length ();
17511 rtx out, in0, in1, x;
17512 machine_mode vmode = d->vmode;
17514 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17515 return false;
17517 /* Note that these are little-endian tests.
17518 We correct for big-endian later. */
17519 poly_uint64 first = d->perm[0];
17520 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17521 || !d->perm.series_p (0, 2, first, 1)
17522 || !d->perm.series_p (1, 2, first + nelt, 1))
17523 return false;
17524 high = maybe_ne (first, 0U);
17526 /* Success! */
17527 if (d->testing_p)
17528 return true;
17530 in0 = d->op0;
17531 in1 = d->op1;
17532 /* We don't need a big-endian lane correction for SVE; see the comment
17533 at the head of aarch64-sve.md for details. */
17534 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17536 x = in0, in0 = in1, in1 = x;
17537 high = !high;
17539 out = d->target;
17541 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17542 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17543 return true;
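/* For reference, with V4SI operands the little-endian index patterns
   recognized by the three functions above are: TRN1 { 0, 4, 2, 6 },
   TRN2 { 1, 5, 3, 7 }, UZP1 { 0, 2, 4, 6 }, UZP2 { 1, 3, 5, 7 },
   ZIP1 { 0, 4, 1, 5 } and ZIP2 { 2, 6, 3, 7 }.  */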
17546 /* Recognize patterns for the EXT insn. */
17548 static bool
17549 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17551 HOST_WIDE_INT location;
17552 rtx offset;
17554 /* The first element always refers to the first vector.
17555 Check if the extracted indices are increasing by one. */
17556 if (d->vec_flags == VEC_SVE_PRED
17557 || !d->perm[0].is_constant (&location)
17558 || !d->perm.series_p (0, 1, location, 1))
17559 return false;
17561 /* Success! */
17562 if (d->testing_p)
17563 return true;
17565 /* The case where (location == 0) is a no-op for both big- and little-endian,
17566 and is removed by the mid-end at optimization levels -O1 and higher.
17568 We don't need a big-endian lane correction for SVE; see the comment
17569 at the head of aarch64-sve.md for details. */
17570 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17572 /* After setup, we want the high elements of the first vector (stored
17573 at the LSB end of the register), and the low elements of the second
17574 vector (stored at the MSB end of the register). So swap. */
17575 std::swap (d->op0, d->op1);
17576 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17577 to_constant () is safe since this is restricted to Advanced SIMD
17578 vectors. */
17579 location = d->perm.length ().to_constant () - location;
17582 offset = GEN_INT (location);
17583 emit_set_insn (d->target,
17584 gen_rtx_UNSPEC (d->vmode,
17585 gen_rtvec (3, d->op0, d->op1, offset),
17586 UNSPEC_EXT));
17587 return true;
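/* For example, the V4SI permutation { 1, 2, 3, 4 } (LOCATION 1) selects the
   last three elements of the first vector followed by the first element of
   the second, which on little-endian corresponds to an EXT with a 4-byte
   offset.  */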
17590 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17591 within each 64-bit, 32-bit or 16-bit granule. */
17593 static bool
17594 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17596 HOST_WIDE_INT diff;
17597 unsigned int i, size, unspec;
17598 machine_mode pred_mode;
17600 if (d->vec_flags == VEC_SVE_PRED
17601 || !d->one_vector_p
17602 || !d->perm[0].is_constant (&diff))
17603 return false;
17605 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17606 if (size == 8)
17608 unspec = UNSPEC_REV64;
17609 pred_mode = VNx2BImode;
17611 else if (size == 4)
17613 unspec = UNSPEC_REV32;
17614 pred_mode = VNx4BImode;
17616 else if (size == 2)
17618 unspec = UNSPEC_REV16;
17619 pred_mode = VNx8BImode;
17621 else
17622 return false;
17624 unsigned int step = diff + 1;
17625 for (i = 0; i < step; ++i)
17626 if (!d->perm.series_p (i, step, diff - i, step))
17627 return false;
17629 /* Success! */
17630 if (d->testing_p)
17631 return true;
17633 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17634 if (d->vec_flags == VEC_SVE_DATA)
17636 rtx pred = aarch64_ptrue_reg (pred_mode);
17637 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17638 UNSPEC_PRED_X);
17640 emit_set_insn (d->target, src);
17641 return true;
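/* For example, the V8HI permutation { 3, 2, 1, 0, 7, 6, 5, 4 } has
   diff == 3, so size == 8 and it is matched as REV64: each group of four
   16-bit elements is reversed within its 64-bit granule.  */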
17644 /* Recognize patterns for the REV insn, which reverses elements within
17645 a full vector. */
17647 static bool
17648 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17650 poly_uint64 nelt = d->perm.length ();
17652 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17653 return false;
17655 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17656 return false;
17658 /* Success! */
17659 if (d->testing_p)
17660 return true;
17662 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17663 emit_set_insn (d->target, src);
17664 return true;
17667 static bool
17668 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17670 rtx out = d->target;
17671 rtx in0;
17672 HOST_WIDE_INT elt;
17673 machine_mode vmode = d->vmode;
17674 rtx lane;
17676 if (d->vec_flags == VEC_SVE_PRED
17677 || d->perm.encoding ().encoded_nelts () != 1
17678 || !d->perm[0].is_constant (&elt))
17679 return false;
17681 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17682 return false;
17684 /* Success! */
17685 if (d->testing_p)
17686 return true;
17688 /* The generic preparation in aarch64_expand_vec_perm_const_1
17689 swaps the operand order and the permute indices if it finds
17690 d->perm[0] to be in the second operand. Thus, we can always
17691 use d->op0 and need not do any extra arithmetic to get the
17692 correct lane number. */
17693 in0 = d->op0;
17694 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17696 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17697 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17698 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17699 return true;
17702 static bool
17703 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17705 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17706 machine_mode vmode = d->vmode;
17708 /* Make sure that the indices are constant. */
17709 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17710 for (unsigned int i = 0; i < encoded_nelts; ++i)
17711 if (!d->perm[i].is_constant ())
17712 return false;
17714 if (d->testing_p)
17715 return true;
17717 /* Generic code will try constant permutation twice: once with the
17718 original mode and again with the elements lowered to QImode.
17719 So wait and don't do the selector expansion ourselves. */
17720 if (vmode != V8QImode && vmode != V16QImode)
17721 return false;
17723 /* to_constant is safe since this routine is specific to Advanced SIMD
17724 vectors. */
17725 unsigned int nelt = d->perm.length ().to_constant ();
17726 for (unsigned int i = 0; i < nelt; ++i)
17727 /* If big-endian and two vectors we end up with a weird mixed-endian
17728 mode on NEON. Reverse the index within each word but not the word
17729 itself. to_constant is safe because we checked is_constant above. */
17730 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17731 ? d->perm[i].to_constant () ^ (nelt - 1)
17732 : d->perm[i].to_constant ());
17734 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17735 sel = force_reg (vmode, sel);
17737 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17738 return true;
17741 /* Try to implement D using an SVE TBL instruction. */
17743 static bool
17744 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17746 unsigned HOST_WIDE_INT nelt;
17748 /* Permuting two variable-length vectors could overflow the
17749 index range. */
17750 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17751 return false;
17753 if (d->testing_p)
17754 return true;
17756 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17757 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17758 if (d->one_vector_p)
17759 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17760 else
17761 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17762 return true;
17765 static bool
17766 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17768 /* The pattern matching functions above are written to look for a small
17769 number to begin the sequence (0, 1, N/2). If we begin with an index
17770 from the second operand, we can swap the operands. */
17771 poly_int64 nelt = d->perm.length ();
17772 if (known_ge (d->perm[0], nelt))
17774 d->perm.rotate_inputs (1);
17775 std::swap (d->op0, d->op1);
17778 if ((d->vec_flags == VEC_ADVSIMD
17779 || d->vec_flags == VEC_SVE_DATA
17780 || d->vec_flags == VEC_SVE_PRED)
17781 && known_gt (nelt, 1))
17783 if (aarch64_evpc_rev_local (d))
17784 return true;
17785 else if (aarch64_evpc_rev_global (d))
17786 return true;
17787 else if (aarch64_evpc_ext (d))
17788 return true;
17789 else if (aarch64_evpc_dup (d))
17790 return true;
17791 else if (aarch64_evpc_zip (d))
17792 return true;
17793 else if (aarch64_evpc_uzp (d))
17794 return true;
17795 else if (aarch64_evpc_trn (d))
17796 return true;
17797 if (d->vec_flags == VEC_SVE_DATA)
17798 return aarch64_evpc_sve_tbl (d);
17799 else if (d->vec_flags == VEC_ADVSIMD)
17800 return aarch64_evpc_tbl (d);
17802 return false;
17805 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17807 static bool
17808 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17809 rtx op1, const vec_perm_indices &sel)
17811 struct expand_vec_perm_d d;
17813 /* Check whether the mask can be applied to a single vector. */
17814 if (sel.ninputs () == 1
17815 || (op0 && rtx_equal_p (op0, op1)))
17816 d.one_vector_p = true;
17817 else if (sel.all_from_input_p (0))
17819 d.one_vector_p = true;
17820 op1 = op0;
17822 else if (sel.all_from_input_p (1))
17824 d.one_vector_p = true;
17825 op0 = op1;
17827 else
17828 d.one_vector_p = false;
17830 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17831 sel.nelts_per_input ());
17832 d.vmode = vmode;
17833 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17834 d.target = target;
17835 d.op0 = op0;
17836 d.op1 = op1;
17837 d.testing_p = !target;
17839 if (!d.testing_p)
17840 return aarch64_expand_vec_perm_const_1 (&d);
17842 rtx_insn *last = get_last_insn ();
17843 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17844 gcc_assert (last == get_last_insn ());
17846 return ret;
17849 /* Generate a byte permute mask for a register of mode MODE,
17850 which has NUNITS units. */
17852 rtx
17853 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17855 /* We have to reverse each vector because we don't have
17856 a permuted load that can reverse-load according to ABI rules. */
17857 rtx mask;
17858 rtvec v = rtvec_alloc (16);
17859 unsigned int i, j;
17860 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17862 gcc_assert (BYTES_BIG_ENDIAN);
17863 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17865 for (i = 0; i < nunits; i++)
17866 for (j = 0; j < usize; j++)
17867 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17868 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17869 return force_reg (V16QImode, mask);
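/* For example, for V8HImode (usize == 2) the mask built above is the byte
   sequence { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 },
   i.e. the two bytes of each 16-bit element are swapped.  */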
17872 /* Expand an SVE integer comparison using the SVE equivalent of:
17874 (set TARGET (CODE OP0 OP1)). */
17876 void
17877 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17879 machine_mode pred_mode = GET_MODE (target);
17880 machine_mode data_mode = GET_MODE (op0);
17881 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
17882 op0, op1);
17883 if (!rtx_equal_p (target, res))
17884 emit_move_insn (target, res);
17887 /* Return the UNSPEC_COND_* code for comparison CODE. */
17889 static unsigned int
17890 aarch64_unspec_cond_code (rtx_code code)
17892 switch (code)
17894 case NE:
17895 return UNSPEC_COND_FCMNE;
17896 case EQ:
17897 return UNSPEC_COND_FCMEQ;
17898 case LT:
17899 return UNSPEC_COND_FCMLT;
17900 case GT:
17901 return UNSPEC_COND_FCMGT;
17902 case LE:
17903 return UNSPEC_COND_FCMLE;
17904 case GE:
17905 return UNSPEC_COND_FCMGE;
17906 case UNORDERED:
17907 return UNSPEC_COND_FCMUO;
17908 default:
17909 gcc_unreachable ();
17913 /* Emit:
17915 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17917 where <X> is the operation associated with comparison CODE.
17918 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17920 static void
17921 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
17922 bool known_ptrue_p, rtx op0, rtx op1)
17924 rtx flag = gen_int_mode (known_ptrue_p, SImode);
17925 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17926 gen_rtvec (4, pred, flag, op0, op1),
17927 aarch64_unspec_cond_code (code));
17928 emit_set_insn (target, unspec);
17931 /* Emit the SVE equivalent of:
17933 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
17934 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
17935 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17937 where <Xi> is the operation associated with comparison CODEi.
17938 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17940 static void
17941 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
17942 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
17944 machine_mode pred_mode = GET_MODE (pred);
17945 rtx tmp1 = gen_reg_rtx (pred_mode);
17946 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
17947 rtx tmp2 = gen_reg_rtx (pred_mode);
17948 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
17949 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17952 /* Emit the SVE equivalent of:
17954 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17955 (set TARGET (not TMP))
17957 where <X> is the operation associated with comparison CODE.
17958 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17960 static void
17961 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
17962 bool known_ptrue_p, rtx op0, rtx op1)
17964 machine_mode pred_mode = GET_MODE (pred);
17965 rtx tmp = gen_reg_rtx (pred_mode);
17966 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
17967 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17970 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17972 (set TARGET (CODE OP0 OP1))
17974 If CAN_INVERT_P is true, the caller can also handle inverted results;
17975 return true if the result is in fact inverted. */
17977 bool
17978 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17979 rtx op0, rtx op1, bool can_invert_p)
17981 machine_mode pred_mode = GET_MODE (target);
17982 machine_mode data_mode = GET_MODE (op0);
17984 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17985 switch (code)
17987 case UNORDERED:
17988 /* UNORDERED has no immediate form. */
17989 op1 = force_reg (data_mode, op1);
17990 /* fall through */
17991 case LT:
17992 case LE:
17993 case GT:
17994 case GE:
17995 case EQ:
17996 case NE:
17998 /* There is native support for the comparison. */
17999 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18000 return false;
18003 case LTGT:
18004 /* This is a trapping operation (LT or GT). */
18005 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18006 return false;
18008 case UNEQ:
18009 if (!flag_trapping_math)
18011 /* This would trap for signaling NaNs. */
18012 op1 = force_reg (data_mode, op1);
18013 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18014 ptrue, true, op0, op1);
18015 return false;
18017 /* fall through */
18018 case UNLT:
18019 case UNLE:
18020 case UNGT:
18021 case UNGE:
18022 if (flag_trapping_math)
18024 /* Work out which elements are ordered. */
18025 rtx ordered = gen_reg_rtx (pred_mode);
18026 op1 = force_reg (data_mode, op1);
18027 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18028 ptrue, true, op0, op1);
18030 /* Test the opposite condition for the ordered elements,
18031 then invert the result. */
18032 if (code == UNEQ)
18033 code = NE;
18034 else
18035 code = reverse_condition_maybe_unordered (code);
18036 if (can_invert_p)
18038 aarch64_emit_sve_fp_cond (target, code,
18039 ordered, false, op0, op1);
18040 return true;
18042 aarch64_emit_sve_invert_fp_cond (target, code,
18043 ordered, false, op0, op1);
18044 return false;
18046 break;
18048 case ORDERED:
18049 /* ORDERED has no immediate form. */
18050 op1 = force_reg (data_mode, op1);
18051 break;
18053 default:
18054 gcc_unreachable ();
18057 /* There is native support for the inverse comparison. */
18058 code = reverse_condition_maybe_unordered (code);
18059 if (can_invert_p)
18061 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18062 return true;
18064 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18065 return false;
18068 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18069 of the data being selected and CMP_MODE is the mode of the values being
18070 compared. */
18072 void
18073 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18074 rtx *ops)
18076 machine_mode pred_mode
18077 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18078 GET_MODE_SIZE (cmp_mode)).require ();
18079 rtx pred = gen_reg_rtx (pred_mode);
18080 if (FLOAT_MODE_P (cmp_mode))
18082 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18083 ops[4], ops[5], true))
18084 std::swap (ops[1], ops[2]);
18086 else
18087 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18089 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18090 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18093 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18094 true. However, due to issues with register allocation it is preferable
18095 to avoid tying integer scalar and FP scalar modes. Executing integer
18096 operations in general registers is better than treating them as scalar
18097 vector operations. This reduces latency and avoids redundant int<->FP
18098 moves. So tie modes if they are either the same class, or vector modes
18099 with other vector modes, vector structs or any scalar mode. */
18101 static bool
18102 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18104 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18105 return true;
18107 /* We specifically want to allow elements of "structure" modes to
18108 be tieable to the structure. This more general condition allows
18109 other rarer situations too. The reason we don't extend this to
18110 predicate modes is that there are no predicate structure modes
18111 nor any specific instructions for extracting part of a predicate
18112 register. */
18113 if (aarch64_vector_data_mode_p (mode1)
18114 && aarch64_vector_data_mode_p (mode2))
18115 return true;
18117 /* Also allow any scalar modes with vectors. */
18118 if (aarch64_vector_mode_supported_p (mode1)
18119 || aarch64_vector_mode_supported_p (mode2))
18120 return true;
18122 return false;
18125 /* Return a new RTX holding the result of moving POINTER forward by
18126 AMOUNT bytes. */
18128 static rtx
18129 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18131 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18133 return adjust_automodify_address (pointer, GET_MODE (pointer),
18134 next, amount);
18137 /* Return a new RTX holding the result of moving POINTER forward by the
18138 size of the mode it points to. */
18140 static rtx
18141 aarch64_progress_pointer (rtx pointer)
18143 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18146 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18147 MODE bytes. */
18149 static void
18150 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18151 machine_mode mode)
18153 rtx reg = gen_reg_rtx (mode);
18155 /* "Cast" the pointers to the correct mode. */
18156 *src = adjust_address (*src, mode, 0);
18157 *dst = adjust_address (*dst, mode, 0);
18158 /* Emit the memcpy. */
18159 emit_move_insn (reg, *src);
18160 emit_move_insn (*dst, reg);
18161 /* Move the pointers forward. */
18162 *src = aarch64_progress_pointer (*src);
18163 *dst = aarch64_progress_pointer (*dst);
18166 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18167 we succeed, otherwise return false. */
18169 bool
18170 aarch64_expand_cpymem (rtx *operands)
18172 int n, mode_bits;
18173 rtx dst = operands[0];
18174 rtx src = operands[1];
18175 rtx base;
18176 machine_mode cur_mode = BLKmode, next_mode;
18177 bool speed_p = !optimize_function_for_size_p (cfun);
18179 /* When optimizing for size, give a better estimate of the length of a
18180 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18181 will always require an even number of instructions, and each
18182 operation requires both a load and a store, so divide the max number by 2. */
18183 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18185 /* We can't do anything smart if the amount to copy is not constant. */
18186 if (!CONST_INT_P (operands[2]))
18187 return false;
18189 n = INTVAL (operands[2]);
18191 /* Try to keep the number of instructions low. For all cases we will do at
18192 most two moves for the residual amount, since we'll always overlap the
18193 remainder. */
18194 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18195 return false;
18197 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18198 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18200 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18201 src = adjust_automodify_address (src, VOIDmode, base, 0);
18203 /* Convert n to bits to make the rest of the code simpler. */
18204 n = n * BITS_PER_UNIT;
18206 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18207 larger than TImode, but we should not use them for loads/stores here. */
18208 const int copy_limit = GET_MODE_BITSIZE (TImode);
18210 while (n > 0)
18212 /* Find the largest mode in which to do the copy without over-reading
18213 or over-writing. */
18214 opt_scalar_int_mode mode_iter;
18215 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18216 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18217 cur_mode = mode_iter.require ();
18219 gcc_assert (cur_mode != BLKmode);
18221 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18222 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18224 n -= mode_bits;
18226 /* Do certain trailing copies as overlapping if it's going to be
18227 cheaper, i.e. fewer instructions. For instance, for a 15-byte
18228 copy it's more efficient to do two overlapping 8-byte copies than
18229 8 + 6 + 1. */
18230 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18232 next_mode = smallest_mode_for_size (n, MODE_INT);
18233 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18234 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18235 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18236 n = n_bits;
18240 return true;
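/* Worked example of the loop above: a 23-byte copy (when not optimizing for
   size) is emitted as a 16-byte TImode copy at offset 0 followed by an
   8-byte DImode copy at offset 15, the trailing copy overlapping the first
   by one byte rather than using separate 4-, 2- and 1-byte copies.  */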
18243 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18244 SImode stores. Handle the case when the constant has identical
18245 bottom and top halves. This is beneficial when the two stores can be
18246 merged into an STP and we avoid synthesising potentially expensive
18247 immediates twice. Return true if such a split is possible. */
18249 bool
18250 aarch64_split_dimode_const_store (rtx dst, rtx src)
18252 rtx lo = gen_lowpart (SImode, src);
18253 rtx hi = gen_highpart_mode (SImode, DImode, src);
18255 bool size_p = optimize_function_for_size_p (cfun);
18257 if (!rtx_equal_p (lo, hi))
18258 return false;
18260 unsigned int orig_cost
18261 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18262 unsigned int lo_cost
18263 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18265 /* We want to transform:
18266 MOV x1, 49370
18267 MOVK x1, 0x140, lsl 16
18268 MOVK x1, 0xc0da, lsl 32
18269 MOVK x1, 0x140, lsl 48
18270 STR x1, [x0]
18271 into:
18272 MOV w1, 49370
18273 MOVK w1, 0x140, lsl 16
18274 STP w1, w1, [x0]
18275 So we want to perform this only when we save two instructions
18276 or more. When optimizing for size, however, accept any code size
18277 savings we can. */
18278 if (size_p && orig_cost <= lo_cost)
18279 return false;
18281 if (!size_p
18282 && (orig_cost <= lo_cost + 1))
18283 return false;
18285 rtx mem_lo = adjust_address (dst, SImode, 0);
18286 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18287 return false;
18289 rtx tmp_reg = gen_reg_rtx (SImode);
18290 aarch64_expand_mov_immediate (tmp_reg, lo);
18291 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18292 /* Don't emit an explicit store pair as this may not always be profitable.
18293 Let the sched-fusion logic decide whether to merge them. */
18294 emit_move_insn (mem_lo, tmp_reg);
18295 emit_move_insn (mem_hi, tmp_reg);
18297 return true;
18300 /* Generate RTL for a conditional branch with rtx comparison CODE in
18301 mode CC_MODE. The destination of the unlikely conditional branch
18302 is LABEL_REF. */
18304 void
18305 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18306 rtx label_ref)
18308 rtx x;
18309 x = gen_rtx_fmt_ee (code, VOIDmode,
18310 gen_rtx_REG (cc_mode, CC_REGNUM),
18311 const0_rtx);
18313 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18314 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18315 pc_rtx);
18316 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18319 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18321 OP1 represents the TImode source operand 1
18322 OP2 represents the TImode source operand 2
18323 LOW_DEST represents the low half (DImode) of TImode operand 0
18324 LOW_IN1 represents the low half (DImode) of TImode operand 1
18325 LOW_IN2 represents the low half (DImode) of TImode operand 2
18326 HIGH_DEST represents the high half (DImode) of TImode operand 0
18327 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18328 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18330 void
18331 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18332 rtx *low_in1, rtx *low_in2,
18333 rtx *high_dest, rtx *high_in1,
18334 rtx *high_in2)
18336 *low_dest = gen_reg_rtx (DImode);
18337 *low_in1 = gen_lowpart (DImode, op1);
18338 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18339 subreg_lowpart_offset (DImode, TImode));
18340 *high_dest = gen_reg_rtx (DImode);
18341 *high_in1 = gen_highpart (DImode, op1);
18342 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18343 subreg_highpart_offset (DImode, TImode));
18346 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18348 This function differs from 'aarch64_addti_scratch_regs' in that
18349 OP1 can be an immediate constant (zero). We must call
18350 subreg_highpart_offset with DImode and TImode arguments, otherwise
18351 VOIDmode would be used for the const_int, which generates an internal
18352 error from subreg_size_highpart_offset, which does not expect a size of zero.
18354 OP1 represents the TImode source operand 1
18355 OP2 represents the TImode source operand 2
18356 LOW_DEST represents the low half (DImode) of TImode operand 0
18357 LOW_IN1 represents the low half (DImode) of TImode operand 1
18358 LOW_IN2 represents the low half (DImode) of TImode operand 2
18359 HIGH_DEST represents the high half (DImode) of TImode operand 0
18360 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18361 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18364 void
18365 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18366 rtx *low_in1, rtx *low_in2,
18367 rtx *high_dest, rtx *high_in1,
18368 rtx *high_in2)
18370 *low_dest = gen_reg_rtx (DImode);
18371 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18372 subreg_lowpart_offset (DImode, TImode));
18374 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18375 subreg_lowpart_offset (DImode, TImode));
18376 *high_dest = gen_reg_rtx (DImode);
18378 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18379 subreg_highpart_offset (DImode, TImode));
18380 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18381 subreg_highpart_offset (DImode, TImode));
18384 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18386 OP0 represents the TImode destination operand 0
18387 LOW_DEST represents the low half (DImode) of TImode operand 0
18388 LOW_IN1 represents the low half (DImode) of TImode operand 1
18389 LOW_IN2 represents the low half (DImode) of TImode operand 2
18390 HIGH_DEST represents the high half (DImode) of TImode operand 0
18391 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18392 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18393 UNSIGNED_P is true if the operation is being performed on unsigned
18394 values. */
18395 void
18396 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18397 rtx low_in2, rtx high_dest, rtx high_in1,
18398 rtx high_in2, bool unsigned_p)
18400 if (low_in2 == const0_rtx)
18402 low_dest = low_in1;
18403 high_in2 = force_reg (DImode, high_in2);
18404 if (unsigned_p)
18405 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18406 else
18407 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18409 else
18411 if (CONST_INT_P (low_in2))
18413 high_in2 = force_reg (DImode, high_in2);
18414 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18415 GEN_INT (-INTVAL (low_in2))));
18417 else
18418 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18420 if (unsigned_p)
18421 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18422 else
18423 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18426 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18427 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18431 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18433 static unsigned HOST_WIDE_INT
18434 aarch64_asan_shadow_offset (void)
18436 if (TARGET_ILP32)
18437 return (HOST_WIDE_INT_1 << 29);
18438 else
18439 return (HOST_WIDE_INT_1 << 36);
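/* With the usual ASan shadow mapping shadow = (addr >> 3) + offset, the
   values above place the shadow region at 1 << 36 for LP64 and 1 << 29 for
   ILP32.  This is a descriptive note only; the shadow scale itself is
   chosen by the sanitizer support code, not here.  */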
18442 static rtx
18443 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18444 int code, tree treeop0, tree treeop1)
18446 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18447 rtx op0, op1;
18448 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18449 insn_code icode;
18450 struct expand_operand ops[4];
18452 start_sequence ();
18453 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18455 op_mode = GET_MODE (op0);
18456 if (op_mode == VOIDmode)
18457 op_mode = GET_MODE (op1);
18459 switch (op_mode)
18461 case E_QImode:
18462 case E_HImode:
18463 case E_SImode:
18464 cmp_mode = SImode;
18465 icode = CODE_FOR_cmpsi;
18466 break;
18468 case E_DImode:
18469 cmp_mode = DImode;
18470 icode = CODE_FOR_cmpdi;
18471 break;
18473 case E_SFmode:
18474 cmp_mode = SFmode;
18475 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18476 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18477 break;
18479 case E_DFmode:
18480 cmp_mode = DFmode;
18481 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18482 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18483 break;
18485 default:
18486 end_sequence ();
18487 return NULL_RTX;
18490 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18491 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18492 if (!op0 || !op1)
18494 end_sequence ();
18495 return NULL_RTX;
18497 *prep_seq = get_insns ();
18498 end_sequence ();
18500 create_fixed_operand (&ops[0], op0);
18501 create_fixed_operand (&ops[1], op1);
18503 start_sequence ();
18504 if (!maybe_expand_insn (icode, 2, ops))
18506 end_sequence ();
18507 return NULL_RTX;
18509 *gen_seq = get_insns ();
18510 end_sequence ();
18512 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18513 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18516 static rtx
18517 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18518 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18520 rtx op0, op1, target;
18521 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18522 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18523 insn_code icode;
18524 struct expand_operand ops[6];
18525 int aarch64_cond;
18527 push_to_sequence (*prep_seq);
18528 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18530 op_mode = GET_MODE (op0);
18531 if (op_mode == VOIDmode)
18532 op_mode = GET_MODE (op1);
18534 switch (op_mode)
18536 case E_QImode:
18537 case E_HImode:
18538 case E_SImode:
18539 cmp_mode = SImode;
18540 icode = CODE_FOR_ccmpsi;
18541 break;
18543 case E_DImode:
18544 cmp_mode = DImode;
18545 icode = CODE_FOR_ccmpdi;
18546 break;
18548 case E_SFmode:
18549 cmp_mode = SFmode;
18550 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18551 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18552 break;
18554 case E_DFmode:
18555 cmp_mode = DFmode;
18556 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18557 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18558 break;
18560 default:
18561 end_sequence ();
18562 return NULL_RTX;
18565 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18566 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18567 if (!op0 || !op1)
18569 end_sequence ();
18570 return NULL_RTX;
18572 *prep_seq = get_insns ();
18573 end_sequence ();
18575 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18576 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18578 if (bit_code != AND)
18580 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18581 GET_MODE (XEXP (prev, 0))),
18582 VOIDmode, XEXP (prev, 0), const0_rtx);
18583 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18586 create_fixed_operand (&ops[0], XEXP (prev, 0));
18587 create_fixed_operand (&ops[1], target);
18588 create_fixed_operand (&ops[2], op0);
18589 create_fixed_operand (&ops[3], op1);
18590 create_fixed_operand (&ops[4], prev);
18591 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18593 push_to_sequence (*gen_seq);
18594 if (!maybe_expand_insn (icode, 6, ops))
18596 end_sequence ();
18597 return NULL_RTX;
18600 *gen_seq = get_insns ();
18601 end_sequence ();
18603 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
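/* Taken together, aarch64_gen_ccmp_first and aarch64_gen_ccmp_next let the
   middle-end turn a condition such as "a == b && c < d" into roughly the
   following shape (an illustrative sketch only):

	 cmp	w0, w1
	 ccmp	w2, w3, #nzcv, eq
	 b.lt	...

   where the #nzcv immediate is chosen so that the final condition is false
   whenever the first comparison fails.  */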
18606 #undef TARGET_GEN_CCMP_FIRST
18607 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18609 #undef TARGET_GEN_CCMP_NEXT
18610 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18612 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18613 instruction fusion of some sort. */
18615 static bool
18616 aarch64_macro_fusion_p (void)
18618 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18622 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18623 should be kept together during scheduling. */
18625 static bool
18626 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18628 rtx set_dest;
18629 rtx prev_set = single_set (prev);
18630 rtx curr_set = single_set (curr);
18631 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18632 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18634 if (!aarch64_macro_fusion_p ())
18635 return false;
18637 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18639 /* We are trying to match:
18640 prev (mov) == (set (reg r0) (const_int imm16))
18641 curr (movk) == (set (zero_extract (reg r0)
18642 (const_int 16)
18643 (const_int 16))
18644 (const_int imm16_1)) */
18646 set_dest = SET_DEST (curr_set);
18648 if (GET_CODE (set_dest) == ZERO_EXTRACT
18649 && CONST_INT_P (SET_SRC (curr_set))
18650 && CONST_INT_P (SET_SRC (prev_set))
18651 && CONST_INT_P (XEXP (set_dest, 2))
18652 && INTVAL (XEXP (set_dest, 2)) == 16
18653 && REG_P (XEXP (set_dest, 0))
18654 && REG_P (SET_DEST (prev_set))
18655 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18657 return true;
18661 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18664 /* We're trying to match:
18665 prev (adrp) == (set (reg r1)
18666 (high (symbol_ref ("SYM"))))
18667 curr (add) == (set (reg r0)
18668 (lo_sum (reg r1)
18669 (symbol_ref ("SYM"))))
18670 Note that r0 need not necessarily be the same as r1, especially
18671 during pre-regalloc scheduling. */
18673 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18674 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18676 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18677 && REG_P (XEXP (SET_SRC (curr_set), 0))
18678 && REGNO (XEXP (SET_SRC (curr_set), 0))
18679 == REGNO (SET_DEST (prev_set))
18680 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18681 XEXP (SET_SRC (curr_set), 1)))
18682 return true;
18686 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18689 /* We're trying to match:
18690 prev (movk) == (set (zero_extract (reg r0)
18691 (const_int 16)
18692 (const_int 32))
18693 (const_int imm16_1))
18694 curr (movk) == (set (zero_extract (reg r0)
18695 (const_int 16)
18696 (const_int 48))
18697 (const_int imm16_2)) */
18699 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18700 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18701 && REG_P (XEXP (SET_DEST (prev_set), 0))
18702 && REG_P (XEXP (SET_DEST (curr_set), 0))
18703 && REGNO (XEXP (SET_DEST (prev_set), 0))
18704 == REGNO (XEXP (SET_DEST (curr_set), 0))
18705 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18706 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18707 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18708 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18709 && CONST_INT_P (SET_SRC (prev_set))
18710 && CONST_INT_P (SET_SRC (curr_set)))
18711 return true;
18714 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18716 /* We're trying to match:
18717 prev (adrp) == (set (reg r0)
18718 (high (symbol_ref ("SYM"))))
18719 curr (ldr) == (set (reg r1)
18720 (mem (lo_sum (reg r0)
18721 (symbol_ref ("SYM")))))
18723 curr (ldr) == (set (reg r1)
18724 (zero_extend (mem
18725 (lo_sum (reg r0)
18726 (symbol_ref ("SYM")))))) */
18727 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18728 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18730 rtx curr_src = SET_SRC (curr_set);
18732 if (GET_CODE (curr_src) == ZERO_EXTEND)
18733 curr_src = XEXP (curr_src, 0);
18735 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18736 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18737 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18738 == REGNO (SET_DEST (prev_set))
18739 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18740 XEXP (SET_SRC (prev_set), 0)))
18741 return true;
18745 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18746 && any_condjump_p (curr))
18748 unsigned int condreg1, condreg2;
18749 rtx cc_reg_1;
18750 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18751 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18753 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18754 && prev
18755 && modified_in_p (cc_reg_1, prev))
18757 enum attr_type prev_type = get_attr_type (prev);
18759 /* FIXME: this misses some instructions that ThunderX considers simple
18760 arithmetic; for example, simple shifts are missed here. */
18761 if (prev_type == TYPE_ALUS_SREG
18762 || prev_type == TYPE_ALUS_IMM
18763 || prev_type == TYPE_LOGICS_REG
18764 || prev_type == TYPE_LOGICS_IMM)
18765 return true;
18769 if (prev_set
18770 && curr_set
18771 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18772 && any_condjump_p (curr))
18774 /* We're trying to match:
18775 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18776 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18777 (const_int 0))
18778 (label_ref ("SYM"))
18779 (pc)) */
18780 if (SET_DEST (curr_set) == (pc_rtx)
18781 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18782 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18783 && REG_P (SET_DEST (prev_set))
18784 && REGNO (SET_DEST (prev_set))
18785 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18787 /* Fuse ALU operations followed by a conditional branch instruction. */
18788 switch (get_attr_type (prev))
18790 case TYPE_ALU_IMM:
18791 case TYPE_ALU_SREG:
18792 case TYPE_ADC_REG:
18793 case TYPE_ADC_IMM:
18794 case TYPE_ADCS_REG:
18795 case TYPE_ADCS_IMM:
18796 case TYPE_LOGIC_REG:
18797 case TYPE_LOGIC_IMM:
18798 case TYPE_CSEL:
18799 case TYPE_ADR:
18800 case TYPE_MOV_IMM:
18801 case TYPE_SHIFT_REG:
18802 case TYPE_SHIFT_IMM:
18803 case TYPE_BFM:
18804 case TYPE_RBIT:
18805 case TYPE_REV:
18806 case TYPE_EXTEND:
18807 return true;
18809 default:;
18814 return false;
18817 /* Return true iff the instruction fusion described by OP is enabled. */
18819 bool
18820 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18822 return (aarch64_tune_params.fusible_ops & op) != 0;
18825 /* If MEM is in the form of [base+offset], extract the two parts of the
18826 address and store them in BASE and OFFSET; otherwise return false
18827 after clearing BASE and OFFSET. */
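/* For illustration only (a simplified sketch with hypothetical registers,
   not an exhaustive list of the address forms seen in practice):

     (mem (reg x1))                        -> *BASE = x1, *OFFSET = 0
     (mem (plus (reg x1) (const_int 16)))  -> *BASE = x1, *OFFSET = 16
     (mem (post_inc (reg x1)))             -> both cleared, returns false.  */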
18829 bool
18830 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18832 rtx addr;
18834 gcc_assert (MEM_P (mem));
18836 addr = XEXP (mem, 0);
18838 if (REG_P (addr))
18840 *base = addr;
18841 *offset = const0_rtx;
18842 return true;
18845 if (GET_CODE (addr) == PLUS
18846 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18848 *base = XEXP (addr, 0);
18849 *offset = XEXP (addr, 1);
18850 return true;
18853 *base = NULL_RTX;
18854 *offset = NULL_RTX;
18856 return false;
18859 /* Types for scheduling fusion. */
18860 enum sched_fusion_type
18862 SCHED_FUSION_NONE = 0,
18863 SCHED_FUSION_LD_SIGN_EXTEND,
18864 SCHED_FUSION_LD_ZERO_EXTEND,
18865 SCHED_FUSION_LD,
18866 SCHED_FUSION_ST,
18867 SCHED_FUSION_NUM
18870 /* If INSN is a load or store whose address is in the form [base+offset],
18871 extract the two parts into BASE and OFFSET. Return the scheduling
18872 fusion type of INSN. */
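/* As an illustrative sketch (simplified RTL, hypothetical registers):

     (set (reg:DI x0) (mem:DI (plus:DI (reg:DI x1) (const_int 8))))
       -> SCHED_FUSION_LD, *BASE = x1, *OFFSET = 8
     (set (mem:DI (reg:DI x1)) (reg:DI x0))
       -> SCHED_FUSION_ST, *BASE = x1, *OFFSET = 0.  */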
18874 static enum sched_fusion_type
18875 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18877 rtx x, dest, src;
18878 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18880 gcc_assert (INSN_P (insn));
18881 x = PATTERN (insn);
18882 if (GET_CODE (x) != SET)
18883 return SCHED_FUSION_NONE;
18885 src = SET_SRC (x);
18886 dest = SET_DEST (x);
18888 machine_mode dest_mode = GET_MODE (dest);
18890 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18891 return SCHED_FUSION_NONE;
18893 if (GET_CODE (src) == SIGN_EXTEND)
18895 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18896 src = XEXP (src, 0);
18897 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18898 return SCHED_FUSION_NONE;
18900 else if (GET_CODE (src) == ZERO_EXTEND)
18902 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18903 src = XEXP (src, 0);
18904 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18905 return SCHED_FUSION_NONE;
18908 if (GET_CODE (src) == MEM && REG_P (dest))
18909 extract_base_offset_in_addr (src, base, offset);
18910 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18912 fusion = SCHED_FUSION_ST;
18913 extract_base_offset_in_addr (dest, base, offset);
18915 else
18916 return SCHED_FUSION_NONE;
18918 if (*base == NULL_RTX || *offset == NULL_RTX)
18919 fusion = SCHED_FUSION_NONE;
18921 return fusion;
18924 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18926 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18927 and PRI are only calculated for these instructions. For other instructions,
18928 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18929 types of instruction fusion can be added by returning different priorities.
18931 It's important that irrelevant instructions get the largest FUSION_PRI. */
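/* For example (a rough sketch of the effect, not a precise calculation):
   two loads from the same base register with offsets 8 and 16 receive the
   same FUSION_PRI (same fusion type and base register), so they are grouped
   together, while the load with offset 8 receives the larger PRI and is
   therefore scheduled first within that group.  */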
18933 static void
18934 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18935 int *fusion_pri, int *pri)
18937 int tmp, off_val;
18938 rtx base, offset;
18939 enum sched_fusion_type fusion;
18941 gcc_assert (INSN_P (insn));
18943 tmp = max_pri - 1;
18944 fusion = fusion_load_store (insn, &base, &offset);
18945 if (fusion == SCHED_FUSION_NONE)
18947 *pri = tmp;
18948 *fusion_pri = tmp;
18949 return;
18952 /* Set FUSION_PRI according to fusion type and base register. */
18953 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18955 /* Calculate PRI. */
18956 tmp /= 2;
18958 /* An INSN with a smaller offset goes first. */
18959 off_val = (int)(INTVAL (offset));
18960 if (off_val >= 0)
18961 tmp -= (off_val & 0xfffff);
18962 else
18963 tmp += ((- off_val) & 0xfffff);
18965 *pri = tmp;
18966 return;
18969 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18970 Adjust priority of sha1h instructions so they are scheduled before
18971 other SHA1 instructions. */
18973 static int
18974 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18976 rtx x = PATTERN (insn);
18978 if (GET_CODE (x) == SET)
18980 x = SET_SRC (x);
18982 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18983 return priority + 10;
18986 return priority;
18989 /* Given OPERANDS of consecutive load/store, check if we can merge
18990 them into ldp/stp. LOAD is true if they are load instructions.
18991 MODE is the mode of memory operands. */
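/* For instance (an illustrative sketch; the exact operand order comes from
   the peephole patterns), the pair

     ldr w0, [x2]
     ldr w1, [x2, 4]

   can be merged into "ldp w0, w1, [x2]" provided the checks below pass.  */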
18993 bool
18994 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18995 machine_mode mode)
18997 HOST_WIDE_INT offval_1, offval_2, msize;
18998 enum reg_class rclass_1, rclass_2;
18999 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19001 if (load)
19003 mem_1 = operands[1];
19004 mem_2 = operands[3];
19005 reg_1 = operands[0];
19006 reg_2 = operands[2];
19007 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19008 if (REGNO (reg_1) == REGNO (reg_2))
19009 return false;
19011 else
19013 mem_1 = operands[0];
19014 mem_2 = operands[2];
19015 reg_1 = operands[1];
19016 reg_2 = operands[3];
19019 /* The mems cannot be volatile. */
19020 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19021 return false;
19023 /* If we have SImode and slow unaligned ldp,
19024 check that the alignment is at least 8 bytes. */
19025 if (mode == SImode
19026 && (aarch64_tune_params.extra_tuning_flags
19027 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19028 && !optimize_size
19029 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19030 return false;
19032 /* Check if the addresses are in the form of [base+offset]. */
19033 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19034 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19035 return false;
19036 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19037 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19038 return false;
19040 /* Check if the bases are the same. */
19041 if (!rtx_equal_p (base_1, base_2))
19042 return false;
19044 /* The operands must be of the same size. */
19045 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19046 GET_MODE_SIZE (GET_MODE (mem_2))));
19048 offval_1 = INTVAL (offset_1);
19049 offval_2 = INTVAL (offset_2);
19050 /* We should only be trying this for fixed-sized modes. There is no
19051 SVE LDP/STP instruction. */
19052 msize = GET_MODE_SIZE (mode).to_constant ();
19053 /* Check if the offsets are consecutive. */
19054 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19055 return false;
19057 /* Check if the addresses are clobbered by load. */
19058 if (load)
19060 if (reg_mentioned_p (reg_1, mem_1))
19061 return false;
19063 /* Only the last load (the one at the higher address) may clobber the address. */
19064 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19065 return false;
19068 /* One of the memory accesses must be a mempair operand.
19069 If it is not the first one, they need to be swapped by the
19070 peephole. */
19071 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19072 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19073 return false;
19075 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19076 rclass_1 = FP_REGS;
19077 else
19078 rclass_1 = GENERAL_REGS;
19080 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19081 rclass_2 = FP_REGS;
19082 else
19083 rclass_2 = GENERAL_REGS;
19085 /* Check if the registers are of the same class. */
19086 if (rclass_1 != rclass_2)
19087 return false;
19089 return true;
19092 /* Given OPERANDS of consecutive load/store that can be merged,
19093 swap them if they are not in ascending order. */
19094 void
19095 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19097 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19098 HOST_WIDE_INT offval_1, offval_2;
19100 if (load)
19102 mem_1 = operands[1];
19103 mem_2 = operands[3];
19105 else
19107 mem_1 = operands[0];
19108 mem_2 = operands[2];
19111 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19112 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19114 offval_1 = INTVAL (offset_1);
19115 offval_2 = INTVAL (offset_2);
19117 if (offval_1 > offval_2)
19119 /* Irrespective of whether this is a load or a store,
19120 we do the same swap. */
19121 std::swap (operands[0], operands[2]);
19122 std::swap (operands[1], operands[3]);
19126 /* Taking X and Y to be pointers to HOST_WIDE_INT, return the result of
19127 comparing the two values they point to. */
19129 aarch64_host_wide_int_compare (const void *x, const void *y)
19131 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19132 * ((const HOST_WIDE_INT *) y));
19135 /* Taking X and Y to be pairs of RTX, one element pointing to a MEM rtx
19136 and the other to a REG rtx, compare the offsets of the two MEM
19137 addresses.
19139 Return:
19141 1 iff offset (X) > offset (Y)
19142 0 iff offset (X) == offset (Y)
19143 -1 iff offset (X) < offset (Y) */
19145 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19147 const rtx * operands_1 = (const rtx *) x;
19148 const rtx * operands_2 = (const rtx *) y;
19149 rtx mem_1, mem_2, base, offset_1, offset_2;
19151 if (MEM_P (operands_1[0]))
19152 mem_1 = operands_1[0];
19153 else
19154 mem_1 = operands_1[1];
19156 if (MEM_P (operands_2[0]))
19157 mem_2 = operands_2[0];
19158 else
19159 mem_2 = operands_2[1];
19161 /* Extract the offsets. */
19162 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19163 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19165 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19167 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19170 /* Given OPERANDS of consecutive load/store, check if we can merge
19171 them into ldp/stp by adjusting the offset. LOAD is true if they
19172 are load instructions. MODE is the mode of memory operands.
19174 Given below consecutive stores:
19176 str w1, [xb, 0x100]
19177 str w1, [xb, 0x104]
19178 str w1, [xb, 0x108]
19179 str w1, [xb, 0x10c]
19181 Though the offsets are out of the range supported by stp, we can
19182 still pair them after adjusting the offset, like:
19184 add scratch, xb, 0x100
19185 stp w1, w1, [scratch]
19186 stp w1, w1, [scratch, 0x8]
19188 The peephole patterns detecting this opportunity should guarantee
19189 that the scratch register is available. */
19191 bool
19192 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19193 scalar_mode mode)
19195 const int num_insns = 4;
19196 enum reg_class rclass;
19197 HOST_WIDE_INT offvals[num_insns], msize;
19198 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19200 if (load)
19202 for (int i = 0; i < num_insns; i++)
19204 reg[i] = operands[2 * i];
19205 mem[i] = operands[2 * i + 1];
19207 gcc_assert (REG_P (reg[i]));
19210 /* Do not attempt to merge the loads if the loads clobber each other. */
19211 for (int i = 0; i < 8; i += 2)
19212 for (int j = i + 2; j < 8; j += 2)
19213 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19214 return false;
19216 else
19217 for (int i = 0; i < num_insns; i++)
19219 mem[i] = operands[2 * i];
19220 reg[i] = operands[2 * i + 1];
19223 /* Skip if the first memory operand is by itself valid for ldp/stp. */
19224 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19225 return false;
19227 for (int i = 0; i < num_insns; i++)
19229 /* The mems cannot be volatile. */
19230 if (MEM_VOLATILE_P (mem[i]))
19231 return false;
19233 /* Check if the addresses are in the form of [base+offset]. */
19234 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19235 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19236 return false;
19239 /* Check if the registers are of the same class. */
19240 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19241 ? FP_REGS : GENERAL_REGS;
19243 for (int i = 1; i < num_insns; i++)
19244 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19246 if (rclass != FP_REGS)
19247 return false;
19249 else
19251 if (rclass != GENERAL_REGS)
19252 return false;
19255 /* Only the last register in the order in which they occur
19256 may be clobbered by the load. */
19257 if (rclass == GENERAL_REGS && load)
19258 for (int i = 0; i < num_insns - 1; i++)
19259 if (reg_mentioned_p (reg[i], mem[i]))
19260 return false;
19262 /* Check if the bases are the same. */
19263 for (int i = 0; i < num_insns - 1; i++)
19264 if (!rtx_equal_p (base[i], base[i + 1]))
19265 return false;
19267 for (int i = 0; i < num_insns; i++)
19268 offvals[i] = INTVAL (offset[i]);
19270 msize = GET_MODE_SIZE (mode);
19272 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19273 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19274 aarch64_host_wide_int_compare);
19276 if (!(offvals[1] == offvals[0] + msize
19277 && offvals[3] == offvals[2] + msize))
19278 return false;
19280 /* Check that the offsets are within range of each other. The ldp/stp
19281 instructions have 7-bit immediate offsets, so use 0x80. */
19282 if (offvals[2] - offvals[0] >= msize * 0x80)
19283 return false;
19285 /* The offsets must be aligned with respect to each other. */
19286 if (offvals[0] % msize != offvals[2] % msize)
19287 return false;
19289 /* If we have SImode and slow unaligned ldp,
19290 check that the alignment is at least 8 bytes. */
19291 if (mode == SImode
19292 && (aarch64_tune_params.extra_tuning_flags
19293 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19294 && !optimize_size
19295 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19296 return false;
19298 return true;
19301 /* Given OPERANDS of consecutive load/store, this function pairs them
19302 into LDP/STP after adjusting the offset. It depends on the fact
19303 that the operands can be sorted so the offsets are correct for STP.
19304 MODE is the mode of the memory operands. CODE is the rtl operator
19305 that should be applied to all memory operands; it is SIGN_EXTEND,
19306 ZERO_EXTEND or UNKNOWN. */
19308 bool
19309 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19310 scalar_mode mode, RTX_CODE code)
19312 rtx base, offset_1, offset_3, t1, t2;
19313 rtx mem_1, mem_2, mem_3, mem_4;
19314 rtx temp_operands[8];
19315 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19316 stp_off_upper_limit, stp_off_lower_limit, msize;
19318 /* We make changes on a copy as we may still bail out. */
19319 for (int i = 0; i < 8; i ++)
19320 temp_operands[i] = operands[i];
19322 /* Sort the operands. */
19323 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19325 /* Copy the memory operands so that if we have to bail for some
19326 reason the original addresses are unchanged. */
19327 if (load)
19329 mem_1 = copy_rtx (temp_operands[1]);
19330 mem_2 = copy_rtx (temp_operands[3]);
19331 mem_3 = copy_rtx (temp_operands[5]);
19332 mem_4 = copy_rtx (temp_operands[7]);
19334 else
19336 mem_1 = copy_rtx (temp_operands[0]);
19337 mem_2 = copy_rtx (temp_operands[2]);
19338 mem_3 = copy_rtx (temp_operands[4]);
19339 mem_4 = copy_rtx (temp_operands[6]);
19340 gcc_assert (code == UNKNOWN);
19343 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19344 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19345 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19346 && offset_3 != NULL_RTX);
19348 /* Adjust offset so it can fit in LDP/STP instruction. */
19349 msize = GET_MODE_SIZE (mode);
19350 stp_off_upper_limit = msize * (0x40 - 1);
19351 stp_off_lower_limit = - msize * 0x40;
19353 off_val_1 = INTVAL (offset_1);
19354 off_val_3 = INTVAL (offset_3);
19356 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19357 if (msize <= 4)
19358 base_off = (off_val_1 + off_val_3) / 2;
19359 else
19360 /* However, due to issues with negative LDP/STP offset generation for
19361 larger modes (DF, DI and vector modes), we must not use negative
19362 addresses smaller than what 9 signed unadjusted bits can store. This
19363 provides the most range in this case. */
19364 base_off = off_val_1;
19366 /* Adjust the base so that it is aligned with the addresses but still
19367 optimal. */
19368 if (base_off % msize != off_val_1 % msize)
19369 /* Fix the offset, bearing in mind we want to make it bigger, not
19370 smaller. */
19371 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19372 else if (msize <= 4)
19373 /* The negative range of LDP/STP is one larger than the positive range. */
19374 base_off += msize;
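/* As a worked example of the heuristics above (illustrative only):
   for four SImode accesses at offsets 0x100, 0x104, 0x108 and 0x10c,
   msize == 4 and base_off starts at (0x100 + 0x108) / 2 == 0x104; it is
   already aligned with off_val_1, so it is bumped by msize to 0x108,
   giving new offsets of -8 and 0 for the two pairs below.  */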
19376 /* Check if base offset is too big or too small. We can attempt to resolve
19377 this issue by setting it to the maximum value and seeing if the offsets
19378 still fit. */
19379 if (base_off >= 0x1000)
19381 base_off = 0x1000 - 1;
19382 /* We must still make sure that the base offset is aligned with respect
19383 to the address. But it may not be made any bigger. */
19384 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19387 /* Likewise for the case where the base is too small. */
19388 if (base_off <= -0x1000)
19390 base_off = -0x1000 + 1;
19391 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19394 /* Offset of the first STP/LDP. */
19395 new_off_1 = off_val_1 - base_off;
19397 /* Offset of the second STP/LDP. */
19398 new_off_3 = off_val_3 - base_off;
19400 /* The offsets must be within the range of the LDP/STP instructions. */
19401 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19402 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19403 return false;
19405 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19406 new_off_1), true);
19407 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19408 new_off_1 + msize), true);
19409 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19410 new_off_3), true);
19411 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19412 new_off_3 + msize), true);
19414 if (!aarch64_mem_pair_operand (mem_1, mode)
19415 || !aarch64_mem_pair_operand (mem_3, mode))
19416 return false;
19418 if (code == ZERO_EXTEND)
19420 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19421 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19422 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19423 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19425 else if (code == SIGN_EXTEND)
19427 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19428 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19429 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19430 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19433 if (load)
19435 operands[0] = temp_operands[0];
19436 operands[1] = mem_1;
19437 operands[2] = temp_operands[2];
19438 operands[3] = mem_2;
19439 operands[4] = temp_operands[4];
19440 operands[5] = mem_3;
19441 operands[6] = temp_operands[6];
19442 operands[7] = mem_4;
19444 else
19446 operands[0] = mem_1;
19447 operands[1] = temp_operands[1];
19448 operands[2] = mem_2;
19449 operands[3] = temp_operands[3];
19450 operands[4] = mem_3;
19451 operands[5] = temp_operands[5];
19452 operands[6] = mem_4;
19453 operands[7] = temp_operands[7];
19456 /* Emit adjusting instruction. */
19457 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19458 /* Emit ldp/stp instructions. */
19459 t1 = gen_rtx_SET (operands[0], operands[1]);
19460 t2 = gen_rtx_SET (operands[2], operands[3]);
19461 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19462 t1 = gen_rtx_SET (operands[4], operands[5]);
19463 t2 = gen_rtx_SET (operands[6], operands[7]);
19464 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19465 return true;
19468 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19469 it isn't worth branching around empty masked ops (including masked
19470 stores). */
19472 static bool
19473 aarch64_empty_mask_is_expensive (unsigned)
19475 return false;
19478 /* Return true if a pseudo register should be created and used to hold
19479 the GOT address for PIC code. */
19481 bool
19482 aarch64_use_pseudo_pic_reg (void)
19484 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19487 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19489 static int
19490 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19492 switch (XINT (x, 1))
19494 case UNSPEC_GOTSMALLPIC:
19495 case UNSPEC_GOTSMALLPIC28K:
19496 case UNSPEC_GOTTINYPIC:
19497 return 0;
19498 default:
19499 break;
19502 return default_unspec_may_trap_p (x, flags);
19506 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19507 return the log2 of that value. Otherwise return -1. */
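/* A few illustrative values (not an exhaustive list):
   1.0 -> 0, 4.0 -> 2, 0.5 -> -1 (not an integer),
   3.0 -> -1 (not a power of 2), -2.0 -> -1 (negative).  */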
19510 aarch64_fpconst_pow_of_2 (rtx x)
19512 const REAL_VALUE_TYPE *r;
19514 if (!CONST_DOUBLE_P (x))
19515 return -1;
19517 r = CONST_DOUBLE_REAL_VALUE (x);
19519 if (REAL_VALUE_NEGATIVE (*r)
19520 || REAL_VALUE_ISNAN (*r)
19521 || REAL_VALUE_ISINF (*r)
19522 || !real_isinteger (r, DFmode))
19523 return -1;
19525 return exact_log2 (real_to_integer (r));
19528 /* If X is a vector of equal CONST_DOUBLE values and that value is
19529 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19532 aarch64_vec_fpconst_pow_of_2 (rtx x)
19534 int nelts;
19535 if (GET_CODE (x) != CONST_VECTOR
19536 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19537 return -1;
19539 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19540 return -1;
19542 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19543 if (firstval <= 0)
19544 return -1;
19546 for (int i = 1; i < nelts; i++)
19547 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19548 return -1;
19550 return firstval;
19553 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19554 to float.
19556 __fp16 always promotes through this hook.
19557 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19558 through the generic excess precision logic rather than here. */
19560 static tree
19561 aarch64_promoted_type (const_tree t)
19563 if (SCALAR_FLOAT_TYPE_P (t)
19564 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19565 return float_type_node;
19567 return NULL_TREE;
19570 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19572 static bool
19573 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19574 optimization_type opt_type)
19576 switch (op)
19578 case rsqrt_optab:
19579 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19581 default:
19582 return true;
19586 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19588 static unsigned int
19589 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19590 int *offset)
19592 /* Polynomial invariant 1 == (VG / 2) - 1. */
19593 gcc_assert (i == 1);
19594 *factor = 2;
19595 *offset = 1;
19596 return AARCH64_DWARF_VG;
19599 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19600 if MODE is HFmode, and punt to the generic implementation otherwise. */
19602 static bool
19603 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19605 return (mode == HFmode
19606 ? true
19607 : default_libgcc_floating_mode_supported_p (mode));
19610 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19611 if MODE is HFmode, and punt to the generic implementation otherwise. */
19613 static bool
19614 aarch64_scalar_mode_supported_p (scalar_mode mode)
19616 return (mode == HFmode
19617 ? true
19618 : default_scalar_mode_supported_p (mode));
19621 /* Set the value of FLT_EVAL_METHOD.
19622 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19624 0: evaluate all operations and constants, whose semantic type has at
19625 most the range and precision of type float, to the range and
19626 precision of float; evaluate all other operations and constants to
19627 the range and precision of the semantic type;
19629 N, where _FloatN is a supported interchange floating type:
19630 evaluate all operations and constants, whose semantic type has at
19631 most the range and precision of _FloatN type, to the range and
19632 precision of the _FloatN type; evaluate all other operations and
19633 constants to the range and precision of the semantic type;
19635 If we have the ARMv8.2-A extensions then we support _Float16 in native
19636 precision, so we should set this to 16. Otherwise, we support the type,
19637 but want to evaluate expressions in float precision, so set this to
19638 0. */
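/* As a user-visible sketch of the difference (hypothetical source code,
   for exposition only):

     _Float16 a, b, c;
     c = a * b + c;

   With TARGET_FP_F16INST the arithmetic is evaluated in _Float16;
   without it, the operands are promoted and the arithmetic is carried
   out in float, converting back to _Float16 only on the assignment.  */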
19640 static enum flt_eval_method
19641 aarch64_excess_precision (enum excess_precision_type type)
19643 switch (type)
19645 case EXCESS_PRECISION_TYPE_FAST:
19646 case EXCESS_PRECISION_TYPE_STANDARD:
19647 /* We can calculate either in 16-bit range and precision or
19648 32-bit range and precision. Make that decision based on whether
19649 we have native support for the ARMv8.2-A 16-bit floating-point
19650 instructions or not. */
19651 return (TARGET_FP_F16INST
19652 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19653 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19654 case EXCESS_PRECISION_TYPE_IMPLICIT:
19655 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19656 default:
19657 gcc_unreachable ();
19659 return FLT_EVAL_METHOD_UNPREDICTABLE;
19662 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19663 scheduled for speculative execution. Reject the long-running division
19664 and square-root instructions. */
19666 static bool
19667 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19669 switch (get_attr_type (insn))
19671 case TYPE_SDIV:
19672 case TYPE_UDIV:
19673 case TYPE_FDIVS:
19674 case TYPE_FDIVD:
19675 case TYPE_FSQRTS:
19676 case TYPE_FSQRTD:
19677 case TYPE_NEON_FP_SQRT_S:
19678 case TYPE_NEON_FP_SQRT_D:
19679 case TYPE_NEON_FP_SQRT_S_Q:
19680 case TYPE_NEON_FP_SQRT_D_Q:
19681 case TYPE_NEON_FP_DIV_S:
19682 case TYPE_NEON_FP_DIV_D:
19683 case TYPE_NEON_FP_DIV_S_Q:
19684 case TYPE_NEON_FP_DIV_D_Q:
19685 return false;
19686 default:
19687 return true;
19691 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19693 static int
19694 aarch64_compute_pressure_classes (reg_class *classes)
19696 int i = 0;
19697 classes[i++] = GENERAL_REGS;
19698 classes[i++] = FP_REGS;
19699 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19700 registers need to go in PR_LO_REGS at some point during their
19701 lifetime. Splitting it into two halves has the effect of making
19702 all predicates count against PR_LO_REGS, so that we try whenever
19703 possible to restrict the number of live predicates to 8. This
19704 greatly reduces the amount of spilling in certain loops. */
19705 classes[i++] = PR_LO_REGS;
19706 classes[i++] = PR_HI_REGS;
19707 return i;
19710 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19712 static bool
19713 aarch64_can_change_mode_class (machine_mode from,
19714 machine_mode to, reg_class_t)
19716 if (BYTES_BIG_ENDIAN)
19718 bool from_sve_p = aarch64_sve_data_mode_p (from);
19719 bool to_sve_p = aarch64_sve_data_mode_p (to);
19721 /* Don't allow changes between SVE data modes and non-SVE modes.
19722 See the comment at the head of aarch64-sve.md for details. */
19723 if (from_sve_p != to_sve_p)
19724 return false;
19726 /* Don't allow changes in element size: lane 0 of the new vector
19727 would not then be lane 0 of the old vector. See the comment
19728 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19729 description.
19731 In the worst case, this forces a register to be spilled in
19732 one mode and reloaded in the other, which handles the
19733 endianness correctly. */
19734 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19735 return false;
19737 return true;
19740 /* Implement TARGET_EARLY_REMAT_MODES. */
19742 static void
19743 aarch64_select_early_remat_modes (sbitmap modes)
19745 /* SVE values are not normally live across a call, so it should be
19746 worth doing early rematerialization even in VL-specific mode. */
19747 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19749 machine_mode mode = (machine_mode) i;
19750 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19751 if (vec_flags & VEC_ANY_SVE)
19752 bitmap_set_bit (modes, i);
19756 /* Override the default target speculation_safe_value. */
19757 static rtx
19758 aarch64_speculation_safe_value (machine_mode mode,
19759 rtx result, rtx val, rtx failval)
19761 /* Maybe we should warn if falling back to hard barriers. They are
19762 likely to be noticeably more expensive than the alternative below. */
19763 if (!aarch64_track_speculation)
19764 return default_speculation_safe_value (mode, result, val, failval);
19766 if (!REG_P (val))
19767 val = copy_to_mode_reg (mode, val);
19769 if (!aarch64_reg_or_zero (failval, mode))
19770 failval = copy_to_mode_reg (mode, failval);
19772 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19773 return result;
19776 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19777 Look into the tuning structure for an estimate.
19778 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19779 Advanced SIMD 128 bits. */
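/* As a worked example (assuming a tuning target with sve_width == 256):
   a VL-dependent value of 16 + 16 * x, i.e. coeffs {16, 16}, is estimated
   as 16 + 16 * (256 - 128) / 128 == 32.  */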
19781 static HOST_WIDE_INT
19782 aarch64_estimated_poly_value (poly_int64 val)
19784 enum aarch64_sve_vector_bits_enum width_source
19785 = aarch64_tune_params.sve_width;
19787 /* If we still don't have an estimate, use the default. */
19788 if (width_source == SVE_SCALABLE)
19789 return default_estimated_poly_value (val);
19791 HOST_WIDE_INT over_128 = width_source - 128;
19792 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19796 /* Return true for types that could be supported as SIMD return or
19797 argument types. */
19799 static bool
19800 supported_simd_type (tree t)
19802 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19804 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19805 return s == 1 || s == 2 || s == 4 || s == 8;
19807 return false;
19810 /* Return true for types that currently are supported as SIMD return
19811 or argument types. */
19813 static bool
19814 currently_supported_simd_type (tree t, tree b)
19816 if (COMPLEX_FLOAT_TYPE_P (t))
19817 return false;
19819 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19820 return false;
19822 return supported_simd_type (t);
19825 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
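/* For example (an illustrative sketch): for a "declare simd" function
   taking and returning float with no explicit simdlen clause, the code
   below creates two Advanced SIMD clones, one with simdlen 2 (64-bit
   vectors) and one with simdlen 4 (128-bit vectors).  */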
19827 static int
19828 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19829 struct cgraph_simd_clone *clonei,
19830 tree base_type, int num)
19832 tree t, ret_type, arg_type;
19833 unsigned int elt_bits, vec_bits, count;
19835 if (!TARGET_SIMD)
19836 return 0;
19838 if (clonei->simdlen
19839 && (clonei->simdlen < 2
19840 || clonei->simdlen > 1024
19841 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19843 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19844 "unsupported simdlen %d", clonei->simdlen);
19845 return 0;
19848 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19849 if (TREE_CODE (ret_type) != VOID_TYPE
19850 && !currently_supported_simd_type (ret_type, base_type))
19852 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19853 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19854 "GCC does not currently support mixed size types "
19855 "for %<simd%> functions");
19856 else if (supported_simd_type (ret_type))
19857 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19858 "GCC does not currently support return type %qT "
19859 "for %<simd%> functions", ret_type);
19860 else
19861 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19862 "unsupported return type %qT for %<simd%> functions",
19863 ret_type);
19864 return 0;
19867 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19869 arg_type = TREE_TYPE (t);
19871 if (!currently_supported_simd_type (arg_type, base_type))
19873 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19874 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19875 "GCC does not currently support mixed size types "
19876 "for %<simd%> functions");
19877 else
19878 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19879 "GCC does not currently support argument type %qT "
19880 "for %<simd%> functions", arg_type);
19881 return 0;
19885 clonei->vecsize_mangle = 'n';
19886 clonei->mask_mode = VOIDmode;
19887 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19888 if (clonei->simdlen == 0)
19890 count = 2;
19891 vec_bits = (num == 0 ? 64 : 128);
19892 clonei->simdlen = vec_bits / elt_bits;
19894 else
19896 count = 1;
19897 vec_bits = clonei->simdlen * elt_bits;
19898 if (vec_bits != 64 && vec_bits != 128)
19900 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19901 "GCC does not currently support simdlen %d for type %qT",
19902 clonei->simdlen, base_type);
19903 return 0;
19906 clonei->vecsize_int = vec_bits;
19907 clonei->vecsize_float = vec_bits;
19908 return count;
19911 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19913 static void
19914 aarch64_simd_clone_adjust (struct cgraph_node *node)
19916 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19917 use the correct ABI. */
19919 tree t = TREE_TYPE (node->decl);
19920 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19921 TYPE_ATTRIBUTES (t));
19924 /* Implement TARGET_SIMD_CLONE_USABLE. */
19926 static int
19927 aarch64_simd_clone_usable (struct cgraph_node *node)
19929 switch (node->simdclone->vecsize_mangle)
19931 case 'n':
19932 if (!TARGET_SIMD)
19933 return -1;
19934 return 0;
19935 default:
19936 gcc_unreachable ();
19940 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19942 static int
19943 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19945 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19946 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19947 return 0;
19948 return 1;
19951 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19953 static const char *
19954 aarch64_get_multilib_abi_name (void)
19956 if (TARGET_BIG_END)
19957 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19958 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19961 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
19962 global variable based guard, use the default; otherwise
19963 return a null tree. */
19964 static tree
19965 aarch64_stack_protect_guard (void)
19967 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19968 return default_stack_protect_guard ();
19970 return NULL_TREE;
19973 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19974 section at the end if needed. */
19975 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19976 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19977 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19978 void
19979 aarch64_file_end_indicate_exec_stack ()
19981 file_end_indicate_exec_stack ();
19983 unsigned feature_1_and = 0;
19984 if (aarch64_bti_enabled ())
19985 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19987 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19988 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19990 if (feature_1_and)
19992 /* Generate .note.gnu.property section. */
19993 switch_to_section (get_section (".note.gnu.property",
19994 SECTION_NOTYPE, NULL));
19996 /* PT_NOTE header: namesz, descsz, type.
19997 namesz = 4 ("GNU\0")
19998 descsz = 16 (Size of the program property array)
19999 [(12 + padding) * Number of array elements]
20000 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20001 assemble_align (POINTER_SIZE);
20002 assemble_integer (GEN_INT (4), 4, 32, 1);
20003 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20004 assemble_integer (GEN_INT (5), 4, 32, 1);
20006 /* PT_NOTE name. */
20007 assemble_string ("GNU", 4);
20009 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20010 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20011 datasz = 4
20012 data = feature_1_and. */
20013 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20014 assemble_integer (GEN_INT (4), 4, 32, 1);
20015 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20017 /* Pad the size of the note to the required alignment. */
20018 assemble_align (POINTER_SIZE);
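/* With both BTI and PAC enabled on an LP64 target, the resulting note
   looks roughly like this (an illustrative sketch; the exact directives
   depend on the assemble_* macros used above):

     namesz    = 4
     descsz    = 16
     type      = 5 (NT_GNU_PROPERTY_TYPE_0)
     name      = "GNU"
     pr_type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
     pr_datasz = 4
     pr_data   = BTI | PAC == 0x3

   followed by padding to the pointer alignment.  */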
20021 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20022 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20023 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20025 /* Target-specific selftests. */
20027 #if CHECKING_P
20029 namespace selftest {
20031 /* Selftest for the RTL loader.
20032 Verify that the RTL loader copes with a dump from
20033 print_rtx_function. This is essentially just a test that class
20034 function_reader can handle a real dump, but it also verifies
20035 that lookup_reg_by_dump_name correctly handles hard regs.
20036 The presence of hard reg names in the dump means that the test is
20037 target-specific, hence it is in this file. */
20039 static void
20040 aarch64_test_loading_full_dump ()
20042 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20044 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20046 rtx_insn *insn_1 = get_insn_by_uid (1);
20047 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20049 rtx_insn *insn_15 = get_insn_by_uid (15);
20050 ASSERT_EQ (INSN, GET_CODE (insn_15));
20051 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20053 /* Verify crtl->return_rtx. */
20054 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20055 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20056 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20059 /* Run all target-specific selftests. */
20061 static void
20062 aarch64_run_selftests (void)
20064 aarch64_test_loading_full_dump ();
20067 } // namespace selftest
20069 #endif /* #if CHECKING_P */
20071 #undef TARGET_STACK_PROTECT_GUARD
20072 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20074 #undef TARGET_ADDRESS_COST
20075 #define TARGET_ADDRESS_COST aarch64_address_cost
20077 /* This hook determines whether unnamed bitfields affect the alignment
20078 of the containing structure. The hook returns true if the structure
20079 should inherit the alignment requirements of an unnamed bitfield's
20080 type. */
20081 #undef TARGET_ALIGN_ANON_BITFIELD
20082 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20084 #undef TARGET_ASM_ALIGNED_DI_OP
20085 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20087 #undef TARGET_ASM_ALIGNED_HI_OP
20088 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20090 #undef TARGET_ASM_ALIGNED_SI_OP
20091 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20093 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20094 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20095 hook_bool_const_tree_hwi_hwi_const_tree_true
20097 #undef TARGET_ASM_FILE_START
20098 #define TARGET_ASM_FILE_START aarch64_start_file
20100 #undef TARGET_ASM_OUTPUT_MI_THUNK
20101 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20103 #undef TARGET_ASM_SELECT_RTX_SECTION
20104 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20106 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20107 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20109 #undef TARGET_BUILD_BUILTIN_VA_LIST
20110 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20112 #undef TARGET_CALLEE_COPIES
20113 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20115 #undef TARGET_CAN_ELIMINATE
20116 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20118 #undef TARGET_CAN_INLINE_P
20119 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20121 #undef TARGET_CANNOT_FORCE_CONST_MEM
20122 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20124 #undef TARGET_CASE_VALUES_THRESHOLD
20125 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20127 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20128 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20130 /* Only the least significant bit is used for initialization guard
20131 variables. */
20132 #undef TARGET_CXX_GUARD_MASK_BIT
20133 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20135 #undef TARGET_C_MODE_FOR_SUFFIX
20136 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20138 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20139 #undef TARGET_DEFAULT_TARGET_FLAGS
20140 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20141 #endif
20143 #undef TARGET_CLASS_MAX_NREGS
20144 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20146 #undef TARGET_BUILTIN_DECL
20147 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20149 #undef TARGET_BUILTIN_RECIPROCAL
20150 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20152 #undef TARGET_C_EXCESS_PRECISION
20153 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20155 #undef TARGET_EXPAND_BUILTIN
20156 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20158 #undef TARGET_EXPAND_BUILTIN_VA_START
20159 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20161 #undef TARGET_FOLD_BUILTIN
20162 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20164 #undef TARGET_FUNCTION_ARG
20165 #define TARGET_FUNCTION_ARG aarch64_function_arg
20167 #undef TARGET_FUNCTION_ARG_ADVANCE
20168 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20170 #undef TARGET_FUNCTION_ARG_BOUNDARY
20171 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20173 #undef TARGET_FUNCTION_ARG_PADDING
20174 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20176 #undef TARGET_GET_RAW_RESULT_MODE
20177 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20178 #undef TARGET_GET_RAW_ARG_MODE
20179 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20181 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20182 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20184 #undef TARGET_FUNCTION_VALUE
20185 #define TARGET_FUNCTION_VALUE aarch64_function_value
20187 #undef TARGET_FUNCTION_VALUE_REGNO_P
20188 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20190 #undef TARGET_GIMPLE_FOLD_BUILTIN
20191 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20193 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20194 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20196 #undef TARGET_INIT_BUILTINS
20197 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20199 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20200 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20201 aarch64_ira_change_pseudo_allocno_class
20203 #undef TARGET_LEGITIMATE_ADDRESS_P
20204 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20206 #undef TARGET_LEGITIMATE_CONSTANT_P
20207 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20209 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20210 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20211 aarch64_legitimize_address_displacement
20213 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20214 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20216 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20217 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20218 aarch64_libgcc_floating_mode_supported_p
20220 #undef TARGET_MANGLE_TYPE
20221 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20223 #undef TARGET_MEMORY_MOVE_COST
20224 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20226 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20227 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20229 #undef TARGET_MUST_PASS_IN_STACK
20230 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20232 /* This target hook should return true if accesses to volatile bitfields
20233 should use the narrowest mode possible. It should return false if these
20234 accesses should use the bitfield container type. */
20235 #undef TARGET_NARROW_VOLATILE_BITFIELD
20236 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20238 #undef TARGET_OPTION_OVERRIDE
20239 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20241 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20242 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20243 aarch64_override_options_after_change
20245 #undef TARGET_OPTION_SAVE
20246 #define TARGET_OPTION_SAVE aarch64_option_save
20248 #undef TARGET_OPTION_RESTORE
20249 #define TARGET_OPTION_RESTORE aarch64_option_restore
20251 #undef TARGET_OPTION_PRINT
20252 #define TARGET_OPTION_PRINT aarch64_option_print
20254 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20255 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20257 #undef TARGET_SET_CURRENT_FUNCTION
20258 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20260 #undef TARGET_PASS_BY_REFERENCE
20261 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20263 #undef TARGET_PREFERRED_RELOAD_CLASS
20264 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20266 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20267 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20269 #undef TARGET_PROMOTED_TYPE
20270 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20272 #undef TARGET_SECONDARY_RELOAD
20273 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20275 #undef TARGET_SHIFT_TRUNCATION_MASK
20276 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20278 #undef TARGET_SETUP_INCOMING_VARARGS
20279 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20281 #undef TARGET_STRUCT_VALUE_RTX
20282 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20284 #undef TARGET_REGISTER_MOVE_COST
20285 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20287 #undef TARGET_RETURN_IN_MEMORY
20288 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20290 #undef TARGET_RETURN_IN_MSB
20291 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20293 #undef TARGET_RTX_COSTS
20294 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20296 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20297 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20299 #undef TARGET_SCHED_ISSUE_RATE
20300 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20302 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20303 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20304 aarch64_sched_first_cycle_multipass_dfa_lookahead
20306 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20307 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20308 aarch64_first_cycle_multipass_dfa_lookahead_guard
20310 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20311 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20312 aarch64_get_separate_components
20314 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20315 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20316 aarch64_components_for_bb
20318 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20319 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20320 aarch64_disqualify_components
20322 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20323 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20324 aarch64_emit_prologue_components
20326 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20327 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20328 aarch64_emit_epilogue_components
20330 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20331 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20332 aarch64_set_handled_components
20334 #undef TARGET_TRAMPOLINE_INIT
20335 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20337 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20338 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20340 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20341 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20343 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20344 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20345 aarch64_builtin_support_vector_misalignment
20347 #undef TARGET_ARRAY_MODE
20348 #define TARGET_ARRAY_MODE aarch64_array_mode
20350 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20351 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20353 #undef TARGET_VECTORIZE_ADD_STMT_COST
20354 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20356 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20357 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20358 aarch64_builtin_vectorization_cost
20360 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20361 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20363 #undef TARGET_VECTORIZE_BUILTINS
20364 #define TARGET_VECTORIZE_BUILTINS
20366 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20367 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20368 aarch64_builtin_vectorized_function
20370 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20371 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20372 aarch64_autovectorize_vector_sizes
20374 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20375 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20376 aarch64_atomic_assign_expand_fenv
20378 /* Section anchor support. */
20380 #undef TARGET_MIN_ANCHOR_OFFSET
20381 #define TARGET_MIN_ANCHOR_OFFSET -256
20383 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20384 byte offset; we can do much more for larger data types, but have no way
20385 to determine the size of the access. We assume accesses are aligned. */
20386 #undef TARGET_MAX_ANCHOR_OFFSET
20387 #define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
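
/* For illustration only, a sketch rather than the port's own code: a
   nonzero value here lets the middle end use data descriptors instead of
   executable trampolines for nested functions whose address is taken, when
   the front end or command-line options request that.  The value is the
   bit used to tag a pointer that really addresses a descriptor, so an
   indirect call that might receive such a pointer is conceptually expanded
   along these lines (hypothetical pseudo-C; the field names and descriptor
   layout are illustrative, the real layout is defined by the middle end):

     if (fn & 4)                  // bit 2 set: fn addresses a descriptor
       {
         desc  = fn - 4;          // strip the tag
         chain = desc->chain;     // recover the static chain
         fn    = desc->code;      // recover the real code address
       }
     (*fn) (args);

   Genuine code addresses must never have the tag bit set, which is why the
   port picks the lowest bit not reserved by the architecture (bit 2,
   value 4) in the definition above.  */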

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
  aarch64_remove_extra_call_preserved_regs

#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
  aarch64_return_call_with_max_clobbers

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

/* The global target hook vector.  TARGET_INITIALIZER is assembled from the
   TARGET_* macros defined above, with the defaults from target-def.h
   filling in every hook this file does not override.  */
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"