AArch64: Cleanup move immediate code
[official-gcc.git] / gcc / config / aarch64 / aarch64.cc
blob: 89bf0dff904b6b52b71841aec299541f01884f3d
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2022 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
84 #include "aarch64-feature-deps.h"
86 /* This file should be included last. */
87 #include "target-def.h"
89 /* Defined for convenience. */
90 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
92 /* Information about a legitimate vector immediate operand. */
93 struct simd_immediate_info
95 enum insn_type { MOV, MVN, INDEX, PTRUE };
96 enum modifier_type { LSL, MSL };
98 simd_immediate_info () {}
99 simd_immediate_info (scalar_float_mode, rtx);
100 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
101 insn_type = MOV, modifier_type = LSL,
102 unsigned int = 0);
103 simd_immediate_info (scalar_mode, rtx, rtx);
104 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
106 /* The mode of the elements. */
107 scalar_mode elt_mode;
109 /* The instruction to use to move the immediate into a vector. */
110 insn_type insn;
112 union
114 /* For MOV and MVN. */
115 struct
117 /* The value of each element. */
118 rtx value;
120 /* The kind of shift modifier to use, and the number of bits to shift.
121 This is (LSL, 0) if no shift is needed. */
122 modifier_type modifier;
123 unsigned int shift;
124 } mov;
126 /* For INDEX. */
127 struct
129 /* The value of the first element and the step to be added for each
130 subsequent element. */
131 rtx base, step;
132 } index;
134 /* For PTRUE. */
135 aarch64_svpattern pattern;
136 } u;
139 /* Construct a floating-point immediate in which each element has mode
140 ELT_MODE_IN and value VALUE_IN. */
141 inline simd_immediate_info
142 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
143 : elt_mode (elt_mode_in), insn (MOV)
145 u.mov.value = value_in;
146 u.mov.modifier = LSL;
147 u.mov.shift = 0;
150 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
151 and value VALUE_IN. The other parameters are as for the structure
152 fields. */
153 inline simd_immediate_info
154 ::simd_immediate_info (scalar_int_mode elt_mode_in,
155 unsigned HOST_WIDE_INT value_in,
156 insn_type insn_in, modifier_type modifier_in,
157 unsigned int shift_in)
158 : elt_mode (elt_mode_in), insn (insn_in)
160 u.mov.value = gen_int_mode (value_in, elt_mode_in);
161 u.mov.modifier = modifier_in;
162 u.mov.shift = shift_in;
165 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
166 and where element I is equal to BASE_IN + I * STEP_IN. */
167 inline simd_immediate_info
168 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
169 : elt_mode (elt_mode_in), insn (INDEX)
171 u.index.base = base_in;
172 u.index.step = step_in;
175 /* Construct a predicate that controls elements of mode ELT_MODE_IN
176 and has PTRUE pattern PATTERN_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_int_mode elt_mode_in,
179 aarch64_svpattern pattern_in)
180 : elt_mode (elt_mode_in), insn (PTRUE)
182 u.pattern = pattern_in;
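/* For illustration (a hypothetical value, not taken from the code below):
   a vector immediate in which every 16-bit element equals 0x1200 could be
   described as

     simd_immediate_info (HImode, 0x12, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 8)

   i.e. move 0x12 into each element with a left shift of 8 bits.  */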
185 namespace {
187 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
188 class pure_scalable_type_info
190 public:
191 /* Represents the result of analyzing a type. All values are nonzero,
192 in the possibly forlorn hope that accidental conversions to bool
193 trigger a warning. */
194 enum analysis_result
196 /* The type does not have an ABI identity; i.e. it doesn't contain
197 at least one object whose type is a Fundamental Data Type. */
198 NO_ABI_IDENTITY = 1,
200 /* The type is definitely a Pure Scalable Type. */
201 IS_PST,
203 /* The type is definitely not a Pure Scalable Type. */
204 ISNT_PST,
206 /* It doesn't matter for PCS purposes whether the type is a Pure
207 Scalable Type or not, since the type will be handled the same
208 way regardless.
210 Specifically, this means that if the type is a Pure Scalable Type,
211 there aren't enough argument registers to hold it, and so it will
212 need to be passed or returned in memory. If the type isn't a
213 Pure Scalable Type, it's too big to be passed or returned in core
214 or SIMD&FP registers, and so again will need to go in memory. */
215 DOESNT_MATTER
218 /* Aggregates of 17 bytes or more are normally passed and returned
219 in memory, so aggregates of that size can safely be analyzed as
220 DOESNT_MATTER. We need to be able to collect enough pieces to
221 represent a PST that is smaller than that. Since predicates are
222 2 bytes in size for -msve-vector-bits=128, that means we need to be
223 able to store at least 8 pieces.
225 We also need to be able to store enough pieces to represent
226 a single vector in each vector argument register and a single
227 predicate in each predicate argument register. This means that
228 we need at least 12 pieces. */
229 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
230 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
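/* Concretely (assuming the usual AAPCS64 argument registers, i.e.
   NUM_FP_ARG_REGS == 8 for v0-v7 and NUM_PR_ARG_REGS == 4 for p0-p3),
   MAX_PIECES is 12: enough for the 8 two-byte predicates of a
   sub-17-byte PST at -msve-vector-bits=128, and for one piece per
   vector and predicate argument register.  */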
232 /* Describes one piece of a PST. Each piece is one of:
234 - a single Scalable Vector Type (SVT)
235 - a single Scalable Predicate Type (SPT)
236 - a PST containing 2, 3 or 4 SVTs, with no padding
238 It either represents a single built-in type or a PST formed from
239 multiple homogeneous built-in types. */
240 struct piece
242 rtx get_rtx (unsigned int, unsigned int) const;
244 /* The number of vector and predicate registers that the piece
245 occupies. One of the two is always zero. */
246 unsigned int num_zr;
247 unsigned int num_pr;
249 /* The mode of the registers described above. */
250 machine_mode mode;
252 /* If this piece is formed from multiple homogeneous built-in types,
253 this is the mode of the built-in types, otherwise it is MODE. */
254 machine_mode orig_mode;
256 /* The offset in bytes of the piece from the start of the type. */
257 poly_uint64_pod offset;
260 /* Divides types analyzed as IS_PST into individual pieces. The pieces
261 are in memory order. */
262 auto_vec<piece, MAX_PIECES> pieces;
264 unsigned int num_zr () const;
265 unsigned int num_pr () const;
267 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
269 analysis_result analyze (const_tree);
270 bool analyze_registers (const_tree);
272 private:
273 analysis_result analyze_array (const_tree);
274 analysis_result analyze_record (const_tree);
275 void add_piece (const piece &);
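/* Rough usage sketch, inferred from the interface above rather than from
   any particular caller:

     pure_scalable_type_info pst_info;
     if (pst_info.analyze (type) == pure_scalable_type_info::IS_PST)
       ... pst_info.pieces now lists the constituent SVTs/SPTs in memory
	   order, ready for num_zr (), num_pr () and get_rtx () ...
*/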
279 /* The current code model. */
280 enum aarch64_code_model aarch64_cmodel;
282 /* The number of 64-bit elements in an SVE vector. */
283 poly_uint16 aarch64_sve_vg;
285 #ifdef HAVE_AS_TLS
286 #undef TARGET_HAVE_TLS
287 #define TARGET_HAVE_TLS 1
288 #endif
290 static bool aarch64_composite_type_p (const_tree, machine_mode);
291 static bool aarch64_return_in_memory_1 (const_tree);
292 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
293 const_tree,
294 machine_mode *, int *,
295 bool *, bool);
296 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
297 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
298 static void aarch64_override_options_after_change (void);
299 static bool aarch64_vector_mode_supported_p (machine_mode);
300 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
301 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
302 const_tree type,
303 int misalignment,
304 bool is_packed);
305 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
306 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
307 aarch64_addr_query_type);
309 /* The processor for which instructions should be scheduled. */
310 enum aarch64_processor aarch64_tune = cortexa53;
312 /* Mask to specify which instruction scheduling options should be used. */
313 uint64_t aarch64_tune_flags = 0;
315 /* Global flag for PC relative loads. */
316 bool aarch64_pcrelative_literal_loads;
318 /* Global flag for whether frame pointer is enabled. */
319 bool aarch64_use_frame_pointer;
321 #define BRANCH_PROTECT_STR_MAX 255
322 char *accepted_branch_protection_string = NULL;
324 static enum aarch64_parse_opt_result
325 aarch64_parse_branch_protection (const char*, char**);
327 /* Support for command line parsing of boolean flags in the tuning
328 structures. */
329 struct aarch64_flag_desc
331 const char* name;
332 unsigned int flag;
335 #define AARCH64_FUSION_PAIR(name, internal_name) \
336 { name, AARCH64_FUSE_##internal_name },
337 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
339 { "none", AARCH64_FUSE_NOTHING },
340 #include "aarch64-fusion-pairs.def"
341 { "all", AARCH64_FUSE_ALL },
342 { NULL, AARCH64_FUSE_NOTHING }
345 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
346 { name, AARCH64_EXTRA_TUNE_##internal_name },
347 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
349 { "none", AARCH64_EXTRA_TUNE_NONE },
350 #include "aarch64-tuning-flags.def"
351 { "all", AARCH64_EXTRA_TUNE_ALL },
352 { NULL, AARCH64_EXTRA_TUNE_NONE }
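/* Each AARCH64_EXTRA_TUNING_OPTION ("foo", FOO) entry in
   "aarch64-tuning-flags.def" expands through the macro above into
   { "foo", AARCH64_EXTRA_TUNE_FOO }, so this table maps the user-visible
   flag names onto their internal bit values, bracketed by the explicit
   "none" and "all" entries and a NULL terminator.  ("foo"/FOO here is a
   placeholder, not a real flag name.)  */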
355 /* Tuning parameters. */
357 static const struct cpu_addrcost_table generic_addrcost_table =
360 1, /* hi */
361 0, /* si */
362 0, /* di */
363 1, /* ti */
365 0, /* pre_modify */
366 0, /* post_modify */
367 0, /* post_modify_ld3_st3 */
368 0, /* post_modify_ld4_st4 */
369 0, /* register_offset */
370 0, /* register_sextend */
371 0, /* register_zextend */
372 0 /* imm_offset */
375 static const struct cpu_addrcost_table exynosm1_addrcost_table =
378 0, /* hi */
379 0, /* si */
380 0, /* di */
381 2, /* ti */
383 0, /* pre_modify */
384 0, /* post_modify */
385 0, /* post_modify_ld3_st3 */
386 0, /* post_modify_ld4_st4 */
387 1, /* register_offset */
388 1, /* register_sextend */
389 2, /* register_zextend */
390 0, /* imm_offset */
393 static const struct cpu_addrcost_table xgene1_addrcost_table =
396 1, /* hi */
397 0, /* si */
398 0, /* di */
399 1, /* ti */
401 1, /* pre_modify */
402 1, /* post_modify */
403 1, /* post_modify_ld3_st3 */
404 1, /* post_modify_ld4_st4 */
405 0, /* register_offset */
406 1, /* register_sextend */
407 1, /* register_zextend */
408 0, /* imm_offset */
411 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
414 1, /* hi */
415 1, /* si */
416 1, /* di */
417 2, /* ti */
419 0, /* pre_modify */
420 0, /* post_modify */
421 0, /* post_modify_ld3_st3 */
422 0, /* post_modify_ld4_st4 */
423 2, /* register_offset */
424 3, /* register_sextend */
425 3, /* register_zextend */
426 0, /* imm_offset */
429 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
432 1, /* hi */
433 1, /* si */
434 1, /* di */
435 2, /* ti */
437 0, /* pre_modify */
438 0, /* post_modify */
439 0, /* post_modify_ld3_st3 */
440 0, /* post_modify_ld4_st4 */
441 2, /* register_offset */
442 3, /* register_sextend */
443 3, /* register_zextend */
444 0, /* imm_offset */
447 static const struct cpu_addrcost_table tsv110_addrcost_table =
450 1, /* hi */
451 0, /* si */
452 0, /* di */
453 1, /* ti */
455 0, /* pre_modify */
456 0, /* post_modify */
457 0, /* post_modify_ld3_st3 */
458 0, /* post_modify_ld4_st4 */
459 0, /* register_offset */
460 1, /* register_sextend */
461 1, /* register_zextend */
462 0, /* imm_offset */
465 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
468 1, /* hi */
469 1, /* si */
470 1, /* di */
471 2, /* ti */
473 1, /* pre_modify */
474 1, /* post_modify */
475 1, /* post_modify_ld3_st3 */
476 1, /* post_modify_ld4_st4 */
477 3, /* register_offset */
478 3, /* register_sextend */
479 3, /* register_zextend */
480 2, /* imm_offset */
483 static const struct cpu_addrcost_table a64fx_addrcost_table =
486 1, /* hi */
487 1, /* si */
488 1, /* di */
489 2, /* ti */
491 0, /* pre_modify */
492 0, /* post_modify */
493 0, /* post_modify_ld3_st3 */
494 0, /* post_modify_ld4_st4 */
495 2, /* register_offset */
496 3, /* register_sextend */
497 3, /* register_zextend */
498 0, /* imm_offset */
501 static const struct cpu_addrcost_table neoversev1_addrcost_table =
504 1, /* hi */
505 0, /* si */
506 0, /* di */
507 1, /* ti */
509 0, /* pre_modify */
510 0, /* post_modify */
511 3, /* post_modify_ld3_st3 */
512 3, /* post_modify_ld4_st4 */
513 0, /* register_offset */
514 0, /* register_sextend */
515 0, /* register_zextend */
516 0 /* imm_offset */
519 static const struct cpu_addrcost_table neoversen2_addrcost_table =
522 1, /* hi */
523 0, /* si */
524 0, /* di */
525 1, /* ti */
527 0, /* pre_modify */
528 0, /* post_modify */
529 2, /* post_modify_ld3_st3 */
530 2, /* post_modify_ld4_st4 */
531 0, /* register_offset */
532 0, /* register_sextend */
533 0, /* register_zextend */
534 0 /* imm_offset */
537 static const struct cpu_addrcost_table neoversev2_addrcost_table =
540 1, /* hi */
541 0, /* si */
542 0, /* di */
543 1, /* ti */
545 0, /* pre_modify */
546 0, /* post_modify */
547 2, /* post_modify_ld3_st3 */
548 2, /* post_modify_ld4_st4 */
549 0, /* register_offset */
550 0, /* register_sextend */
551 0, /* register_zextend */
552 0 /* imm_offset */
555 static const struct cpu_regmove_cost generic_regmove_cost =
557 1, /* GP2GP */
558 /* Avoid the use of slow int<->fp moves for spilling by setting
559 their cost higher than memmov_cost. */
560 5, /* GP2FP */
561 5, /* FP2GP */
562 2 /* FP2FP */
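/* With the generic memmov_cost of 4 (see generic_tunings below), the
   GP2FP/FP2GP cost of 5 above is just high enough that the register
   allocator prefers spilling to memory over bouncing values through the
   other register file.  */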
565 static const struct cpu_regmove_cost cortexa57_regmove_cost =
567 1, /* GP2GP */
568 /* Avoid the use of slow int<->fp moves for spilling by setting
569 their cost higher than memmov_cost. */
570 5, /* GP2FP */
571 5, /* FP2GP */
572 2 /* FP2FP */
575 static const struct cpu_regmove_cost cortexa53_regmove_cost =
577 1, /* GP2GP */
578 /* Avoid the use of slow int<->fp moves for spilling by setting
579 their cost higher than memmov_cost. */
580 5, /* GP2FP */
581 5, /* FP2GP */
582 2 /* FP2FP */
585 static const struct cpu_regmove_cost exynosm1_regmove_cost =
587 1, /* GP2GP */
588 /* Avoid the use of slow int<->fp moves for spilling by setting
590 their cost higher than memmov_cost (the actual costs are 4 and 9). */
590 9, /* GP2FP */
591 9, /* FP2GP */
592 1 /* FP2FP */
595 static const struct cpu_regmove_cost thunderx_regmove_cost =
597 2, /* GP2GP */
598 2, /* GP2FP */
599 6, /* FP2GP */
600 4 /* FP2FP */
603 static const struct cpu_regmove_cost xgene1_regmove_cost =
605 1, /* GP2GP */
606 /* Avoid the use of slow int<->fp moves for spilling by setting
607 their cost higher than memmov_cost. */
608 8, /* GP2FP */
609 8, /* FP2GP */
610 2 /* FP2FP */
613 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
615 2, /* GP2GP */
616 /* Avoid the use of int<->fp moves for spilling. */
617 6, /* GP2FP */
618 6, /* FP2GP */
619 4 /* FP2FP */
622 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
624 1, /* GP2GP */
625 /* Avoid the use of int<->fp moves for spilling. */
626 5, /* GP2FP */
627 6, /* FP2GP */
628 3, /* FP2FP */
631 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
633 1, /* GP2GP */
634 /* Avoid the use of int<->fp moves for spilling. */
635 4, /* GP2FP */
636 5, /* FP2GP */
637 4 /* FP2FP */
640 static const struct cpu_regmove_cost tsv110_regmove_cost =
642 1, /* GP2GP */
643 /* Avoid the use of slow int<->fp moves for spilling by setting
644 their cost higher than memmov_cost. */
645 2, /* GP2FP */
646 3, /* FP2GP */
647 2 /* FP2FP */
650 static const struct cpu_regmove_cost a64fx_regmove_cost =
652 1, /* GP2GP */
653 /* Avoid the use of slow int<->fp moves for spilling by setting
654 their cost higher than memmov_cost. */
655 5, /* GP2FP */
656 7, /* FP2GP */
657 2 /* FP2FP */
660 static const struct cpu_regmove_cost neoversen2_regmove_cost =
662 1, /* GP2GP */
663 /* Spilling via int<->fp register moves rather than to memory is
664 preferable, so set realistic costs compared to memmov_cost. */
665 3, /* GP2FP */
666 2, /* FP2GP */
667 2 /* FP2FP */
670 static const struct cpu_regmove_cost neoversev1_regmove_cost =
672 1, /* GP2GP */
673 /* Spilling via int<->fp register moves rather than to memory is
674 preferable, so set realistic costs compared to memmov_cost. */
675 3, /* GP2FP */
676 2, /* FP2GP */
677 2 /* FP2FP */
680 static const struct cpu_regmove_cost neoversev2_regmove_cost =
682 1, /* GP2GP */
683 /* Spilling via int<->fp register moves rather than to memory is
684 preferable, so set realistic costs compared to memmov_cost. */
685 3, /* GP2FP */
686 2, /* FP2GP */
687 2 /* FP2FP */
690 /* Generic costs for Advanced SIMD vector operations. */
691 static const advsimd_vec_cost generic_advsimd_vector_cost =
693 1, /* int_stmt_cost */
694 1, /* fp_stmt_cost */
695 0, /* ld2_st2_permute_cost */
696 0, /* ld3_st3_permute_cost */
697 0, /* ld4_st4_permute_cost */
698 2, /* permute_cost */
699 2, /* reduc_i8_cost */
700 2, /* reduc_i16_cost */
701 2, /* reduc_i32_cost */
702 2, /* reduc_i64_cost */
703 2, /* reduc_f16_cost */
704 2, /* reduc_f32_cost */
705 2, /* reduc_f64_cost */
706 2, /* store_elt_extra_cost */
707 2, /* vec_to_scalar_cost */
708 1, /* scalar_to_vec_cost */
709 1, /* align_load_cost */
710 1, /* unalign_load_cost */
711 1, /* unalign_store_cost */
712 1 /* store_cost */
715 /* Generic costs for SVE vector operations. */
716 static const sve_vec_cost generic_sve_vector_cost =
719 1, /* int_stmt_cost */
720 1, /* fp_stmt_cost */
721 0, /* ld2_st2_permute_cost */
722 0, /* ld3_st3_permute_cost */
723 0, /* ld4_st4_permute_cost */
724 2, /* permute_cost */
725 2, /* reduc_i8_cost */
726 2, /* reduc_i16_cost */
727 2, /* reduc_i32_cost */
728 2, /* reduc_i64_cost */
729 2, /* reduc_f16_cost */
730 2, /* reduc_f32_cost */
731 2, /* reduc_f64_cost */
732 2, /* store_elt_extra_cost */
733 2, /* vec_to_scalar_cost */
734 1, /* scalar_to_vec_cost */
735 1, /* align_load_cost */
736 1, /* unalign_load_cost */
737 1, /* unalign_store_cost */
738 1 /* store_cost */
740 2, /* clast_cost */
741 2, /* fadda_f16_cost */
742 2, /* fadda_f32_cost */
743 2, /* fadda_f64_cost */
744 4, /* gather_load_x32_cost */
745 2, /* gather_load_x64_cost */
746 1 /* scatter_store_elt_cost */
749 /* Generic costs for vector insn classes. */
750 static const struct cpu_vector_cost generic_vector_cost =
752 1, /* scalar_int_stmt_cost */
753 1, /* scalar_fp_stmt_cost */
754 1, /* scalar_load_cost */
755 1, /* scalar_store_cost */
756 3, /* cond_taken_branch_cost */
757 1, /* cond_not_taken_branch_cost */
758 &generic_advsimd_vector_cost, /* advsimd */
759 &generic_sve_vector_cost, /* sve */
760 nullptr /* issue_info */
763 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
765 2, /* int_stmt_cost */
766 5, /* fp_stmt_cost */
767 0, /* ld2_st2_permute_cost */
768 0, /* ld3_st3_permute_cost */
769 0, /* ld4_st4_permute_cost */
770 3, /* permute_cost */
771 13, /* reduc_i8_cost */
772 13, /* reduc_i16_cost */
773 13, /* reduc_i32_cost */
774 13, /* reduc_i64_cost */
775 13, /* reduc_f16_cost */
776 13, /* reduc_f32_cost */
777 13, /* reduc_f64_cost */
778 13, /* store_elt_extra_cost */
779 13, /* vec_to_scalar_cost */
780 4, /* scalar_to_vec_cost */
781 6, /* align_load_cost */
782 6, /* unalign_load_cost */
783 1, /* unalign_store_cost */
784 1 /* store_cost */
787 static const sve_vec_cost a64fx_sve_vector_cost =
790 2, /* int_stmt_cost */
791 5, /* fp_stmt_cost */
792 0, /* ld2_st2_permute_cost */
793 0, /* ld3_st3_permute_cost */
794 0, /* ld4_st4_permute_cost */
795 3, /* permute_cost */
796 13, /* reduc_i8_cost */
797 13, /* reduc_i16_cost */
798 13, /* reduc_i32_cost */
799 13, /* reduc_i64_cost */
800 13, /* reduc_f16_cost */
801 13, /* reduc_f32_cost */
802 13, /* reduc_f64_cost */
803 13, /* store_elt_extra_cost */
804 13, /* vec_to_scalar_cost */
805 4, /* scalar_to_vec_cost */
806 6, /* align_load_cost */
807 6, /* unalign_load_cost */
808 1, /* unalign_store_cost */
809 1 /* store_cost */
811 13, /* clast_cost */
812 13, /* fadda_f16_cost */
813 13, /* fadda_f32_cost */
814 13, /* fadda_f64_cost */
815 64, /* gather_load_x32_cost */
816 32, /* gather_load_x64_cost */
817 1 /* scatter_store_elt_cost */
820 static const struct cpu_vector_cost a64fx_vector_cost =
822 1, /* scalar_int_stmt_cost */
823 5, /* scalar_fp_stmt_cost */
824 4, /* scalar_load_cost */
825 1, /* scalar_store_cost */
826 3, /* cond_taken_branch_cost */
827 1, /* cond_not_taken_branch_cost */
828 &a64fx_advsimd_vector_cost, /* advsimd */
829 &a64fx_sve_vector_cost, /* sve */
830 nullptr /* issue_info */
833 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
835 1, /* int_stmt_cost */
836 3, /* fp_stmt_cost */
837 0, /* ld2_st2_permute_cost */
838 0, /* ld3_st3_permute_cost */
839 0, /* ld4_st4_permute_cost */
840 2, /* permute_cost */
841 1, /* reduc_i8_cost */
842 1, /* reduc_i16_cost */
843 1, /* reduc_i32_cost */
844 1, /* reduc_i64_cost */
845 1, /* reduc_f16_cost */
846 1, /* reduc_f32_cost */
847 1, /* reduc_f64_cost */
848 1, /* store_elt_extra_cost */
849 1, /* vec_to_scalar_cost */
850 1, /* scalar_to_vec_cost */
851 1, /* align_load_cost */
852 1, /* unalign_load_cost */
853 1, /* unalign_store_cost */
854 1 /* store_cost */
857 /* QDF24XX costs for vector insn classes. */
858 static const struct cpu_vector_cost qdf24xx_vector_cost =
860 1, /* scalar_int_stmt_cost */
861 1, /* scalar_fp_stmt_cost */
862 1, /* scalar_load_cost */
863 1, /* scalar_store_cost */
864 3, /* cond_taken_branch_cost */
865 1, /* cond_not_taken_branch_cost */
866 &qdf24xx_advsimd_vector_cost, /* advsimd */
867 nullptr, /* sve */
868 nullptr /* issue_info */
872 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
874 4, /* int_stmt_cost */
875 1, /* fp_stmt_cost */
876 0, /* ld2_st2_permute_cost */
877 0, /* ld3_st3_permute_cost */
878 0, /* ld4_st4_permute_cost */
879 4, /* permute_cost */
880 2, /* reduc_i8_cost */
881 2, /* reduc_i16_cost */
882 2, /* reduc_i32_cost */
883 2, /* reduc_i64_cost */
884 2, /* reduc_f16_cost */
885 2, /* reduc_f32_cost */
886 2, /* reduc_f64_cost */
887 2, /* store_elt_extra_cost */
888 2, /* vec_to_scalar_cost */
889 2, /* scalar_to_vec_cost */
890 3, /* align_load_cost */
891 5, /* unalign_load_cost */
892 5, /* unalign_store_cost */
893 1 /* store_cost */
896 /* ThunderX costs for vector insn classes. */
897 static const struct cpu_vector_cost thunderx_vector_cost =
899 1, /* scalar_int_stmt_cost */
900 1, /* scalar_fp_stmt_cost */
901 3, /* scalar_load_cost */
902 1, /* scalar_store_cost */
903 3, /* cond_taken_branch_cost */
904 3, /* cond_not_taken_branch_cost */
905 &thunderx_advsimd_vector_cost, /* advsimd */
906 nullptr, /* sve */
907 nullptr /* issue_info */
910 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
912 2, /* int_stmt_cost */
913 2, /* fp_stmt_cost */
914 0, /* ld2_st2_permute_cost */
915 0, /* ld3_st3_permute_cost */
916 0, /* ld4_st4_permute_cost */
917 2, /* permute_cost */
918 3, /* reduc_i8_cost */
919 3, /* reduc_i16_cost */
920 3, /* reduc_i32_cost */
921 3, /* reduc_i64_cost */
922 3, /* reduc_f16_cost */
923 3, /* reduc_f32_cost */
924 3, /* reduc_f64_cost */
925 3, /* store_elt_extra_cost */
926 3, /* vec_to_scalar_cost */
927 2, /* scalar_to_vec_cost */
928 5, /* align_load_cost */
929 5, /* unalign_load_cost */
930 1, /* unalign_store_cost */
931 1 /* store_cost */
934 static const struct cpu_vector_cost tsv110_vector_cost =
936 1, /* scalar_int_stmt_cost */
937 1, /* scalar_fp_stmt_cost */
938 5, /* scalar_load_cost */
939 1, /* scalar_store_cost */
940 1, /* cond_taken_branch_cost */
941 1, /* cond_not_taken_branch_cost */
942 &tsv110_advsimd_vector_cost, /* advsimd */
943 nullptr, /* sve */
944 nullptr /* issue_info */
947 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
949 2, /* int_stmt_cost */
950 2, /* fp_stmt_cost */
951 0, /* ld2_st2_permute_cost */
952 0, /* ld3_st3_permute_cost */
953 0, /* ld4_st4_permute_cost */
954 3, /* permute_cost */
955 8, /* reduc_i8_cost */
956 8, /* reduc_i16_cost */
957 8, /* reduc_i32_cost */
958 8, /* reduc_i64_cost */
959 8, /* reduc_f16_cost */
960 8, /* reduc_f32_cost */
961 8, /* reduc_f64_cost */
962 8, /* store_elt_extra_cost */
963 8, /* vec_to_scalar_cost */
964 8, /* scalar_to_vec_cost */
965 4, /* align_load_cost */
966 4, /* unalign_load_cost */
967 1, /* unalign_store_cost */
968 1 /* store_cost */
971 /* Cortex-A57 costs for vector insn classes. */
972 static const struct cpu_vector_cost cortexa57_vector_cost =
974 1, /* scalar_int_stmt_cost */
975 1, /* scalar_fp_stmt_cost */
976 4, /* scalar_load_cost */
977 1, /* scalar_store_cost */
978 1, /* cond_taken_branch_cost */
979 1, /* cond_not_taken_branch_cost */
980 &cortexa57_advsimd_vector_cost, /* advsimd */
981 nullptr, /* sve */
982 nullptr /* issue_info */
985 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
987 3, /* int_stmt_cost */
988 3, /* fp_stmt_cost */
989 0, /* ld2_st2_permute_cost */
990 0, /* ld3_st3_permute_cost */
991 0, /* ld4_st4_permute_cost */
992 3, /* permute_cost */
993 3, /* reduc_i8_cost */
994 3, /* reduc_i16_cost */
995 3, /* reduc_i32_cost */
996 3, /* reduc_i64_cost */
997 3, /* reduc_f16_cost */
998 3, /* reduc_f32_cost */
999 3, /* reduc_f64_cost */
1000 3, /* store_elt_extra_cost */
1001 3, /* vec_to_scalar_cost */
1002 3, /* scalar_to_vec_cost */
1003 5, /* align_load_cost */
1004 5, /* unalign_load_cost */
1005 1, /* unalign_store_cost */
1006 1 /* store_cost */
1009 static const struct cpu_vector_cost exynosm1_vector_cost =
1011 1, /* scalar_int_stmt_cost */
1012 1, /* scalar_fp_stmt_cost */
1013 5, /* scalar_load_cost */
1014 1, /* scalar_store_cost */
1015 1, /* cond_taken_branch_cost */
1016 1, /* cond_not_taken_branch_cost */
1017 &exynosm1_advsimd_vector_cost, /* advsimd */
1018 nullptr, /* sve */
1019 nullptr /* issue_info */
1022 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1024 2, /* int_stmt_cost */
1025 2, /* fp_stmt_cost */
1026 0, /* ld2_st2_permute_cost */
1027 0, /* ld3_st3_permute_cost */
1028 0, /* ld4_st4_permute_cost */
1029 2, /* permute_cost */
1030 4, /* reduc_i8_cost */
1031 4, /* reduc_i16_cost */
1032 4, /* reduc_i32_cost */
1033 4, /* reduc_i64_cost */
1034 4, /* reduc_f16_cost */
1035 4, /* reduc_f32_cost */
1036 4, /* reduc_f64_cost */
1037 4, /* store_elt_extra_cost */
1038 4, /* vec_to_scalar_cost */
1039 4, /* scalar_to_vec_cost */
1040 10, /* align_load_cost */
1041 10, /* unalign_load_cost */
1042 2, /* unalign_store_cost */
1043 2 /* store_cost */
1046 /* X-Gene 1 costs for vector insn classes. */
1047 static const struct cpu_vector_cost xgene1_vector_cost =
1049 1, /* scalar_int_stmt_cost */
1050 1, /* scalar_fp_stmt_cost */
1051 5, /* scalar_load_cost */
1052 1, /* scalar_store_cost */
1053 2, /* cond_taken_branch_cost */
1054 1, /* cond_not_taken_branch_cost */
1055 &xgene1_advsimd_vector_cost, /* advsimd */
1056 nullptr, /* sve */
1057 nullptr /* issue_info */
1060 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1062 4, /* int_stmt_cost */
1063 5, /* fp_stmt_cost */
1064 0, /* ld2_st2_permute_cost */
1065 0, /* ld3_st3_permute_cost */
1066 0, /* ld4_st4_permute_cost */
1067 10, /* permute_cost */
1068 6, /* reduc_i8_cost */
1069 6, /* reduc_i16_cost */
1070 6, /* reduc_i32_cost */
1071 6, /* reduc_i64_cost */
1072 6, /* reduc_f16_cost */
1073 6, /* reduc_f32_cost */
1074 6, /* reduc_f64_cost */
1075 6, /* store_elt_extra_cost */
1076 6, /* vec_to_scalar_cost */
1077 5, /* scalar_to_vec_cost */
1078 4, /* align_load_cost */
1079 4, /* unalign_load_cost */
1080 1, /* unalign_store_cost */
1081 1 /* store_cost */
1084 /* ThunderX2 T99 (formerly Vulcan) costs for vector insn classes. */
1085 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1087 1, /* scalar_int_stmt_cost */
1088 6, /* scalar_fp_stmt_cost */
1089 4, /* scalar_load_cost */
1090 1, /* scalar_store_cost */
1091 2, /* cond_taken_branch_cost */
1092 1, /* cond_not_taken_branch_cost */
1093 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1094 nullptr, /* sve */
1095 nullptr /* issue_info */
1098 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1100 5, /* int_stmt_cost */
1101 5, /* fp_stmt_cost */
1102 0, /* ld2_st2_permute_cost */
1103 0, /* ld3_st3_permute_cost */
1104 0, /* ld4_st4_permute_cost */
1105 10, /* permute_cost */
1106 5, /* reduc_i8_cost */
1107 5, /* reduc_i16_cost */
1108 5, /* reduc_i32_cost */
1109 5, /* reduc_i64_cost */
1110 5, /* reduc_f16_cost */
1111 5, /* reduc_f32_cost */
1112 5, /* reduc_f64_cost */
1113 5, /* store_elt_extra_cost */
1114 5, /* vec_to_scalar_cost */
1115 5, /* scalar_to_vec_cost */
1116 4, /* align_load_cost */
1117 4, /* unalign_load_cost */
1118 4, /* unalign_store_cost */
1119 4 /* store_cost */
1122 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1124 1, /* scalar_int_stmt_cost */
1125 5, /* scalar_fp_stmt_cost */
1126 4, /* scalar_load_cost */
1127 1, /* scalar_store_cost */
1128 2, /* cond_taken_branch_cost */
1129 1, /* cond_not_taken_branch_cost */
1130 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1131 nullptr, /* sve */
1132 nullptr /* issue_info */
1135 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1137 3, /* int_stmt_cost */
1138 3, /* fp_stmt_cost */
1139 0, /* ld2_st2_permute_cost */
1140 0, /* ld3_st3_permute_cost */
1141 0, /* ld4_st4_permute_cost */
1142 2, /* permute_cost */
1143 12, /* reduc_i8_cost */
1144 9, /* reduc_i16_cost */
1145 6, /* reduc_i32_cost */
1146 5, /* reduc_i64_cost */
1147 9, /* reduc_f16_cost */
1148 6, /* reduc_f32_cost */
1149 5, /* reduc_f64_cost */
1150 8, /* store_elt_extra_cost */
1151 6, /* vec_to_scalar_cost */
1152 7, /* scalar_to_vec_cost */
1153 5, /* align_load_cost */
1154 5, /* unalign_load_cost */
1155 2, /* unalign_store_cost */
1156 2 /* store_cost */
1159 /* Ampere-1 costs for vector insn classes. */
1160 static const struct cpu_vector_cost ampere1_vector_cost =
1162 1, /* scalar_int_stmt_cost */
1163 1, /* scalar_fp_stmt_cost */
1164 4, /* scalar_load_cost */
1165 1, /* scalar_store_cost */
1166 1, /* cond_taken_branch_cost */
1167 1, /* cond_not_taken_branch_cost */
1168 &ampere1_advsimd_vector_cost, /* advsimd */
1169 nullptr, /* sve */
1170 nullptr /* issue_info */
1173 /* Generic costs for branch instructions. */
1174 static const struct cpu_branch_cost generic_branch_cost =
1176 1, /* Predictable. */
1177 3 /* Unpredictable. */
1180 /* Generic approximation modes. */
1181 static const cpu_approx_modes generic_approx_modes =
1183 AARCH64_APPROX_NONE, /* division */
1184 AARCH64_APPROX_NONE, /* sqrt */
1185 AARCH64_APPROX_NONE /* recip_sqrt */
1188 /* Approximation modes for Exynos M1. */
1189 static const cpu_approx_modes exynosm1_approx_modes =
1191 AARCH64_APPROX_NONE, /* division */
1192 AARCH64_APPROX_ALL, /* sqrt */
1193 AARCH64_APPROX_ALL /* recip_sqrt */
1196 /* Approximation modes for X-Gene 1. */
1197 static const cpu_approx_modes xgene1_approx_modes =
1199 AARCH64_APPROX_NONE, /* division */
1200 AARCH64_APPROX_NONE, /* sqrt */
1201 AARCH64_APPROX_ALL /* recip_sqrt */
1204 /* Generic prefetch settings (which disable prefetch). */
1205 static const cpu_prefetch_tune generic_prefetch_tune =
1207 0, /* num_slots */
1208 -1, /* l1_cache_size */
1209 -1, /* l1_cache_line_size */
1210 -1, /* l2_cache_size */
1211 true, /* prefetch_dynamic_strides */
1212 -1, /* minimum_stride */
1213 -1 /* default_opt_level */
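/* In these cpu_prefetch_tune tables a value of -1 appears to mean "no
   CPU-specific value; leave the corresponding --param at its default",
   which is why the generic table above overrides nothing.  */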
1216 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1218 0, /* num_slots */
1219 -1, /* l1_cache_size */
1220 64, /* l1_cache_line_size */
1221 -1, /* l2_cache_size */
1222 true, /* prefetch_dynamic_strides */
1223 -1, /* minimum_stride */
1224 -1 /* default_opt_level */
1227 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1229 4, /* num_slots */
1230 32, /* l1_cache_size */
1231 64, /* l1_cache_line_size */
1232 512, /* l2_cache_size */
1233 false, /* prefetch_dynamic_strides */
1234 2048, /* minimum_stride */
1235 3 /* default_opt_level */
1238 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1240 8, /* num_slots */
1241 32, /* l1_cache_size */
1242 128, /* l1_cache_line_size */
1243 16*1024, /* l2_cache_size */
1244 true, /* prefetch_dynamic_strides */
1245 -1, /* minimum_stride */
1246 3 /* default_opt_level */
1249 static const cpu_prefetch_tune thunderx_prefetch_tune =
1251 8, /* num_slots */
1252 32, /* l1_cache_size */
1253 128, /* l1_cache_line_size */
1254 -1, /* l2_cache_size */
1255 true, /* prefetch_dynamic_strides */
1256 -1, /* minimum_stride */
1257 -1 /* default_opt_level */
1260 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1262 8, /* num_slots */
1263 32, /* l1_cache_size */
1264 64, /* l1_cache_line_size */
1265 256, /* l2_cache_size */
1266 true, /* prefetch_dynamic_strides */
1267 -1, /* minimum_stride */
1268 -1 /* default_opt_level */
1271 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1273 8, /* num_slots */
1274 32, /* l1_cache_size */
1275 64, /* l1_cache_line_size */
1276 256, /* l2_cache_size */
1277 true, /* prefetch_dynamic_strides */
1278 -1, /* minimum_stride */
1279 -1 /* default_opt_level */
1282 static const cpu_prefetch_tune tsv110_prefetch_tune =
1284 0, /* num_slots */
1285 64, /* l1_cache_size */
1286 64, /* l1_cache_line_size */
1287 512, /* l2_cache_size */
1288 true, /* prefetch_dynamic_strides */
1289 -1, /* minimum_stride */
1290 -1 /* default_opt_level */
1293 static const cpu_prefetch_tune xgene1_prefetch_tune =
1295 8, /* num_slots */
1296 32, /* l1_cache_size */
1297 64, /* l1_cache_line_size */
1298 256, /* l2_cache_size */
1299 true, /* prefetch_dynamic_strides */
1300 -1, /* minimum_stride */
1301 -1 /* default_opt_level */
1304 static const cpu_prefetch_tune a64fx_prefetch_tune =
1306 8, /* num_slots */
1307 64, /* l1_cache_size */
1308 256, /* l1_cache_line_size */
1309 32768, /* l2_cache_size */
1310 true, /* prefetch_dynamic_strides */
1311 -1, /* minimum_stride */
1312 -1 /* default_opt_level */
1315 static const cpu_prefetch_tune ampere1_prefetch_tune =
1317 0, /* num_slots */
1318 64, /* l1_cache_size */
1319 64, /* l1_cache_line_size */
1320 2048, /* l2_cache_size */
1321 true, /* prefetch_dynamic_strides */
1322 -1, /* minimum_stride */
1323 -1 /* default_opt_level */
1326 static const struct tune_params generic_tunings =
1328 &cortexa57_extra_costs,
1329 &generic_addrcost_table,
1330 &generic_regmove_cost,
1331 &generic_vector_cost,
1332 &generic_branch_cost,
1333 &generic_approx_modes,
1334 SVE_NOT_IMPLEMENTED, /* sve_width */
1335 { 4, /* load_int. */
1336 4, /* store_int. */
1337 4, /* load_fp. */
1338 4, /* store_fp. */
1339 4, /* load_pred. */
1340 4 /* store_pred. */
1341 }, /* memmov_cost. */
1342 2, /* issue_rate */
1343 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1344 "16:12", /* function_align. */
1345 "4", /* jump_align. */
1346 "8", /* loop_align. */
1347 2, /* int_reassoc_width. */
1348 4, /* fp_reassoc_width. */
1349 1, /* fma_reassoc_width. */
1350 1, /* vec_reassoc_width. */
1351 2, /* min_div_recip_mul_sf. */
1352 2, /* min_div_recip_mul_df. */
1353 0, /* max_case_values. */
1354 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1355 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1356 Neoverse V1. It does not have a noticeable effect on A64FX and should
1357 have at most a very minor effect on SVE2 cores. */
1358 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1359 &generic_prefetch_tune
1362 static const struct tune_params cortexa35_tunings =
1364 &cortexa53_extra_costs,
1365 &generic_addrcost_table,
1366 &cortexa53_regmove_cost,
1367 &generic_vector_cost,
1368 &generic_branch_cost,
1369 &generic_approx_modes,
1370 SVE_NOT_IMPLEMENTED, /* sve_width */
1371 { 4, /* load_int. */
1372 4, /* store_int. */
1373 4, /* load_fp. */
1374 4, /* store_fp. */
1375 4, /* load_pred. */
1376 4 /* store_pred. */
1377 }, /* memmov_cost. */
1378 1, /* issue_rate */
1379 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1380 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1381 "16", /* function_align. */
1382 "4", /* jump_align. */
1383 "8", /* loop_align. */
1384 2, /* int_reassoc_width. */
1385 4, /* fp_reassoc_width. */
1386 1, /* fma_reassoc_width. */
1387 1, /* vec_reassoc_width. */
1388 2, /* min_div_recip_mul_sf. */
1389 2, /* min_div_recip_mul_df. */
1390 0, /* max_case_values. */
1391 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1392 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1393 &generic_prefetch_tune
1396 static const struct tune_params cortexa53_tunings =
1398 &cortexa53_extra_costs,
1399 &generic_addrcost_table,
1400 &cortexa53_regmove_cost,
1401 &generic_vector_cost,
1402 &generic_branch_cost,
1403 &generic_approx_modes,
1404 SVE_NOT_IMPLEMENTED, /* sve_width */
1405 { 4, /* load_int. */
1406 4, /* store_int. */
1407 4, /* load_fp. */
1408 4, /* store_fp. */
1409 4, /* load_pred. */
1410 4 /* store_pred. */
1411 }, /* memmov_cost. */
1412 2, /* issue_rate */
1413 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1414 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1415 "16", /* function_align. */
1416 "4", /* jump_align. */
1417 "8", /* loop_align. */
1418 2, /* int_reassoc_width. */
1419 4, /* fp_reassoc_width. */
1420 1, /* fma_reassoc_width. */
1421 1, /* vec_reassoc_width. */
1422 2, /* min_div_recip_mul_sf. */
1423 2, /* min_div_recip_mul_df. */
1424 0, /* max_case_values. */
1425 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1426 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1427 &generic_prefetch_tune
1430 static const struct tune_params cortexa57_tunings =
1432 &cortexa57_extra_costs,
1433 &generic_addrcost_table,
1434 &cortexa57_regmove_cost,
1435 &cortexa57_vector_cost,
1436 &generic_branch_cost,
1437 &generic_approx_modes,
1438 SVE_NOT_IMPLEMENTED, /* sve_width */
1439 { 4, /* load_int. */
1440 4, /* store_int. */
1441 4, /* load_fp. */
1442 4, /* store_fp. */
1443 4, /* load_pred. */
1444 4 /* store_pred. */
1445 }, /* memmov_cost. */
1446 3, /* issue_rate */
1447 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1448 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1449 "16", /* function_align. */
1450 "4", /* jump_align. */
1451 "8", /* loop_align. */
1452 2, /* int_reassoc_width. */
1453 4, /* fp_reassoc_width. */
1454 1, /* fma_reassoc_width. */
1455 1, /* vec_reassoc_width. */
1456 2, /* min_div_recip_mul_sf. */
1457 2, /* min_div_recip_mul_df. */
1458 0, /* max_case_values. */
1459 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1460 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1461 &generic_prefetch_tune
1464 static const struct tune_params cortexa72_tunings =
1466 &cortexa57_extra_costs,
1467 &generic_addrcost_table,
1468 &cortexa57_regmove_cost,
1469 &cortexa57_vector_cost,
1470 &generic_branch_cost,
1471 &generic_approx_modes,
1472 SVE_NOT_IMPLEMENTED, /* sve_width */
1473 { 4, /* load_int. */
1474 4, /* store_int. */
1475 4, /* load_fp. */
1476 4, /* store_fp. */
1477 4, /* load_pred. */
1478 4 /* store_pred. */
1479 }, /* memmov_cost. */
1480 3, /* issue_rate */
1481 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1482 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1483 "16", /* function_align. */
1484 "4", /* jump_align. */
1485 "8", /* loop_align. */
1486 2, /* int_reassoc_width. */
1487 4, /* fp_reassoc_width. */
1488 1, /* fma_reassoc_width. */
1489 1, /* vec_reassoc_width. */
1490 2, /* min_div_recip_mul_sf. */
1491 2, /* min_div_recip_mul_df. */
1492 0, /* max_case_values. */
1493 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1494 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1495 &generic_prefetch_tune
1498 static const struct tune_params cortexa73_tunings =
1500 &cortexa57_extra_costs,
1501 &generic_addrcost_table,
1502 &cortexa57_regmove_cost,
1503 &cortexa57_vector_cost,
1504 &generic_branch_cost,
1505 &generic_approx_modes,
1506 SVE_NOT_IMPLEMENTED, /* sve_width */
1507 { 4, /* load_int. */
1508 4, /* store_int. */
1509 4, /* load_fp. */
1510 4, /* store_fp. */
1511 4, /* load_pred. */
1512 4 /* store_pred. */
1513 }, /* memmov_cost. */
1514 2, /* issue_rate. */
1515 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1516 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1517 "16", /* function_align. */
1518 "4", /* jump_align. */
1519 "8", /* loop_align. */
1520 2, /* int_reassoc_width. */
1521 4, /* fp_reassoc_width. */
1522 1, /* fma_reassoc_width. */
1523 1, /* vec_reassoc_width. */
1524 2, /* min_div_recip_mul_sf. */
1525 2, /* min_div_recip_mul_df. */
1526 0, /* max_case_values. */
1527 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1528 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1529 &generic_prefetch_tune
1534 static const struct tune_params exynosm1_tunings =
1536 &exynosm1_extra_costs,
1537 &exynosm1_addrcost_table,
1538 &exynosm1_regmove_cost,
1539 &exynosm1_vector_cost,
1540 &generic_branch_cost,
1541 &exynosm1_approx_modes,
1542 SVE_NOT_IMPLEMENTED, /* sve_width */
1543 { 4, /* load_int. */
1544 4, /* store_int. */
1545 4, /* load_fp. */
1546 4, /* store_fp. */
1547 4, /* load_pred. */
1548 4 /* store_pred. */
1549 }, /* memmov_cost. */
1550 3, /* issue_rate */
1551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1552 "4", /* function_align. */
1553 "4", /* jump_align. */
1554 "4", /* loop_align. */
1555 2, /* int_reassoc_width. */
1556 4, /* fp_reassoc_width. */
1557 1, /* fma_reassoc_width. */
1558 1, /* vec_reassoc_width. */
1559 2, /* min_div_recip_mul_sf. */
1560 2, /* min_div_recip_mul_df. */
1561 48, /* max_case_values. */
1562 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1563 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1564 &exynosm1_prefetch_tune
1567 static const struct tune_params thunderxt88_tunings =
1569 &thunderx_extra_costs,
1570 &generic_addrcost_table,
1571 &thunderx_regmove_cost,
1572 &thunderx_vector_cost,
1573 &generic_branch_cost,
1574 &generic_approx_modes,
1575 SVE_NOT_IMPLEMENTED, /* sve_width */
1576 { 6, /* load_int. */
1577 6, /* store_int. */
1578 6, /* load_fp. */
1579 6, /* store_fp. */
1580 6, /* load_pred. */
1581 6 /* store_pred. */
1582 }, /* memmov_cost. */
1583 2, /* issue_rate */
1584 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1585 "8", /* function_align. */
1586 "8", /* jump_align. */
1587 "8", /* loop_align. */
1588 2, /* int_reassoc_width. */
1589 4, /* fp_reassoc_width. */
1590 1, /* fma_reassoc_width. */
1591 1, /* vec_reassoc_width. */
1592 2, /* min_div_recip_mul_sf. */
1593 2, /* min_div_recip_mul_df. */
1594 0, /* max_case_values. */
1595 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1596 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1597 &thunderxt88_prefetch_tune
1600 static const struct tune_params thunderx_tunings =
1602 &thunderx_extra_costs,
1603 &generic_addrcost_table,
1604 &thunderx_regmove_cost,
1605 &thunderx_vector_cost,
1606 &generic_branch_cost,
1607 &generic_approx_modes,
1608 SVE_NOT_IMPLEMENTED, /* sve_width */
1609 { 6, /* load_int. */
1610 6, /* store_int. */
1611 6, /* load_fp. */
1612 6, /* store_fp. */
1613 6, /* load_pred. */
1614 6 /* store_pred. */
1615 }, /* memmov_cost. */
1616 2, /* issue_rate */
1617 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1618 "8", /* function_align. */
1619 "8", /* jump_align. */
1620 "8", /* loop_align. */
1621 2, /* int_reassoc_width. */
1622 4, /* fp_reassoc_width. */
1623 1, /* fma_reassoc_width. */
1624 1, /* vec_reassoc_width. */
1625 2, /* min_div_recip_mul_sf. */
1626 2, /* min_div_recip_mul_df. */
1627 0, /* max_case_values. */
1628 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1629 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1630 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1631 &thunderx_prefetch_tune
1634 static const struct tune_params tsv110_tunings =
1636 &tsv110_extra_costs,
1637 &tsv110_addrcost_table,
1638 &tsv110_regmove_cost,
1639 &tsv110_vector_cost,
1640 &generic_branch_cost,
1641 &generic_approx_modes,
1642 SVE_NOT_IMPLEMENTED, /* sve_width */
1643 { 4, /* load_int. */
1644 4, /* store_int. */
1645 4, /* load_fp. */
1646 4, /* store_fp. */
1647 4, /* load_pred. */
1648 4 /* store_pred. */
1649 }, /* memmov_cost. */
1650 4, /* issue_rate */
1651 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1652 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1653 "16", /* function_align. */
1654 "4", /* jump_align. */
1655 "8", /* loop_align. */
1656 2, /* int_reassoc_width. */
1657 4, /* fp_reassoc_width. */
1658 1, /* fma_reassoc_width. */
1659 1, /* vec_reassoc_width. */
1660 2, /* min_div_recip_mul_sf. */
1661 2, /* min_div_recip_mul_df. */
1662 0, /* max_case_values. */
1663 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1664 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1665 &tsv110_prefetch_tune
1668 static const struct tune_params xgene1_tunings =
1670 &xgene1_extra_costs,
1671 &xgene1_addrcost_table,
1672 &xgene1_regmove_cost,
1673 &xgene1_vector_cost,
1674 &generic_branch_cost,
1675 &xgene1_approx_modes,
1676 SVE_NOT_IMPLEMENTED, /* sve_width */
1677 { 6, /* load_int. */
1678 6, /* store_int. */
1679 6, /* load_fp. */
1680 6, /* store_fp. */
1681 6, /* load_pred. */
1682 6 /* store_pred. */
1683 }, /* memmov_cost. */
1684 4, /* issue_rate */
1685 AARCH64_FUSE_NOTHING, /* fusible_ops */
1686 "16", /* function_align. */
1687 "16", /* jump_align. */
1688 "16", /* loop_align. */
1689 2, /* int_reassoc_width. */
1690 4, /* fp_reassoc_width. */
1691 1, /* fma_reassoc_width. */
1692 1, /* vec_reassoc_width. */
1693 2, /* min_div_recip_mul_sf. */
1694 2, /* min_div_recip_mul_df. */
1695 17, /* max_case_values. */
1696 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1697 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1698 &xgene1_prefetch_tune
1701 static const struct tune_params emag_tunings =
1703 &xgene1_extra_costs,
1704 &xgene1_addrcost_table,
1705 &xgene1_regmove_cost,
1706 &xgene1_vector_cost,
1707 &generic_branch_cost,
1708 &xgene1_approx_modes,
1709 SVE_NOT_IMPLEMENTED,
1710 { 6, /* load_int. */
1711 6, /* store_int. */
1712 6, /* load_fp. */
1713 6, /* store_fp. */
1714 6, /* load_pred. */
1715 6 /* store_pred. */
1716 }, /* memmov_cost. */
1717 4, /* issue_rate */
1718 AARCH64_FUSE_NOTHING, /* fusible_ops */
1719 "16", /* function_align. */
1720 "16", /* jump_align. */
1721 "16", /* loop_align. */
1722 2, /* int_reassoc_width. */
1723 4, /* fp_reassoc_width. */
1724 1, /* fma_reassoc_width. */
1725 1, /* vec_reassoc_width. */
1726 2, /* min_div_recip_mul_sf. */
1727 2, /* min_div_recip_mul_df. */
1728 17, /* max_case_values. */
1729 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1730 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1731 &xgene1_prefetch_tune
1734 static const struct tune_params qdf24xx_tunings =
1736 &qdf24xx_extra_costs,
1737 &qdf24xx_addrcost_table,
1738 &qdf24xx_regmove_cost,
1739 &qdf24xx_vector_cost,
1740 &generic_branch_cost,
1741 &generic_approx_modes,
1742 SVE_NOT_IMPLEMENTED, /* sve_width */
1743 { 4, /* load_int. */
1744 4, /* store_int. */
1745 4, /* load_fp. */
1746 4, /* store_fp. */
1747 4, /* load_pred. */
1748 4 /* store_pred. */
1749 }, /* memmov_cost. */
1750 4, /* issue_rate */
1751 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1752 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1753 "16", /* function_align. */
1754 "8", /* jump_align. */
1755 "16", /* loop_align. */
1756 2, /* int_reassoc_width. */
1757 4, /* fp_reassoc_width. */
1758 1, /* fma_reassoc_width. */
1759 1, /* vec_reassoc_width. */
1760 2, /* min_div_recip_mul_sf. */
1761 2, /* min_div_recip_mul_df. */
1762 0, /* max_case_values. */
1763 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1764 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1765 &qdf24xx_prefetch_tune
1768 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1769 for now. */
1770 static const struct tune_params saphira_tunings =
1772 &generic_extra_costs,
1773 &generic_addrcost_table,
1774 &generic_regmove_cost,
1775 &generic_vector_cost,
1776 &generic_branch_cost,
1777 &generic_approx_modes,
1778 SVE_NOT_IMPLEMENTED, /* sve_width */
1779 { 4, /* load_int. */
1780 4, /* store_int. */
1781 4, /* load_fp. */
1782 4, /* store_fp. */
1783 4, /* load_pred. */
1784 4 /* store_pred. */
1785 }, /* memmov_cost. */
1786 4, /* issue_rate */
1787 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1788 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1789 "16", /* function_align. */
1790 "8", /* jump_align. */
1791 "16", /* loop_align. */
1792 2, /* int_reassoc_width. */
1793 4, /* fp_reassoc_width. */
1794 1, /* fma_reassoc_width. */
1795 1, /* vec_reassoc_width. */
1796 2, /* min_div_recip_mul_sf. */
1797 2, /* min_div_recip_mul_df. */
1798 0, /* max_case_values. */
1799 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1800 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1801 &generic_prefetch_tune
1804 static const struct tune_params thunderx2t99_tunings =
1806 &thunderx2t99_extra_costs,
1807 &thunderx2t99_addrcost_table,
1808 &thunderx2t99_regmove_cost,
1809 &thunderx2t99_vector_cost,
1810 &generic_branch_cost,
1811 &generic_approx_modes,
1812 SVE_NOT_IMPLEMENTED, /* sve_width */
1813 { 4, /* load_int. */
1814 4, /* store_int. */
1815 4, /* load_fp. */
1816 4, /* store_fp. */
1817 4, /* load_pred. */
1818 4 /* store_pred. */
1819 }, /* memmov_cost. */
1820 4, /* issue_rate. */
1821 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1822 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1823 "16", /* function_align. */
1824 "8", /* jump_align. */
1825 "16", /* loop_align. */
1826 3, /* int_reassoc_width. */
1827 2, /* fp_reassoc_width. */
1828 1, /* fma_reassoc_width. */
1829 2, /* vec_reassoc_width. */
1830 2, /* min_div_recip_mul_sf. */
1831 2, /* min_div_recip_mul_df. */
1832 0, /* max_case_values. */
1833 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1834 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1835 &thunderx2t99_prefetch_tune
1838 static const struct tune_params thunderx3t110_tunings =
1840 &thunderx3t110_extra_costs,
1841 &thunderx3t110_addrcost_table,
1842 &thunderx3t110_regmove_cost,
1843 &thunderx3t110_vector_cost,
1844 &generic_branch_cost,
1845 &generic_approx_modes,
1846 SVE_NOT_IMPLEMENTED, /* sve_width */
1847 { 4, /* load_int. */
1848 4, /* store_int. */
1849 4, /* load_fp. */
1850 4, /* store_fp. */
1851 4, /* load_pred. */
1852 4 /* store_pred. */
1853 }, /* memmov_cost. */
1854 6, /* issue_rate. */
1855 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1856 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1857 "16", /* function_align. */
1858 "8", /* jump_align. */
1859 "16", /* loop_align. */
1860 3, /* int_reassoc_width. */
1861 2, /* fp_reassoc_width. */
1862 1, /* fma_reassoc_width. */
1863 2, /* vec_reassoc_width. */
1864 2, /* min_div_recip_mul_sf. */
1865 2, /* min_div_recip_mul_df. */
1866 0, /* max_case_values. */
1867 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1868 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1869 &thunderx3t110_prefetch_tune
1872 static const struct tune_params neoversen1_tunings =
1874 &cortexa76_extra_costs,
1875 &generic_addrcost_table,
1876 &generic_regmove_cost,
1877 &cortexa57_vector_cost,
1878 &generic_branch_cost,
1879 &generic_approx_modes,
1880 SVE_NOT_IMPLEMENTED, /* sve_width */
1881 { 4, /* load_int. */
1882 2, /* store_int. */
1883 5, /* load_fp. */
1884 2, /* store_fp. */
1885 4, /* load_pred. */
1886 4 /* store_pred. */
1887 }, /* memmov_cost. */
1888 3, /* issue_rate */
1889 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1890 "32:16", /* function_align. */
1891 "4", /* jump_align. */
1892 "32:16", /* loop_align. */
1893 2, /* int_reassoc_width. */
1894 4, /* fp_reassoc_width. */
1895 1, /* fma_reassoc_width. */
1896 2, /* vec_reassoc_width. */
1897 2, /* min_div_recip_mul_sf. */
1898 2, /* min_div_recip_mul_df. */
1899 0, /* max_case_values. */
1900 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1901 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1902 &generic_prefetch_tune
1905 static const struct tune_params ampere1_tunings =
1907 &ampere1_extra_costs,
1908 &generic_addrcost_table,
1909 &generic_regmove_cost,
1910 &ampere1_vector_cost,
1911 &generic_branch_cost,
1912 &generic_approx_modes,
1913 SVE_NOT_IMPLEMENTED, /* sve_width */
1914 { 4, /* load_int. */
1915 4, /* store_int. */
1916 4, /* load_fp. */
1917 4, /* store_fp. */
1918 4, /* load_pred. */
1919 4 /* store_pred. */
1920 }, /* memmov_cost. */
1921 4, /* issue_rate */
1922 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1923 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1924 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1925 AARCH64_FUSE_CMP_BRANCH),
1926 /* fusible_ops */
1927 "32", /* function_align. */
1928 "4", /* jump_align. */
1929 "32:16", /* loop_align. */
1930 2, /* int_reassoc_width. */
1931 4, /* fp_reassoc_width. */
1932 1, /* fma_reassoc_width. */
1933 2, /* vec_reassoc_width. */
1934 2, /* min_div_recip_mul_sf. */
1935 2, /* min_div_recip_mul_df. */
1936 0, /* max_case_values. */
1937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1939 &ampere1_prefetch_tune
1942 static const struct tune_params ampere1a_tunings =
1944 &ampere1a_extra_costs,
1945 &generic_addrcost_table,
1946 &generic_regmove_cost,
1947 &ampere1_vector_cost,
1948 &generic_branch_cost,
1949 &generic_approx_modes,
1950 SVE_NOT_IMPLEMENTED, /* sve_width */
1951 { 4, /* load_int. */
1952 4, /* store_int. */
1953 4, /* load_fp. */
1954 4, /* store_fp. */
1955 4, /* load_pred. */
1956 4 /* store_pred. */
1957 }, /* memmov_cost. */
1958 4, /* issue_rate */
1959 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1960 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1961 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1962 AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1963 AARCH64_FUSE_ADDSUB_2REG_CONST1),
1964 /* fusible_ops */
1965 "32", /* function_align. */
1966 "4", /* jump_align. */
1967 "32:16", /* loop_align. */
1968 2, /* int_reassoc_width. */
1969 4, /* fp_reassoc_width. */
1970 1, /* fma_reassoc_width. */
1971 2, /* vec_reassoc_width. */
1972 2, /* min_div_recip_mul_sf. */
1973 2, /* min_div_recip_mul_df. */
1974 0, /* max_case_values. */
1975 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1976 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1977 &ampere1_prefetch_tune
1980 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1982 2, /* int_stmt_cost */
1983 2, /* fp_stmt_cost */
1984 4, /* ld2_st2_permute_cost */
1985 4, /* ld3_st3_permute_cost */
1986 5, /* ld4_st4_permute_cost */
1987 3, /* permute_cost */
1988 4, /* reduc_i8_cost */
1989 4, /* reduc_i16_cost */
1990 2, /* reduc_i32_cost */
1991 2, /* reduc_i64_cost */
1992 6, /* reduc_f16_cost */
1993 3, /* reduc_f32_cost */
1994 2, /* reduc_f64_cost */
1995 2, /* store_elt_extra_cost */
1996 /* This value is just inherited from the Cortex-A57 table. */
1997 8, /* vec_to_scalar_cost */
1998 /* This depends very much on what the scalar value is and
1999 where it comes from. E.g. some constants take two dependent
2000 instructions or a load, while others might be moved from a GPR.
2001 4 seems to be a reasonable compromise in practice. */
2002 4, /* scalar_to_vec_cost */
2003 4, /* align_load_cost */
2004 4, /* unalign_load_cost */
2005 /* Although stores have a latency of 2 and compete for the
2006 vector pipes, in practice it's better not to model that. */
2007 1, /* unalign_store_cost */
2008 1 /* store_cost */
2011 static const sve_vec_cost neoversev1_sve_vector_cost =
2014 2, /* int_stmt_cost */
2015 2, /* fp_stmt_cost */
2016 4, /* ld2_st2_permute_cost */
2017 7, /* ld3_st3_permute_cost */
2018 8, /* ld4_st4_permute_cost */
2019 3, /* permute_cost */
2020 /* Theoretically, a reduction involving 31 scalar ADDs could
2021 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
2022 completes in 14 cycles, so give it a cost of 31 + 5. */
2023 36, /* reduc_i8_cost */
2024 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
2025 22, /* reduc_i16_cost */
2026 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
2027 14, /* reduc_i32_cost */
2028 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
2029 11, /* reduc_i64_cost */
2030 /* Theoretically, a reduction involving 15 scalar FADDs could
2031 complete in ~9 cycles and would have a cost of 30. FADDV
2032 completes in 13 cycles, so give it a cost of 30 + 4. */
2033 34, /* reduc_f16_cost */
2034 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
2035 19, /* reduc_f32_cost */
2036 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
2037 11, /* reduc_f64_cost */
2038 2, /* store_elt_extra_cost */
2039 /* This value is just inherited from the Cortex-A57 table. */
2040 8, /* vec_to_scalar_cost */
2041 /* See the comment above the Advanced SIMD versions. */
2042 4, /* scalar_to_vec_cost */
2043 4, /* align_load_cost */
2044 4, /* unalign_load_cost */
2045 /* Although stores have a latency of 2 and compete for the
2046 vector pipes, in practice it's better not to model that. */
2047 1, /* unalign_store_cost */
2048 1 /* store_cost */
2050 3, /* clast_cost */
2051 19, /* fadda_f16_cost */
2052 11, /* fadda_f32_cost */
2053 8, /* fadda_f64_cost */
2054 32, /* gather_load_x32_cost */
2055 16, /* gather_load_x64_cost */
2056 3 /* scatter_store_elt_cost */
2059 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2061 3, /* loads_stores_per_cycle */
2062 2, /* stores_per_cycle */
2063 4, /* general_ops_per_cycle */
2064 0, /* fp_simd_load_general_ops */
2065 1 /* fp_simd_store_general_ops */
2068 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2071 3, /* loads_stores_per_cycle */
2072 2, /* stores_per_cycle */
2073 4, /* general_ops_per_cycle */
2074 0, /* fp_simd_load_general_ops */
2075 1 /* fp_simd_store_general_ops */
2077 2, /* ld2_st2_general_ops */
2078 2, /* ld3_st3_general_ops */
2079 3 /* ld4_st4_general_ops */
2082 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2086 2, /* loads_per_cycle */
2087 2, /* stores_per_cycle */
2088 2, /* general_ops_per_cycle */
2089 0, /* fp_simd_load_general_ops */
2090 1 /* fp_simd_store_general_ops */
2092 2, /* ld2_st2_general_ops */
2093 2, /* ld3_st3_general_ops */
2094 3 /* ld4_st4_general_ops */
2096 1, /* pred_ops_per_cycle */
2097 2, /* while_pred_ops */
2098 2, /* int_cmp_pred_ops */
2099 1, /* fp_cmp_pred_ops */
2100 1, /* gather_scatter_pair_general_ops */
2101 1 /* gather_scatter_pair_pred_ops */
2104 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2106 &neoversev1_scalar_issue_info,
2107 &neoversev1_advsimd_issue_info,
2108 &neoversev1_sve_issue_info
2111 /* Neoverse V1 costs for vector insn classes. */
2112 static const struct cpu_vector_cost neoversev1_vector_cost =
2114 1, /* scalar_int_stmt_cost */
2115 2, /* scalar_fp_stmt_cost */
2116 4, /* scalar_load_cost */
2117 1, /* scalar_store_cost */
2118 1, /* cond_taken_branch_cost */
2119 1, /* cond_not_taken_branch_cost */
2120 &neoversev1_advsimd_vector_cost, /* advsimd */
2121 &neoversev1_sve_vector_cost, /* sve */
2122 &neoversev1_vec_issue_info /* issue_info */
2125 static const struct tune_params neoversev1_tunings =
2127 &cortexa76_extra_costs,
2128 &neoversev1_addrcost_table,
2129 &neoversev1_regmove_cost,
2130 &neoversev1_vector_cost,
2131 &generic_branch_cost,
2132 &generic_approx_modes,
2133 SVE_256, /* sve_width */
2134 { 4, /* load_int. */
2135 2, /* store_int. */
2136 6, /* load_fp. */
2137 2, /* store_fp. */
2138 6, /* load_pred. */
2139 1 /* store_pred. */
2140 }, /* memmov_cost. */
2141 3, /* issue_rate */
2142 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2143 "32:16", /* function_align. */
2144 "4", /* jump_align. */
2145 "32:16", /* loop_align. */
2146 2, /* int_reassoc_width. */
2147 4, /* fp_reassoc_width. */
2148 4, /* fma_reassoc_width. */
2149 2, /* vec_reassoc_width. */
2150 2, /* min_div_recip_mul_sf. */
2151 2, /* min_div_recip_mul_df. */
2152 0, /* max_case_values. */
2153 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2154 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2155 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2156 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2157 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
2158 &generic_prefetch_tune
2161 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2164 2, /* int_stmt_cost */
2165 2, /* fp_stmt_cost */
2166 4, /* ld2_st2_permute_cost */
2167 5, /* ld3_st3_permute_cost */
2168 5, /* ld4_st4_permute_cost */
2169 3, /* permute_cost */
2170 /* Theoretically, a reduction involving 15 scalar ADDs could
2171 complete in ~5 cycles and would have a cost of 15. Assume that
2172 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2173 21, /* reduc_i8_cost */
2174 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2175 13, /* reduc_i16_cost */
2176 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2177 9, /* reduc_i32_cost */
2178 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2179 8, /* reduc_i64_cost */
2180 /* Theoretically, a reduction involving 7 scalar FADDs could
2181 complete in ~6 cycles and would have a cost of 14. Assume that
2182 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2183 16, /* reduc_f16_cost */
2184 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2185 8, /* reduc_f32_cost */
2186 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2187 4, /* reduc_f64_cost */
2188 2, /* store_elt_extra_cost */
2189 /* This value is just inherited from the Cortex-A57 table. */
2190 8, /* vec_to_scalar_cost */
2191 /* This depends very much on what the scalar value is and
2192 where it comes from. E.g. some constants take two dependent
2193 instructions or a load, while others might be moved from a GPR.
2194 4 seems to be a reasonable compromise in practice. */
2195 4, /* scalar_to_vec_cost */
2196 4, /* align_load_cost */
2197 4, /* unalign_load_cost */
2198 /* Although stores generally have a latency of 2 and compete for the
2199 vector pipes, in practice it's better not to model that. */
2200 1, /* unalign_store_cost */
2201 1 /* store_cost */
2203 3, /* clast_cost */
2204 10, /* fadda_f16_cost */
2205 6, /* fadda_f32_cost */
2206 4, /* fadda_f64_cost */
2207 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2208 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2209 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2210 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2211 (cost 2) to that, to avoid the difference being lost in rounding.
2213 There is no easy comparison between a strided Advanced SIMD x32 load
2214 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2215 operation more than a 64-bit gather. */
2216 14, /* gather_load_x32_cost */
2217 12, /* gather_load_x64_cost */
2218 3 /* scatter_store_elt_cost */
2221 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2225 3, /* loads_per_cycle */
2226 2, /* stores_per_cycle */
2227 4, /* general_ops_per_cycle */
2228 0, /* fp_simd_load_general_ops */
2229 1 /* fp_simd_store_general_ops */
2231 2, /* ld2_st2_general_ops */
2232 2, /* ld3_st3_general_ops */
2233 3 /* ld4_st4_general_ops */
2235 2, /* pred_ops_per_cycle */
2236 2, /* while_pred_ops */
2237 2, /* int_cmp_pred_ops */
2238 1, /* fp_cmp_pred_ops */
2239 1, /* gather_scatter_pair_general_ops */
2240 1 /* gather_scatter_pair_pred_ops */
2243 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2245 &neoversev1_scalar_issue_info,
2246 &neoversev1_advsimd_issue_info,
2247 &neoverse512tvb_sve_issue_info
2250 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2252 1, /* scalar_int_stmt_cost */
2253 2, /* scalar_fp_stmt_cost */
2254 4, /* scalar_load_cost */
2255 1, /* scalar_store_cost */
2256 1, /* cond_taken_branch_cost */
2257 1, /* cond_not_taken_branch_cost */
2258 &neoversev1_advsimd_vector_cost, /* advsimd */
2259 &neoverse512tvb_sve_vector_cost, /* sve */
2260 &neoverse512tvb_vec_issue_info /* issue_info */
2263 static const struct tune_params neoverse512tvb_tunings =
2265 &cortexa76_extra_costs,
2266 &neoversev1_addrcost_table,
2267 &neoversev1_regmove_cost,
2268 &neoverse512tvb_vector_cost,
2269 &generic_branch_cost,
2270 &generic_approx_modes,
2271 SVE_128 | SVE_256, /* sve_width */
2272 { 4, /* load_int. */
2273 2, /* store_int. */
2274 6, /* load_fp. */
2275 2, /* store_fp. */
2276 6, /* load_pred. */
2277 1 /* store_pred. */
2278 }, /* memmov_cost. */
2279 3, /* issue_rate */
2280 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2281 "32:16", /* function_align. */
2282 "4", /* jump_align. */
2283 "32:16", /* loop_align. */
2284 2, /* int_reassoc_width. */
2285 4, /* fp_reassoc_width. */
2286 4, /* fma_reassoc_width. */
2287 2, /* vec_reassoc_width. */
2288 2, /* min_div_recip_mul_sf. */
2289 2, /* min_div_recip_mul_df. */
2290 0, /* max_case_values. */
2291 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2292 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2293 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2294 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2295 &generic_prefetch_tune
2298 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2300 2, /* int_stmt_cost */
2301 2, /* fp_stmt_cost */
2302 2, /* ld2_st2_permute_cost */
2303 2, /* ld3_st3_permute_cost */
2304 3, /* ld4_st4_permute_cost */
2305 3, /* permute_cost */
2306 4, /* reduc_i8_cost */
2307 4, /* reduc_i16_cost */
2308 2, /* reduc_i32_cost */
2309 2, /* reduc_i64_cost */
2310 6, /* reduc_f16_cost */
2311 4, /* reduc_f32_cost */
2312 2, /* reduc_f64_cost */
2313 2, /* store_elt_extra_cost */
2314 /* This value is just inherited from the Cortex-A57 table. */
2315 8, /* vec_to_scalar_cost */
2316 /* This depends very much on what the scalar value is and
2317 where it comes from. E.g. some constants take two dependent
2318 instructions or a load, while others might be moved from a GPR.
2319 4 seems to be a reasonable compromise in practice. */
2320 4, /* scalar_to_vec_cost */
2321 4, /* align_load_cost */
2322 4, /* unalign_load_cost */
2323 /* Although stores have a latency of 2 and compete for the
2324 vector pipes, in practice it's better not to model that. */
2325 1, /* unalign_store_cost */
2326 1 /* store_cost */
2329 static const sve_vec_cost neoversen2_sve_vector_cost =
2332 2, /* int_stmt_cost */
2333 2, /* fp_stmt_cost */
2334 3, /* ld2_st2_permute_cost */
2335 4, /* ld3_st3_permute_cost */
2336 4, /* ld4_st4_permute_cost */
2337 3, /* permute_cost */
2338 /* Theoretically, a reduction involving 15 scalar ADDs could
2339 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2340 completes in 11 cycles, so give it a cost of 15 + 6. */
2341 21, /* reduc_i8_cost */
2342 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2343 13, /* reduc_i16_cost */
2344 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2345 9, /* reduc_i32_cost */
2346 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2347 2, /* reduc_i64_cost */
2348 /* Theoretically, a reduction involving 7 scalar FADDs could
2349 complete in ~8 cycles and would have a cost of 14. FADDV
2350 completes in 6 cycles, so give it a cost of 14 - 2. */
2351 12, /* reduc_f16_cost */
2352 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2353 6, /* reduc_f32_cost */
2354 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2355 2, /* reduc_f64_cost */
2356 2, /* store_elt_extra_cost */
2357 /* This value is just inherited from the Cortex-A57 table. */
2358 8, /* vec_to_scalar_cost */
2359 /* See the comment above the Advanced SIMD versions. */
2360 4, /* scalar_to_vec_cost */
2361 4, /* align_load_cost */
2362 4, /* unalign_load_cost */
2363 /* Although stores have a latency of 2 and compete for the
2364 vector pipes, in practice it's better not to model that. */
2365 1, /* unalign_store_cost */
2366 1 /* store_cost */
2368 3, /* clast_cost */
2369 10, /* fadda_f16_cost */
2370 6, /* fadda_f32_cost */
2371 4, /* fadda_f64_cost */
2372 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2373 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2374 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2375 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2376 (cost 2) to that, to avoid the difference being lost in rounding.
2378 There is no easy comparison between a strided Advanced SIMD x32 load
2379 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2380 operation more than a 64-bit gather. */
2381 14, /* gather_load_x32_cost */
2382 12, /* gather_load_x64_cost */
2383 3 /* scatter_store_elt_cost */
2386 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2388 3, /* loads_stores_per_cycle */
2389 2, /* stores_per_cycle */
2390 4, /* general_ops_per_cycle */
2391 0, /* fp_simd_load_general_ops */
2392 1 /* fp_simd_store_general_ops */
2395 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2398 3, /* loads_stores_per_cycle */
2399 2, /* stores_per_cycle */
2400 2, /* general_ops_per_cycle */
2401 0, /* fp_simd_load_general_ops */
2402 1 /* fp_simd_store_general_ops */
2404 2, /* ld2_st2_general_ops */
2405 2, /* ld3_st3_general_ops */
2406 3 /* ld4_st4_general_ops */
2409 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2413 3, /* loads_per_cycle */
2414 2, /* stores_per_cycle */
2415 2, /* general_ops_per_cycle */
2416 0, /* fp_simd_load_general_ops */
2417 1 /* fp_simd_store_general_ops */
2419 2, /* ld2_st2_general_ops */
2420 3, /* ld3_st3_general_ops */
2421 3 /* ld4_st4_general_ops */
2423 2, /* pred_ops_per_cycle */
2424 2, /* while_pred_ops */
2425 2, /* int_cmp_pred_ops */
2426 1, /* fp_cmp_pred_ops */
2427 1, /* gather_scatter_pair_general_ops */
2428 1 /* gather_scatter_pair_pred_ops */
2431 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2433 &neoversen2_scalar_issue_info,
2434 &neoversen2_advsimd_issue_info,
2435 &neoversen2_sve_issue_info
2438 /* Neoverse N2 costs for vector insn classes. */
2439 static const struct cpu_vector_cost neoversen2_vector_cost =
2441 1, /* scalar_int_stmt_cost */
2442 2, /* scalar_fp_stmt_cost */
2443 4, /* scalar_load_cost */
2444 1, /* scalar_store_cost */
2445 1, /* cond_taken_branch_cost */
2446 1, /* cond_not_taken_branch_cost */
2447 &neoversen2_advsimd_vector_cost, /* advsimd */
2448 &neoversen2_sve_vector_cost, /* sve */
2449 &neoversen2_vec_issue_info /* issue_info */
2452 static const struct tune_params neoversen2_tunings =
2454 &cortexa76_extra_costs,
2455 &neoversen2_addrcost_table,
2456 &neoversen2_regmove_cost,
2457 &neoversen2_vector_cost,
2458 &generic_branch_cost,
2459 &generic_approx_modes,
2460 SVE_128, /* sve_width */
2461 { 4, /* load_int. */
2462 1, /* store_int. */
2463 6, /* load_fp. */
2464 2, /* store_fp. */
2465 6, /* load_pred. */
2466 1 /* store_pred. */
2467 }, /* memmov_cost. */
2468 3, /* issue_rate */
2469 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2470 "32:16", /* function_align. */
2471 "4", /* jump_align. */
2472 "32:16", /* loop_align. */
2473 2, /* int_reassoc_width. */
2474 4, /* fp_reassoc_width. */
2475 1, /* fma_reassoc_width. */
2476 2, /* vec_reassoc_width. */
2477 2, /* min_div_recip_mul_sf. */
2478 2, /* min_div_recip_mul_df. */
2479 0, /* max_case_values. */
2480 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2481 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2482 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2483 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2484 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2485 &generic_prefetch_tune
2488 static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
2490 2, /* int_stmt_cost */
2491 2, /* fp_stmt_cost */
2492 2, /* ld2_st2_permute_cost */
2493 2, /* ld3_st3_permute_cost */
2494 3, /* ld4_st4_permute_cost */
2495 3, /* permute_cost */
2496 4, /* reduc_i8_cost */
2497 4, /* reduc_i16_cost */
2498 2, /* reduc_i32_cost */
2499 2, /* reduc_i64_cost */
2500 6, /* reduc_f16_cost */
2501 3, /* reduc_f32_cost */
2502 2, /* reduc_f64_cost */
2503 2, /* store_elt_extra_cost */
2504 /* This value is just inherited from the Cortex-A57 table. */
2505 8, /* vec_to_scalar_cost */
2506 /* This depends very much on what the scalar value is and
2507 where it comes from. E.g. some constants take two dependent
2508 instructions or a load, while others might be moved from a GPR.
2509 4 seems to be a reasonable compromise in practice. */
2510 4, /* scalar_to_vec_cost */
2511 4, /* align_load_cost */
2512 4, /* unalign_load_cost */
2513 /* Although stores have a latency of 2 and compete for the
2514 vector pipes, in practice it's better not to model that. */
2515 1, /* unalign_store_cost */
2516 1 /* store_cost */
2519 static const sve_vec_cost neoversev2_sve_vector_cost =
2522 2, /* int_stmt_cost */
2523 2, /* fp_stmt_cost */
2524 3, /* ld2_st2_permute_cost */
2525 3, /* ld3_st3_permute_cost */
2526 4, /* ld4_st4_permute_cost */
2527 3, /* permute_cost */
2528 /* Theoretically, a reduction involving 15 scalar ADDs could
2529 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2530 completes in 11 cycles, so give it a cost of 15 + 8. */
2531 21, /* reduc_i8_cost */
2532 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2533 14, /* reduc_i16_cost */
2534 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2535 7, /* reduc_i32_cost */
2536 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2537 2, /* reduc_i64_cost */
2538 /* Theoretically, a reduction involving 7 scalar FADDs could
2539 complete in ~6 cycles and would have a cost of 14. FADDV
2540 completes in 8 cycles, so give it a cost of 14 + 2. */
2541 16, /* reduc_f16_cost */
2542 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2543 8, /* reduc_f32_cost */
2544 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2545 4, /* reduc_f64_cost */
2546 2, /* store_elt_extra_cost */
2547 /* This value is just inherited from the Cortex-A57 table. */
2548 8, /* vec_to_scalar_cost */
2549 /* See the comment above the Advanced SIMD versions. */
2550 4, /* scalar_to_vec_cost */
2551 4, /* align_load_cost */
2552 4, /* unalign_load_cost */
2553 /* Although stores have a latency of 2 and compete for the
2554 vector pipes, in practice it's better not to model that. */
2555 1, /* unalign_store_cost */
2556 1 /* store_cost */
2558 3, /* clast_cost */
2559 10, /* fadda_f16_cost */
2560 6, /* fadda_f32_cost */
2561 4, /* fadda_f64_cost */
2562 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2563 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2564 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2565 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2566 (cost 2) to that, to avoid the difference being lost in rounding.
2568 There is no easy comparison between a strided Advanced SIMD x32 load
2569 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2570 operation more than a 64-bit gather. */
2571 14, /* gather_load_x32_cost */
2572 12, /* gather_load_x64_cost */
2573 3 /* scatter_store_elt_cost */
2576 static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
2578 3, /* loads_stores_per_cycle */
2579 2, /* stores_per_cycle */
2580 6, /* general_ops_per_cycle */
2581 0, /* fp_simd_load_general_ops */
2582 1 /* fp_simd_store_general_ops */
2585 static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
2588 3, /* loads_stores_per_cycle */
2589 2, /* stores_per_cycle */
2590 4, /* general_ops_per_cycle */
2591 0, /* fp_simd_load_general_ops */
2592 1 /* fp_simd_store_general_ops */
2594 2, /* ld2_st2_general_ops */
2595 2, /* ld3_st3_general_ops */
2596 3 /* ld4_st4_general_ops */
2599 static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
2603 3, /* loads_per_cycle */
2604 2, /* stores_per_cycle */
2605 4, /* general_ops_per_cycle */
2606 0, /* fp_simd_load_general_ops */
2607 1 /* fp_simd_store_general_ops */
2609 2, /* ld2_st2_general_ops */
2610 3, /* ld3_st3_general_ops */
2611 3 /* ld4_st4_general_ops */
2613 2, /* pred_ops_per_cycle */
2614 2, /* while_pred_ops */
2615 2, /* int_cmp_pred_ops */
2616 1, /* fp_cmp_pred_ops */
2617 1, /* gather_scatter_pair_general_ops */
2618 1 /* gather_scatter_pair_pred_ops */
2621 static const aarch64_vec_issue_info neoversev2_vec_issue_info =
2623 &neoversev2_scalar_issue_info,
2624 &neoversev2_advsimd_issue_info,
2625 &neoversev2_sve_issue_info
2628 /* Neoverse V2 (Demeter) costs for vector insn classes. */
2629 static const struct cpu_vector_cost neoversev2_vector_cost =
2631 1, /* scalar_int_stmt_cost */
2632 2, /* scalar_fp_stmt_cost */
2633 4, /* scalar_load_cost */
2634 1, /* scalar_store_cost */
2635 1, /* cond_taken_branch_cost */
2636 1, /* cond_not_taken_branch_cost */
2637 &neoversev2_advsimd_vector_cost, /* advsimd */
2638 &neoversev2_sve_vector_cost, /* sve */
2639 &neoversev2_vec_issue_info /* issue_info */
2642 static const struct tune_params neoversev2_tunings =
2644 &cortexa76_extra_costs,
2645 &neoversev2_addrcost_table,
2646 &neoversev2_regmove_cost,
2647 &neoversev2_vector_cost,
2648 &generic_branch_cost,
2649 &generic_approx_modes,
2650 SVE_128, /* sve_width */
2651 { 4, /* load_int. */
2652 2, /* store_int. */
2653 6, /* load_fp. */
2654 1, /* store_fp. */
2655 6, /* load_pred. */
2656 2 /* store_pred. */
2657 }, /* memmov_cost. */
2658 5, /* issue_rate */
2659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2660 "32:16", /* function_align. */
2661 "4", /* jump_align. */
2662 "32:16", /* loop_align. */
2663 3, /* int_reassoc_width. */
2664 6, /* fp_reassoc_width. */
2665 4, /* fma_reassoc_width. */
2666 3, /* vec_reassoc_width. */
2667 2, /* min_div_recip_mul_sf. */
2668 2, /* min_div_recip_mul_df. */
2669 0, /* max_case_values. */
2670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2671 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2672 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2673 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2674 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2675 &generic_prefetch_tune
2678 static const struct tune_params a64fx_tunings =
2680 &a64fx_extra_costs,
2681 &a64fx_addrcost_table,
2682 &a64fx_regmove_cost,
2683 &a64fx_vector_cost,
2684 &generic_branch_cost,
2685 &generic_approx_modes,
2686 SVE_512, /* sve_width */
2687 { 4, /* load_int. */
2688 4, /* store_int. */
2689 4, /* load_fp. */
2690 4, /* store_fp. */
2691 4, /* load_pred. */
2692 4 /* store_pred. */
2693 }, /* memmov_cost. */
2694 7, /* issue_rate */
2695 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2696 "32", /* function_align. */
2697 "16", /* jump_align. */
2698 "32", /* loop_align. */
2699 4, /* int_reassoc_width. */
2700 2, /* fp_reassoc_width. */
2701 1, /* fma_reassoc_width. */
2702 2, /* vec_reassoc_width. */
2703 2, /* min_div_recip_mul_sf. */
2704 2, /* min_div_recip_mul_df. */
2705 0, /* max_case_values. */
2706 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2707 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2708 &a64fx_prefetch_tune
2711 /* Support for fine-grained override of the tuning structures. */
2712 struct aarch64_tuning_override_function
2714 const char* name;
2715 void (*parse_override)(const char*, struct tune_params*);
2718 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2719 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2720 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2722 static const struct aarch64_tuning_override_function
2723 aarch64_tuning_override_functions[] =
2725 { "fuse", aarch64_parse_fuse_string },
2726 { "tune", aarch64_parse_tune_string },
2727 { "sve_width", aarch64_parse_sve_width_string },
2728 { NULL, NULL }
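/* These entries back the -moverride option string: each name=value pair in
   the string is looked up in this table and handed to its parser, so an
   override such as "sve_width=256" would be routed to
   aarch64_parse_sve_width_string.  (Illustrative example only; see the
   option documentation for the full syntax.)  */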
2731 /* A processor implementing AArch64. */
2732 struct processor
2734 const char *name;
2735 aarch64_processor ident;
2736 aarch64_processor sched_core;
2737 aarch64_arch arch;
2738 aarch64_feature_flags flags;
2739 const tune_params *tune;
2742 /* Architectures implementing AArch64. */
2743 static CONSTEXPR const processor all_architectures[] =
2745 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
2746 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
2747 feature_deps::ARCH_IDENT ().enable, NULL},
2748 #include "aarch64-arches.def"
2749 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2752 /* Processor cores implementing AArch64. */
2753 static const struct processor all_cores[] =
2755 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
2756 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2757 feature_deps::cpu_##IDENT, &COSTS##_tunings},
2758 #include "aarch64-cores.def"
2759 {"generic", generic, cortexa53, AARCH64_ARCH_V8A,
2760 feature_deps::V8A ().enable, &generic_tunings},
2761 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2764 /* The current tuning set. */
2765 struct tune_params aarch64_tune_params = generic_tunings;
2767 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2769 static tree
2770 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2771 int, bool *no_add_attrs)
2773 /* Since we set fn_type_req to true, the caller should have checked
2774 this for us. */
2775 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2776 switch ((arm_pcs) fntype_abi (*node).id ())
2778 case ARM_PCS_AAPCS64:
2779 case ARM_PCS_SIMD:
2780 return NULL_TREE;
2782 case ARM_PCS_SVE:
2783 error ("the %qE attribute cannot be applied to an SVE function type",
2784 name);
2785 *no_add_attrs = true;
2786 return NULL_TREE;
2788 case ARM_PCS_TLSDESC:
2789 case ARM_PCS_UNKNOWN:
2790 break;
2792 gcc_unreachable ();
2795 /* Table of machine attributes. */
2796 static const struct attribute_spec aarch64_attribute_table[] =
2798 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2799 affects_type_identity, handler, exclude } */
2800 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2801 handle_aarch64_vector_pcs_attribute, NULL },
2802 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2803 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2804 NULL },
2805 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
2806 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
2807 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
2808 { NULL, 0, 0, false, false, false, false, NULL, NULL }
2811 /* An ISA extension in the co-processor and main instruction set space. */
2812 struct aarch64_option_extension
2814 const char *const name;
2815 const unsigned long flags_on;
2816 const unsigned long flags_off;
2819 typedef enum aarch64_cond_code
2821 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2822 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2823 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2825 aarch64_cc;
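/* The inverse of condition code X.  Flipping the low bit works because the
   aarch64_cond_code enumeration lists each condition next to its inverse
   (EQ/NE, CS/CC, ...); AL and NV have no meaningful inverse.  */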
2827 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
2829 struct aarch64_branch_protect_type
2831 /* The type's name that the user passes to the branch-protection option
2832 string. */
2833 const char* name;
2834 /* Function to handle the protection type and set global variables.
2835 The first argument is the string token corresponding to this type and the
2836 second argument is the next token in the option string.
2837 Return values:
2838 * AARCH64_PARSE_OK: Handling was successful.
2839 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
2840 should print an error.
2841 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
2842 own error. */
2843 enum aarch64_parse_opt_result (*handler)(char*, char*);
2844 /* A list of types that can follow this type in the option string. */
2845 const aarch64_branch_protect_type* subtypes;
2846 unsigned int num_subtypes;
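/* Handler for the "none" branch-protection type: disable return-address
   signing and BTI, and reject any trailing token in REST.  */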
2849 static enum aarch64_parse_opt_result
2850 aarch64_handle_no_branch_protection (char* str, char* rest)
2852 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2853 aarch64_enable_bti = 0;
2854 if (rest)
2856 error ("unexpected %<%s%> after %<%s%>", rest, str);
2857 return AARCH64_PARSE_INVALID_FEATURE;
2859 return AARCH64_PARSE_OK;
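/* Handler for the "standard" branch-protection type: sign the return
   addresses of non-leaf functions with the A key and enable BTI, rejecting
   any trailing token in REST.  */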
2862 static enum aarch64_parse_opt_result
2863 aarch64_handle_standard_branch_protection (char* str, char* rest)
2865 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2866 aarch64_ra_sign_key = AARCH64_KEY_A;
2867 aarch64_enable_bti = 1;
2868 if (rest)
2870 error ("unexpected %<%s%> after %<%s%>", rest, str);
2871 return AARCH64_PARSE_INVALID_FEATURE;
2873 return AARCH64_PARSE_OK;
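/* Handler for the "pac-ret" branch-protection type: sign the return
   addresses of non-leaf functions with the A key.  */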
2876 static enum aarch64_parse_opt_result
2877 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2878 char* rest ATTRIBUTE_UNUSED)
2880 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2881 aarch64_ra_sign_key = AARCH64_KEY_A;
2882 return AARCH64_PARSE_OK;
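/* Handler for the "leaf" subtype of "pac-ret": extend return-address
   signing to all functions, including leaf functions.  */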
2885 static enum aarch64_parse_opt_result
2886 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2887 char* rest ATTRIBUTE_UNUSED)
2889 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2890 return AARCH64_PARSE_OK;
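/* Handler for the "b-key" subtype of "pac-ret": sign return addresses with
   the B key instead of the A key.  */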
2893 static enum aarch64_parse_opt_result
2894 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2895 char* rest ATTRIBUTE_UNUSED)
2897 aarch64_ra_sign_key = AARCH64_KEY_B;
2898 return AARCH64_PARSE_OK;
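/* Handler for the "bti" branch-protection type: enable Branch Target
   Identification.  */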
2901 static enum aarch64_parse_opt_result
2902 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2903 char* rest ATTRIBUTE_UNUSED)
2905 aarch64_enable_bti = 1;
2906 return AARCH64_PARSE_OK;
2909 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2910 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2911 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2912 { NULL, NULL, NULL, 0 }
2915 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2916 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2917 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2918 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2919 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2920 { "bti", aarch64_handle_bti_protection, NULL, 0 },
2921 { NULL, NULL, NULL, 0 }
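/* For example, a branch-protection string such as "pac-ret+leaf" is
   dispatched first to aarch64_handle_pac_ret_protection and then to the
   "leaf" handler in aarch64_pac_ret_subtypes.  (Illustrative only; the
   option documentation describes the full grammar.)  */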
2924 /* The condition codes of the processor, and the inverse function. */
2925 static const char * const aarch64_condition_codes[] =
2927 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2928 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2931 /* The preferred condition codes for SVE conditions. */
2932 static const char *const aarch64_sve_condition_codes[] =
2934 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2935 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2938 /* Return the assembly token for svpattern value VALUE. */
2940 static const char *
2941 svpattern_token (enum aarch64_svpattern pattern)
2943 switch (pattern)
2945 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2946 AARCH64_FOR_SVPATTERN (CASE)
2947 #undef CASE
2948 case AARCH64_NUM_SVPATTERNS:
2949 break;
2951 gcc_unreachable ();
2954 /* Return the location of a piece that is known to be passed or returned
2955 in registers. FIRST_ZR is the first unused vector argument register
2956 and FIRST_PR is the first unused predicate argument register. */
2958 rtx
2959 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2960 unsigned int first_pr) const
2962 gcc_assert (VECTOR_MODE_P (mode)
2963 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2964 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2966 if (num_zr > 0 && num_pr == 0)
2967 return gen_rtx_REG (mode, first_zr);
2969 if (num_zr == 0 && num_pr == 1)
2970 return gen_rtx_REG (mode, first_pr);
2972 gcc_unreachable ();
2975 /* Return the total number of vector registers required by the PST. */
2977 unsigned int
2978 pure_scalable_type_info::num_zr () const
2980 unsigned int res = 0;
2981 for (unsigned int i = 0; i < pieces.length (); ++i)
2982 res += pieces[i].num_zr;
2983 return res;
2986 /* Return the total number of predicate registers required by the PST. */
2988 unsigned int
2989 pure_scalable_type_info::num_pr () const
2991 unsigned int res = 0;
2992 for (unsigned int i = 0; i < pieces.length (); ++i)
2993 res += pieces[i].num_pr;
2994 return res;
2997 /* Return the location of a PST that is known to be passed or returned
2998 in registers. FIRST_ZR is the first unused vector argument register
2999 and FIRST_PR is the first unused predicate argument register. */
3001 rtx
3002 pure_scalable_type_info::get_rtx (machine_mode mode,
3003 unsigned int first_zr,
3004 unsigned int first_pr) const
3006 /* Try to return a single REG if possible. This leads to better
3007 code generation; it isn't required for correctness. */
3008 if (mode == pieces[0].mode)
3010 gcc_assert (pieces.length () == 1);
3011 return pieces[0].get_rtx (first_zr, first_pr);
3014 /* Build up a PARALLEL that contains the individual pieces. */
3015 rtvec rtxes = rtvec_alloc (pieces.length ());
3016 for (unsigned int i = 0; i < pieces.length (); ++i)
3018 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
3019 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
3020 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
3021 first_zr += pieces[i].num_zr;
3022 first_pr += pieces[i].num_pr;
3024 return gen_rtx_PARALLEL (mode, rtxes);
3027 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
3028 in the AAPCS64. */
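/* As an informal illustration of these rules, a structure such as
     struct pst { svfloat32_t v; svbool_t p; };
   is a Pure Scalable Type (one vector piece plus one predicate piece),
   whereas adding a scalar member such as an int would stop it from
   being one.  */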
3030 pure_scalable_type_info::analysis_result
3031 pure_scalable_type_info::analyze (const_tree type)
3033 /* Prevent accidental reuse. */
3034 gcc_assert (pieces.is_empty ());
3036 /* No code will be generated for erroneous types, so we won't establish
3037 an ABI mapping. */
3038 if (type == error_mark_node)
3039 return NO_ABI_IDENTITY;
3041 /* Zero-sized types disappear in the language->ABI mapping. */
3042 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3043 return NO_ABI_IDENTITY;
3045 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
3046 piece p = {};
3047 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
3049 machine_mode mode = TYPE_MODE_RAW (type);
3050 gcc_assert (VECTOR_MODE_P (mode)
3051 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
3053 p.mode = p.orig_mode = mode;
3054 add_piece (p);
3055 return IS_PST;
3058 /* Check for user-defined PSTs. */
3059 if (TREE_CODE (type) == ARRAY_TYPE)
3060 return analyze_array (type);
3061 if (TREE_CODE (type) == RECORD_TYPE)
3062 return analyze_record (type);
3064 return ISNT_PST;
3067 /* Analyze a type that is known not to be passed or returned in memory.
3068 Return true if it has an ABI identity and is a Pure Scalable Type. */
3070 bool
3071 pure_scalable_type_info::analyze_registers (const_tree type)
3073 analysis_result result = analyze (type);
3074 gcc_assert (result != DOESNT_MATTER);
3075 return result == IS_PST;
3078 /* Subroutine of analyze for handling ARRAY_TYPEs. */
3080 pure_scalable_type_info::analysis_result
3081 pure_scalable_type_info::analyze_array (const_tree type)
3083 /* Analyze the element type. */
3084 pure_scalable_type_info element_info;
3085 analysis_result result = element_info.analyze (TREE_TYPE (type));
3086 if (result != IS_PST)
3087 return result;
3089 /* An array of unknown, flexible or variable length will be passed and
3090 returned by reference whatever we do. */
3091 tree nelts_minus_one = array_type_nelts (type);
3092 if (!tree_fits_uhwi_p (nelts_minus_one))
3093 return DOESNT_MATTER;
3095 /* Likewise if the array is constant-sized but too big to be interesting.
3096 The double checks against MAX_PIECES are to protect against overflow. */
3097 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
3098 if (count > MAX_PIECES)
3099 return DOESNT_MATTER;
3100 count += 1;
3101 if (count * element_info.pieces.length () > MAX_PIECES)
3102 return DOESNT_MATTER;
3104 /* The above checks should have weeded out elements of unknown size. */
3105 poly_uint64 element_bytes;
3106 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3107 gcc_unreachable ();
3109 /* Build up the list of individual vectors and predicates. */
3110 gcc_assert (!element_info.pieces.is_empty ());
3111 for (unsigned int i = 0; i < count; ++i)
3112 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3114 piece p = element_info.pieces[j];
3115 p.offset += i * element_bytes;
3116 add_piece (p);
3118 return IS_PST;
3121 /* Subroutine of analyze for handling RECORD_TYPEs. */
3123 pure_scalable_type_info::analysis_result
3124 pure_scalable_type_info::analyze_record (const_tree type)
3126 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3128 if (TREE_CODE (field) != FIELD_DECL)
3129 continue;
3131 /* Zero-sized fields disappear in the language->ABI mapping. */
3132 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3133 continue;
3135 /* All fields with an ABI identity must be PSTs for the record as
3136 a whole to be a PST. If any individual field is too big to be
3137 interesting then the record is too. */
3138 pure_scalable_type_info field_info;
3139 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3140 if (subresult == NO_ABI_IDENTITY)
3141 continue;
3142 if (subresult != IS_PST)
3143 return subresult;
3145 /* Since all previous fields are PSTs, we ought to be able to track
3146 the field offset using poly_ints. */
3147 tree bitpos = bit_position (field);
3148 gcc_assert (poly_int_tree_p (bitpos));
3150 /* For the same reason, it shouldn't be possible to create a PST field
3151 whose offset isn't byte-aligned. */
3152 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3153 BITS_PER_UNIT);
3155 /* Punt if the record is too big to be interesting. */
3156 poly_uint64 bytepos;
3157 if (!wide_bytepos.to_uhwi (&bytepos)
3158 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3159 return DOESNT_MATTER;
3161 /* Add the individual vectors and predicates in the field to the
3162 record's list. */
3163 gcc_assert (!field_info.pieces.is_empty ());
3164 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3166 piece p = field_info.pieces[i];
3167 p.offset += bytepos;
3168 add_piece (p);
3171 /* Empty structures disappear in the language->ABI mapping. */
3172 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3175 /* Add P to the list of pieces in the type. */
3177 void
3178 pure_scalable_type_info::add_piece (const piece &p)
3180 /* Try to fold the new piece into the previous one to form a
3181 single-mode PST. For example, if we see three consecutive vectors
3182 of the same mode, we can represent them using the corresponding
3183 3-tuple mode.
3185 This is purely an optimization. */
3186 if (!pieces.is_empty ())
3188 piece &prev = pieces.last ();
3189 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3190 unsigned int nelems1, nelems2;
3191 if (prev.orig_mode == p.orig_mode
3192 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3193 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3194 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3195 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3196 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3197 && targetm.array_mode (p.orig_mode,
3198 nelems1 + nelems2).exists (&prev.mode))
3200 prev.num_zr += p.num_zr;
3201 prev.num_pr += p.num_pr;
3202 return;
3205 pieces.quick_push (p);
3208 /* Return true if at least one possible value of type TYPE includes at
3209 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3211 This is a relatively expensive test for some types, so it should
3212 generally be made as late as possible. */
3214 static bool
3215 aarch64_some_values_include_pst_objects_p (const_tree type)
3217 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3218 return false;
3220 if (aarch64_sve::builtin_type_p (type))
3221 return true;
3223 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3224 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3226 if (RECORD_OR_UNION_TYPE_P (type))
3227 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3228 if (TREE_CODE (field) == FIELD_DECL
3229 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3230 return true;
3232 return false;
3235 /* Return the descriptor of the SIMD ABI. */
3237 static const predefined_function_abi &
3238 aarch64_simd_abi (void)
3240 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3241 if (!simd_abi.initialized_p ())
3243 HARD_REG_SET full_reg_clobbers
3244 = default_function_abi.full_reg_clobbers ();
3245 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3246 if (FP_SIMD_SAVED_REGNUM_P (regno))
3247 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3248 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3250 return simd_abi;
3253 /* Return the descriptor of the SVE PCS. */
3255 static const predefined_function_abi &
3256 aarch64_sve_abi (void)
3258 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3259 if (!sve_abi.initialized_p ())
3261 HARD_REG_SET full_reg_clobbers
3262 = default_function_abi.full_reg_clobbers ();
3263 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3264 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3265 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3266 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3267 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3269 return sve_abi;
3272 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3273 wraps, otherwise return X itself. */
3275 static rtx
3276 strip_salt (rtx x)
3278 rtx search = x;
3279 if (GET_CODE (search) == CONST)
3280 search = XEXP (search, 0);
3281 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3282 x = XVECEXP (search, 0, 0);
3283 return x;
3286 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3287 expression. */
3289 static rtx
3290 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3292 return strip_salt (strip_offset (addr, offset));
3295 /* Generate code to enable conditional branches in functions over 1 MiB. */
3296 const char *
3297 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3298 const char * branch_format)
3300 rtx_code_label * tmp_label = gen_label_rtx ();
3301 char label_buf[256];
3302 char buffer[128];
3303 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3304 CODE_LABEL_NUMBER (tmp_label));
3305 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3306 rtx dest_label = operands[pos_label];
3307 operands[pos_label] = tmp_label;
3309 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3310 output_asm_insn (buffer, operands);
3312 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3313 operands[pos_label] = dest_label;
3314 output_asm_insn (buffer, operands);
3315 return "";
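/* Report an error for an attempt to use floating-point or vector types of
   mode MODE when FP/Advanced SIMD is unavailable, attributing the problem
   to -mgeneral-regs-only or to the +nofp feature modifier as
   appropriate.  */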
3318 void
3319 aarch64_err_no_fpadvsimd (machine_mode mode)
3321 if (TARGET_GENERAL_REGS_ONLY)
3322 if (FLOAT_MODE_P (mode))
3323 error ("%qs is incompatible with the use of floating-point types",
3324 "-mgeneral-regs-only");
3325 else
3326 error ("%qs is incompatible with the use of vector types",
3327 "-mgeneral-regs-only");
3328 else
3329 if (FLOAT_MODE_P (mode))
3330 error ("%qs feature modifier is incompatible with the use of"
3331 " floating-point types", "+nofp");
3332 else
3333 error ("%qs feature modifier is incompatible with the use of"
3334 " vector types", "+nofp");
3337 /* Report when we try to do something that requires SVE when SVE is disabled.
3338 This is an error of last resort and isn't very high-quality. It usually
3339 involves attempts to measure the vector length in some way. */
3340 static void
3341 aarch64_report_sve_required (void)
3343 static bool reported_p = false;
3345 /* Avoid reporting a slew of messages for a single oversight. */
3346 if (reported_p)
3347 return;
3349 error ("this operation requires the SVE ISA extension");
3350 inform (input_location, "you can enable SVE using the command-line"
3351 " option %<-march%>, or by using the %<target%>"
3352 " attribute or pragma");
3353 reported_p = true;
3356 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3357 registers. */
3358 inline bool
3359 pr_or_ffr_regnum_p (unsigned int regno)
3361 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3364 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3365 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3366 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3367 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3368 and GENERAL_REGS is lower than the memory cost (in this case the best class
3369 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
3370 cost results in bad allocations with many redundant int<->FP moves which
3371 are expensive on various cores.
3372 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3373 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3374 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3375 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
3376 The result of this is that it is no longer inefficient to have a higher
3377 memory move cost than the register move cost.  */
3380 static reg_class_t
3381 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3382 reg_class_t best_class)
3384 machine_mode mode;
3386 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3387 || !reg_class_subset_p (FP_REGS, allocno_class))
3388 return allocno_class;
3390 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3391 || !reg_class_subset_p (FP_REGS, best_class))
3392 return best_class;
3394 mode = PSEUDO_REGNO_MODE (regno);
3395 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
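/* Return the minimum number of divisions by the same divisor that must be
   seen before a reciprocal-multiply sequence becomes worthwhile, taken from
   the current tuning and keyed on the element size of MODE.  */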
3398 static unsigned int
3399 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3401 if (GET_MODE_UNIT_SIZE (mode) == 4)
3402 return aarch64_tune_params.min_div_recip_mul_sf;
3403 return aarch64_tune_params.min_div_recip_mul_df;
3406 /* Return the reassociation width of treeop OPC with mode MODE. */
3407 static int
3408 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3410 if (VECTOR_MODE_P (mode))
3411 return aarch64_tune_params.vec_reassoc_width;
3412 if (INTEGRAL_MODE_P (mode))
3413 return aarch64_tune_params.int_reassoc_width;
3414 /* Reassociation reduces the number of FMAs which may result in worse
3415 performance. Use a per-CPU setting for FMA reassociation which allows
3416 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
3417 CPUs with many FP pipes to enable reassociation.
3418 Since the reassociation pass doesn't understand FMA at all, assume
3419 that any FP addition might turn into FMA. */
3420 if (FLOAT_MODE_P (mode))
3421 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
3422 : aarch64_tune_params.fp_reassoc_width;
3423 return 1;
3426 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
3427 unsigned
3428 aarch64_debugger_regno (unsigned regno)
3430 if (GP_REGNUM_P (regno))
3431 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3432 else if (regno == SP_REGNUM)
3433 return AARCH64_DWARF_SP;
3434 else if (FP_REGNUM_P (regno))
3435 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3436 else if (PR_REGNUM_P (regno))
3437 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3438 else if (regno == VG_REGNUM)
3439 return AARCH64_DWARF_VG;
3441 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3442 equivalent DWARF register. */
3443 return DWARF_FRAME_REGISTERS;
3446 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3447 integer, otherwise return X unmodified. */
3448 static rtx
3449 aarch64_bit_representation (rtx x)
3451 if (CONST_DOUBLE_P (x))
3452 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3453 return x;
3456 /* Return an estimate for the number of quadwords in an SVE vector. This is
3457 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3458 static unsigned int
3459 aarch64_estimated_sve_vq ()
3461 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3464 /* Return true if MODE is an SVE predicate mode. */
3465 static bool
3466 aarch64_sve_pred_mode_p (machine_mode mode)
3468 return (TARGET_SVE
3469 && (mode == VNx16BImode
3470 || mode == VNx8BImode
3471 || mode == VNx4BImode
3472 || mode == VNx2BImode));
3475 /* Three mutually-exclusive flags describing a vector or predicate type. */
3476 const unsigned int VEC_ADVSIMD = 1;
3477 const unsigned int VEC_SVE_DATA = 2;
3478 const unsigned int VEC_SVE_PRED = 4;
3479 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3480 a structure of 2, 3 or 4 vectors. */
3481 const unsigned int VEC_STRUCT = 8;
3482 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3483 vector has fewer significant bytes than a full SVE vector. */
3484 const unsigned int VEC_PARTIAL = 16;
3485 /* Useful combinations of the above. */
3486 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3487 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3489 /* Return a set of flags describing the vector properties of mode MODE.
3490 Ignore modes that are not supported by the current target. */
3491 static unsigned int
3492 aarch64_classify_vector_mode (machine_mode mode)
3494 if (aarch64_sve_pred_mode_p (mode))
3495 return VEC_SVE_PRED;
3497 /* Make the decision based on the mode's enum value rather than its
3498 properties, so that we keep the correct classification regardless
3499 of -msve-vector-bits. */
3500 switch (mode)
3502 /* Partial SVE QI vectors. */
3503 case E_VNx2QImode:
3504 case E_VNx4QImode:
3505 case E_VNx8QImode:
3506 /* Partial SVE HI vectors. */
3507 case E_VNx2HImode:
3508 case E_VNx4HImode:
3509 /* Partial SVE SI vector. */
3510 case E_VNx2SImode:
3511 /* Partial SVE HF vectors. */
3512 case E_VNx2HFmode:
3513 case E_VNx4HFmode:
3514 /* Partial SVE BF vectors. */
3515 case E_VNx2BFmode:
3516 case E_VNx4BFmode:
3517 /* Partial SVE SF vector. */
3518 case E_VNx2SFmode:
3519 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3521 case E_VNx16QImode:
3522 case E_VNx8HImode:
3523 case E_VNx4SImode:
3524 case E_VNx2DImode:
3525 case E_VNx8BFmode:
3526 case E_VNx8HFmode:
3527 case E_VNx4SFmode:
3528 case E_VNx2DFmode:
3529 return TARGET_SVE ? VEC_SVE_DATA : 0;
3531 /* x2 SVE vectors. */
3532 case E_VNx32QImode:
3533 case E_VNx16HImode:
3534 case E_VNx8SImode:
3535 case E_VNx4DImode:
3536 case E_VNx16BFmode:
3537 case E_VNx16HFmode:
3538 case E_VNx8SFmode:
3539 case E_VNx4DFmode:
3540 /* x3 SVE vectors. */
3541 case E_VNx48QImode:
3542 case E_VNx24HImode:
3543 case E_VNx12SImode:
3544 case E_VNx6DImode:
3545 case E_VNx24BFmode:
3546 case E_VNx24HFmode:
3547 case E_VNx12SFmode:
3548 case E_VNx6DFmode:
3549 /* x4 SVE vectors. */
3550 case E_VNx64QImode:
3551 case E_VNx32HImode:
3552 case E_VNx16SImode:
3553 case E_VNx8DImode:
3554 case E_VNx32BFmode:
3555 case E_VNx32HFmode:
3556 case E_VNx16SFmode:
3557 case E_VNx8DFmode:
3558 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3560 case E_OImode:
3561 case E_CImode:
3562 case E_XImode:
3563 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3565 /* Structures of 64-bit Advanced SIMD vectors. */
3566 case E_V2x8QImode:
3567 case E_V2x4HImode:
3568 case E_V2x2SImode:
3569 case E_V2x1DImode:
3570 case E_V2x4BFmode:
3571 case E_V2x4HFmode:
3572 case E_V2x2SFmode:
3573 case E_V2x1DFmode:
3574 case E_V3x8QImode:
3575 case E_V3x4HImode:
3576 case E_V3x2SImode:
3577 case E_V3x1DImode:
3578 case E_V3x4BFmode:
3579 case E_V3x4HFmode:
3580 case E_V3x2SFmode:
3581 case E_V3x1DFmode:
3582 case E_V4x8QImode:
3583 case E_V4x4HImode:
3584 case E_V4x2SImode:
3585 case E_V4x1DImode:
3586 case E_V4x4BFmode:
3587 case E_V4x4HFmode:
3588 case E_V4x2SFmode:
3589 case E_V4x1DFmode:
3590 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3592 /* Structures of 128-bit Advanced SIMD vectors. */
3593 case E_V2x16QImode:
3594 case E_V2x8HImode:
3595 case E_V2x4SImode:
3596 case E_V2x2DImode:
3597 case E_V2x8BFmode:
3598 case E_V2x8HFmode:
3599 case E_V2x4SFmode:
3600 case E_V2x2DFmode:
3601 case E_V3x16QImode:
3602 case E_V3x8HImode:
3603 case E_V3x4SImode:
3604 case E_V3x2DImode:
3605 case E_V3x8BFmode:
3606 case E_V3x8HFmode:
3607 case E_V3x4SFmode:
3608 case E_V3x2DFmode:
3609 case E_V4x16QImode:
3610 case E_V4x8HImode:
3611 case E_V4x4SImode:
3612 case E_V4x2DImode:
3613 case E_V4x8BFmode:
3614 case E_V4x8HFmode:
3615 case E_V4x4SFmode:
3616 case E_V4x2DFmode:
3617 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3619 /* 64-bit Advanced SIMD vectors. */
3620 case E_V8QImode:
3621 case E_V4HImode:
3622 case E_V2SImode:
3623 case E_V1DImode:
3624 case E_V4HFmode:
3625 case E_V4BFmode:
3626 case E_V2SFmode:
3627 case E_V1DFmode:
3628 /* 128-bit Advanced SIMD vectors. */
3629 case E_V16QImode:
3630 case E_V8HImode:
3631 case E_V4SImode:
3632 case E_V2DImode:
3633 case E_V8HFmode:
3634 case E_V8BFmode:
3635 case E_V4SFmode:
3636 case E_V2DFmode:
3637 return TARGET_FLOAT ? VEC_ADVSIMD : 0;
3639 default:
3640 return 0;
3644 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3645 bool
3646 aarch64_advsimd_struct_mode_p (machine_mode mode)
3648 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3649 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3652 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3653 static bool
3654 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3656 return (aarch64_classify_vector_mode (mode)
3657 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3660 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3661 static bool
3662 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3664 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3667 /* Return true if MODE is any of the data vector modes, including
3668 structure modes. */
3669 static bool
3670 aarch64_vector_data_mode_p (machine_mode mode)
3672 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3675 /* Return true if MODE is any form of SVE mode, including predicates,
3676 vectors and structures. */
3677 bool
3678 aarch64_sve_mode_p (machine_mode mode)
3680 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3683 /* Return true if MODE is an SVE data vector mode; either a single vector
3684 or a structure of vectors. */
3685 static bool
3686 aarch64_sve_data_mode_p (machine_mode mode)
3688 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3691 /* Return the number of defined bytes in one constituent vector of
3692 SVE mode MODE, which has vector flags VEC_FLAGS. */
3693 static poly_int64
3694 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3696 if (vec_flags & VEC_PARTIAL)
3697 /* A single partial vector. */
3698 return GET_MODE_SIZE (mode);
3700 if (vec_flags & VEC_SVE_DATA)
3701 /* A single vector or a tuple. */
3702 return BYTES_PER_SVE_VECTOR;
3704 /* A single predicate. */
3705 gcc_assert (vec_flags & VEC_SVE_PRED);
3706 return BYTES_PER_SVE_PRED;
3709 /* If MODE holds an array of vectors, return the number of vectors
3710 in the array, otherwise return 1. */
3712 static unsigned int
3713 aarch64_ldn_stn_vectors (machine_mode mode)
3715 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3716 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3717 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3718 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3719 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3720 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3721 return exact_div (GET_MODE_SIZE (mode),
3722 BYTES_PER_SVE_VECTOR).to_constant ();
3723 return 1;
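/* For example, V3x4SImode is a structure of three 128-bit vectors, so
   GET_MODE_SIZE is 48 bytes and the function returns 48 / 16 = 3, the
   register count of a Q-register LD3/ST3.  V4x8QImode is a structure of
   four 64-bit vectors, giving 32 / 8 = 4, while a single V4SImode is not
   a structure mode and yields 1.  */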
3726 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3727 corresponding vector structure mode. */
3728 static opt_machine_mode
3729 aarch64_advsimd_vector_array_mode (machine_mode mode,
3730 unsigned HOST_WIDE_INT nelems)
3732 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3733 if (known_eq (GET_MODE_SIZE (mode), 8))
3734 flags |= VEC_PARTIAL;
3736 machine_mode struct_mode;
3737 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3738 if (aarch64_classify_vector_mode (struct_mode) == flags
3739 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3740 && known_eq (GET_MODE_NUNITS (struct_mode),
3741 GET_MODE_NUNITS (mode) * nelems))
3742 return struct_mode;
3743 return opt_machine_mode ();
3746 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3748 opt_machine_mode
3749 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3751 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3752 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3753 machine_mode mode;
3754 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3755 if (inner_mode == GET_MODE_INNER (mode)
3756 && known_eq (nunits, GET_MODE_NUNITS (mode))
3757 && aarch64_sve_data_mode_p (mode))
3758 return mode;
3759 return opt_machine_mode ();
3762 /* Implement target hook TARGET_ARRAY_MODE. */
3763 static opt_machine_mode
3764 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3766 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3767 && IN_RANGE (nelems, 2, 4))
3768 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3769 GET_MODE_NUNITS (mode) * nelems);
3770 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3771 && IN_RANGE (nelems, 2, 4))
3772 return aarch64_advsimd_vector_array_mode (mode, nelems);
3774 return opt_machine_mode ();
3777 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3778 static bool
3779 aarch64_array_mode_supported_p (machine_mode mode,
3780 unsigned HOST_WIDE_INT nelems)
3782 if (TARGET_SIMD
3783 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3784 || AARCH64_VALID_SIMD_DREG_MODE (mode))
3785 && (nelems >= 2 && nelems <= 4))
3786 return true;
3788 return false;
3791 /* MODE is some form of SVE vector mode. For data modes, return the number
3792 of vector register bits that each element of MODE occupies, such as 64
3793 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3794 in a 64-bit container). For predicate modes, return the number of
3795 data bits controlled by each significant predicate bit. */
3797 static unsigned int
3798 aarch64_sve_container_bits (machine_mode mode)
3800 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3801 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3802 ? BITS_PER_SVE_VECTOR
3803 : GET_MODE_BITSIZE (mode));
3804 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3807 /* Return the SVE predicate mode to use for elements that have
3808 ELEM_NBYTES bytes, if such a mode exists. */
3810 opt_machine_mode
3811 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3813 if (TARGET_SVE)
3815 if (elem_nbytes == 1)
3816 return VNx16BImode;
3817 if (elem_nbytes == 2)
3818 return VNx8BImode;
3819 if (elem_nbytes == 4)
3820 return VNx4BImode;
3821 if (elem_nbytes == 8)
3822 return VNx2BImode;
3824 return opt_machine_mode ();
3827 /* Return the SVE predicate mode that should be used to control
3828 SVE mode MODE. */
3830 machine_mode
3831 aarch64_sve_pred_mode (machine_mode mode)
3833 unsigned int bits = aarch64_sve_container_bits (mode);
3834 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3837 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3839 static opt_machine_mode
3840 aarch64_get_mask_mode (machine_mode mode)
3842 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3843 if (vec_flags & VEC_SVE_DATA)
3844 return aarch64_sve_pred_mode (mode);
3846 return default_get_mask_mode (mode);
3849 /* Return the integer element mode associated with SVE mode MODE. */
3851 static scalar_int_mode
3852 aarch64_sve_element_int_mode (machine_mode mode)
3854 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3855 ? BITS_PER_SVE_VECTOR
3856 : GET_MODE_BITSIZE (mode));
3857 unsigned int elt_bits = vector_element_size (vector_bits,
3858 GET_MODE_NUNITS (mode));
3859 return int_mode_for_size (elt_bits, 0).require ();
3862 /* Return an integer element mode that contains exactly
3863 aarch64_sve_container_bits (MODE) bits. This is wider than
3864 aarch64_sve_element_int_mode if MODE is a partial vector,
3865 otherwise it's the same. */
3867 static scalar_int_mode
3868 aarch64_sve_container_int_mode (machine_mode mode)
3870 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3873 /* Return the integer vector mode associated with SVE mode MODE.
3874 Unlike related_int_vector_mode, this can handle the case in which
3875 MODE is a predicate (and thus has a different total size). */
3877 machine_mode
3878 aarch64_sve_int_mode (machine_mode mode)
3880 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3881 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3884 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3886 static opt_machine_mode
3887 aarch64_vectorize_related_mode (machine_mode vector_mode,
3888 scalar_mode element_mode,
3889 poly_uint64 nunits)
3891 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3893 /* If we're operating on SVE vectors, try to return an SVE mode. */
3894 poly_uint64 sve_nunits;
3895 if ((vec_flags & VEC_SVE_DATA)
3896 && multiple_p (BYTES_PER_SVE_VECTOR,
3897 GET_MODE_SIZE (element_mode), &sve_nunits))
3899 machine_mode sve_mode;
3900 if (maybe_ne (nunits, 0U))
3902 /* Try to find a full or partial SVE mode with exactly
3903 NUNITS units. */
3904 if (multiple_p (sve_nunits, nunits)
3905 && aarch64_sve_data_mode (element_mode,
3906 nunits).exists (&sve_mode))
3907 return sve_mode;
3909 else
3911 /* Take the preferred number of units from the number of bytes
3912 that fit in VECTOR_MODE. We always start by "autodetecting"
3913 a full vector mode with preferred_simd_mode, so vectors
3914 chosen here will also be full vector modes. Then
3915 autovectorize_vector_modes tries smaller starting modes
3916 and thus smaller preferred numbers of units. */
3917 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3918 if (aarch64_sve_data_mode (element_mode,
3919 sve_nunits).exists (&sve_mode))
3920 return sve_mode;
3924 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3925 if (TARGET_SIMD
3926 && (vec_flags & VEC_ADVSIMD)
3927 && known_eq (nunits, 0U)
3928 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3929 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3930 * GET_MODE_NUNITS (vector_mode), 128U))
3932 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3933 if (VECTOR_MODE_P (res))
3934 return res;
3937 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3940 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3941 prefer to use the first arithmetic operand as the else value if
3942 the else value doesn't matter, since that exactly matches the SVE
3943 destructive merging form. For ternary operations we could either
3944 pick the first operand and use FMAD-like instructions or the last
3945 operand and use FMLA-like instructions; the latter seems more
3946 natural. */
3948 static tree
3949 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3951 return nops == 3 ? ops[2] : ops[0];
3954 /* Implement TARGET_HARD_REGNO_NREGS. */
3956 static unsigned int
3957 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3959 /* ??? Logically we should only need to provide a value when
3960 HARD_REGNO_MODE_OK says that the combination is valid,
3961 but at the moment we need to handle all modes. Just ignore
3962 any runtime parts for registers that can't store them. */
3963 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3964 switch (aarch64_regno_regclass (regno))
3966 case FP_REGS:
3967 case FP_LO_REGS:
3968 case FP_LO8_REGS:
3970 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3971 if (vec_flags & VEC_SVE_DATA)
3972 return exact_div (GET_MODE_SIZE (mode),
3973 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3974 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3975 return GET_MODE_SIZE (mode).to_constant () / 8;
3976 return CEIL (lowest_size, UNITS_PER_VREG);
3978 case PR_REGS:
3979 case PR_LO_REGS:
3980 case PR_HI_REGS:
3981 case FFR_REGS:
3982 case PR_AND_FFR_REGS:
3983 return 1;
3984 default:
3985 return CEIL (lowest_size, UNITS_PER_WORD);
3987 gcc_unreachable ();
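/* Illustrative examples: in an FP register, VNx2DImode occupies a single
   register (its size is exactly one SVE vector), V4x2DImode occupies
   CEIL (64, UNITS_PER_VREG) = 4 registers and the partial structure mode
   V4x8QImode occupies 32 / 8 = 4 registers.  In a GP register, TImode
   needs CEIL (16, UNITS_PER_WORD) = 2 registers.  */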
3990 /* Implement TARGET_HARD_REGNO_MODE_OK. */
3992 static bool
3993 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3995 if (mode == V8DImode)
3996 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
3997 && multiple_p (regno - R0_REGNUM, 2);
3999 if (GET_MODE_CLASS (mode) == MODE_CC)
4000 return regno == CC_REGNUM;
4002 if (regno == VG_REGNUM)
4003 /* This must have the same size as _Unwind_Word. */
4004 return mode == DImode;
4006 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4007 if (vec_flags & VEC_SVE_PRED)
4008 return pr_or_ffr_regnum_p (regno);
4010 if (pr_or_ffr_regnum_p (regno))
4011 return false;
4013 if (regno == SP_REGNUM)
4014 /* The purpose of comparing with ptr_mode is to support the
4015 global register variable associated with the stack pointer
4016 register via the syntax of asm ("wsp") in ILP32. */
4017 return mode == Pmode || mode == ptr_mode;
4019 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
4020 return mode == Pmode;
4022 if (GP_REGNUM_P (regno))
4024 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
4025 return false;
4026 if (known_le (GET_MODE_SIZE (mode), 8))
4027 return true;
4028 if (known_le (GET_MODE_SIZE (mode), 16))
4029 return (regno & 1) == 0;
4031 else if (FP_REGNUM_P (regno))
4033 if (vec_flags & VEC_STRUCT)
4034 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
4035 else
4036 return !VECTOR_MODE_P (mode) || vec_flags != 0;
4039 return false;
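/* For example, TImode is allowed in an even-numbered GP register but not
   in an odd one, V8DImode is restricted to even registers in the range
   x0-x23, SVE predicate modes are only valid in predicate or FFR
   registers, and SVE or structure modes are rejected for GP registers
   altogether.  */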
4042 /* Return true if a function with type FNTYPE returns its value in
4043 SVE vector or predicate registers. */
4045 static bool
4046 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
4048 tree return_type = TREE_TYPE (fntype);
4050 pure_scalable_type_info pst_info;
4051 switch (pst_info.analyze (return_type))
4053 case pure_scalable_type_info::IS_PST:
4054 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
4055 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
4057 case pure_scalable_type_info::DOESNT_MATTER:
4058 gcc_assert (aarch64_return_in_memory_1 (return_type));
4059 return false;
4061 case pure_scalable_type_info::NO_ABI_IDENTITY:
4062 case pure_scalable_type_info::ISNT_PST:
4063 return false;
4065 gcc_unreachable ();
4068 /* Return true if a function with type FNTYPE takes arguments in
4069 SVE vector or predicate registers. */
4071 static bool
4072 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
4074 CUMULATIVE_ARGS args_so_far_v;
4075 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
4076 NULL_TREE, 0, true);
4077 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4079 for (tree chain = TYPE_ARG_TYPES (fntype);
4080 chain && chain != void_list_node;
4081 chain = TREE_CHAIN (chain))
4083 tree arg_type = TREE_VALUE (chain);
4084 if (arg_type == error_mark_node)
4085 return false;
4087 function_arg_info arg (arg_type, /*named=*/true);
4088 apply_pass_by_reference_rules (&args_so_far_v, arg);
4089 pure_scalable_type_info pst_info;
4090 if (pst_info.analyze_registers (arg.type))
4092 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4093 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4094 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4095 return true;
4098 targetm.calls.function_arg_advance (args_so_far, arg);
4100 return false;
4103 /* Implement TARGET_FNTYPE_ABI. */
4105 static const predefined_function_abi &
4106 aarch64_fntype_abi (const_tree fntype)
4108 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4109 return aarch64_simd_abi ();
4111 if (aarch64_returns_value_in_sve_regs_p (fntype)
4112 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4113 return aarch64_sve_abi ();
4115 return default_function_abi;
4118 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4120 static bool
4121 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4123 return (aarch64_sve::builtin_type_p (type1)
4124 == aarch64_sve::builtin_type_p (type2));
4127 /* Return true if we should emit CFI for register REGNO. */
4129 static bool
4130 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4132 return (GP_REGNUM_P (regno)
4133 || !default_function_abi.clobbers_full_reg_p (regno));
4136 /* Return the mode we should use to save and restore register REGNO. */
4138 static machine_mode
4139 aarch64_reg_save_mode (unsigned int regno)
4141 if (GP_REGNUM_P (regno))
4142 return DImode;
4144 if (FP_REGNUM_P (regno))
4145 switch (crtl->abi->id ())
4147 case ARM_PCS_AAPCS64:
4148 /* Only the low 64 bits are saved by the base PCS. */
4149 return DFmode;
4151 case ARM_PCS_SIMD:
4152 /* The vector PCS saves the low 128 bits (which is the full
4153 register on non-SVE targets). */
4154 return TFmode;
4156 case ARM_PCS_SVE:
4157 /* Use vectors of DImode for registers that need frame
4158 information, so that the first 64 bytes of the save slot
4159 are always the equivalent of what storing D<n> would give. */
4160 if (aarch64_emit_cfi_for_reg_p (regno))
4161 return VNx2DImode;
4163 /* Use vectors of bytes otherwise, so that the layout is
4164 endian-agnostic, and so that we can use LDR and STR for
4165 big-endian targets. */
4166 return VNx16QImode;
4168 case ARM_PCS_TLSDESC:
4169 case ARM_PCS_UNKNOWN:
4170 break;
4173 if (PR_REGNUM_P (regno))
4174 /* Save the full predicate register. */
4175 return VNx16BImode;
4177 gcc_unreachable ();
4180 /* Implement TARGET_INSN_CALLEE_ABI. */
4182 const predefined_function_abi &
4183 aarch64_insn_callee_abi (const rtx_insn *insn)
4185 rtx pat = PATTERN (insn);
4186 gcc_assert (GET_CODE (pat) == PARALLEL);
4187 rtx unspec = XVECEXP (pat, 0, 1);
4188 gcc_assert (GET_CODE (unspec) == UNSPEC
4189 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4190 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4193 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4194 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4195 clobbers the top 64 bits when restoring the bottom 64 bits. */
4197 static bool
4198 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4199 unsigned int regno,
4200 machine_mode mode)
4202 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4204 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4205 unsigned int nregs = hard_regno_nregs (regno, mode);
4206 if (nregs > 1)
4207 per_register_size = exact_div (per_register_size, nregs);
4208 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4209 return maybe_gt (per_register_size, 16);
4210 return maybe_gt (per_register_size, 8);
4212 return false;
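/* For example, a V4SImode value in v8 is partially clobbered by a call
   under the base PCS, since only the low 64 bits of the register are
   preserved.  Under the vector PCS the low 128 bits are preserved, so
   the same value survives, but an SVE vector that might be wider than
   128 bits is still treated as partially clobbered.  */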
4215 /* Implement REGMODE_NATURAL_SIZE. */
4216 poly_uint64
4217 aarch64_regmode_natural_size (machine_mode mode)
4219 /* The natural size for SVE data modes is one SVE data vector,
4220 and similarly for predicates. We can't independently modify
4221 anything smaller than that. */
4222 /* ??? For now, only do this for variable-width SVE registers.
4223 Doing it for constant-sized registers breaks lower-subreg.cc. */
4224 /* ??? And once that's fixed, we should probably have similar
4225 code for Advanced SIMD. */
4226 if (!aarch64_sve_vg.is_constant ())
4228 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4229 if (vec_flags & VEC_SVE_PRED)
4230 return BYTES_PER_SVE_PRED;
4231 if (vec_flags & VEC_SVE_DATA)
4232 return BYTES_PER_SVE_VECTOR;
4234 return UNITS_PER_WORD;
4237 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
4238 machine_mode
4239 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4240 machine_mode mode)
4242 /* The predicate mode determines which bits are significant and
4243 which are "don't care". Decreasing the number of lanes would
4244 lose data while increasing the number of lanes would make bits
4245 unnecessarily significant. */
4246 if (PR_REGNUM_P (regno))
4247 return mode;
4248 if (known_ge (GET_MODE_SIZE (mode), 4))
4249 return mode;
4250 else
4251 return SImode;
4254 /* Return true if I's bits are consecutive ones from the MSB. */
4255 bool
4256 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4258 return exact_log2 (-i) != HOST_WIDE_INT_M1;
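/* Worked example: for I == 0xffff000000000000, -I is 0x0001000000000000,
   a power of two, so exact_log2 returns 48 and the function returns true.
   For I == 0xffff000000000001, -I is 0x0000ffffffffffff, which is not a
   power of two, so the function returns false.  */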
4261 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4262 that strcpy from constants will be faster. */
4264 static HOST_WIDE_INT
4265 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4267 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4268 return MAX (align, BITS_PER_WORD);
4269 return align;
4272 /* Return true if calls to DECL should be treated as
4273 long-calls (i.e. called via a register). */
4274 static bool
4275 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4277 return false;
4280 /* Return true if calls to symbol-ref SYM should be treated as
4281 long-calls (i.e. called via a register). */
4282 bool
4283 aarch64_is_long_call_p (rtx sym)
4285 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4288 /* Return true if calls to symbol-ref SYM should not go through
4289 plt stubs. */
4291 bool
4292 aarch64_is_noplt_call_p (rtx sym)
4294 const_tree decl = SYMBOL_REF_DECL (sym);
4296 if (flag_pic
4297 && decl
4298 && (!flag_plt
4299 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4300 && !targetm.binds_local_p (decl))
4301 return true;
4303 return false;
4306 /* Emit an insn that's a simple single-set. Both the operands must be
4307 known to be valid. */
4308 inline static rtx_insn *
4309 emit_set_insn (rtx x, rtx y)
4311 return emit_insn (gen_rtx_SET (x, y));
4314 /* X and Y are two things to compare using CODE. Emit the compare insn and
4315 return the rtx for register 0 in the proper mode. */
4317 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4319 machine_mode cmp_mode = GET_MODE (x);
4320 machine_mode cc_mode;
4321 rtx cc_reg;
4323 if (cmp_mode == TImode)
4325 gcc_assert (code == NE);
4327 cc_mode = CCmode;
4328 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4330 rtx x_lo = operand_subword (x, 0, 0, TImode);
4331 rtx y_lo = operand_subword (y, 0, 0, TImode);
4332 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4334 rtx x_hi = operand_subword (x, 1, 0, TImode);
4335 rtx y_hi = operand_subword (y, 1, 0, TImode);
4336 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4337 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4338 GEN_INT (AARCH64_EQ)));
4340 else
4342 cc_mode = SELECT_CC_MODE (code, x, y);
4343 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4344 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4346 return cc_reg;
4349 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4351 static rtx
4352 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4353 machine_mode y_mode)
4355 if (y_mode == E_QImode || y_mode == E_HImode)
4357 if (CONST_INT_P (y))
4359 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4360 y_mode = SImode;
4362 else
4364 rtx t, cc_reg;
4365 machine_mode cc_mode;
4367 t = gen_rtx_ZERO_EXTEND (SImode, y);
4368 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4369 cc_mode = CC_SWPmode;
4370 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4371 emit_set_insn (cc_reg, t);
4372 return cc_reg;
4376 if (!aarch64_plus_operand (y, y_mode))
4377 y = force_reg (y_mode, y);
4379 return aarch64_gen_compare_reg (code, x, y);
4382 /* Consider the operation:
4384 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4386 where:
4388 - CODE is [SU]MAX or [SU]MIN
4389 - OPERANDS[2] and OPERANDS[3] are constant integers
4390 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4391 - all operands have mode MODE
4393 Decide whether it is possible to implement the operation using:
4395 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4397 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4399 followed by:
4401 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4403 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4404 If GENERATE_P is true, also update OPERANDS as follows:
4406 OPERANDS[4] = -OPERANDS[3]
4407 OPERANDS[5] = the rtl condition representing <cond>
4408 OPERANDS[6] = <tmp>
4409 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
4410 bool
4411 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4413 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4414 rtx dst = operands[0];
4415 rtx maxmin_op = operands[2];
4416 rtx add_op = operands[3];
4417 machine_mode mode = GET_MODE (dst);
4419 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4420 == (x >= y ? x : y) - z
4421 == (x > y ? x : y) - z
4422 == (x > y - 1 ? x : y) - z
4424 min (x, y) - z == (x <= y - 1 ? x : y) - z
4425 == (x <= y ? x : y) - z
4426 == (x < y ? x : y) - z
4427 == (x < y + 1 ? x : y) - z
4429 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4430 which x is compared with z. Set DIFF to y - z. Thus the supported
4431 combinations are as follows, with DIFF being the value after the ":":
4433 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4434 == x >= y ? x - y : 0 [z == y]
4435 == x > y ? x - y : 0 [z == y]
4436 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4438 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4439 == x <= y ? x - y : 0 [z == y]
4440 == x < y ? x - y : 0 [z == y]
4441 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4442 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4443 auto add_val = rtx_mode_t (add_op, mode);
4444 auto sub_val = wi::neg (add_val);
4445 auto diff = wi::sub (maxmin_val, sub_val);
4446 if (!(diff == 0
4447 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4448 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4449 return false;
4451 if (!generate_p)
4452 return true;
4454 rtx_code cmp;
4455 switch (code)
4457 case SMAX:
4458 cmp = diff == 1 ? GT : GE;
4459 break;
4460 case UMAX:
4461 cmp = diff == 1 ? GTU : GEU;
4462 break;
4463 case SMIN:
4464 cmp = diff == -1 ? LT : LE;
4465 break;
4466 case UMIN:
4467 cmp = diff == -1 ? LTU : LEU;
4468 break;
4469 default:
4470 gcc_unreachable ();
4472 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4474 operands[4] = immed_wide_int_const (sub_val, mode);
4475 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4476 if (can_create_pseudo_p ())
4477 operands[6] = gen_reg_rtx (mode);
4478 else
4479 operands[6] = dst;
4480 operands[7] = immed_wide_int_const (diff, mode);
4482 return true;
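/* Worked example: smax (x, 5) - 5 has DIFF == 0 and is implemented as

     subs  tmp, x, #5
     csel  dst, tmp, wzr, ge

   i.e. x >= 5 ? x - 5 : 0.  For smax (x, 5) - 6, DIFF == -1 and the
   CSEL becomes a CSINV, giving x >= 6 ? x - 6 : -1.  */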
4486 /* Build the SYMBOL_REF for __tls_get_addr. */
4488 static GTY(()) rtx tls_get_addr_libfunc;
4491 aarch64_tls_get_addr (void)
4493 if (!tls_get_addr_libfunc)
4494 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4495 return tls_get_addr_libfunc;
4498 /* Return the TLS model to use for ADDR. */
4500 static enum tls_model
4501 tls_symbolic_operand_type (rtx addr)
4503 enum tls_model tls_kind = TLS_MODEL_NONE;
4504 poly_int64 offset;
4505 addr = strip_offset_and_salt (addr, &offset);
4506 if (SYMBOL_REF_P (addr))
4507 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4509 return tls_kind;
4512 /* We allow lo_sum's in our legitimate addresses so that combine
4513 can take care of combining addresses where necessary, but for
4514 generation purposes we generate the address
4515 as follows:
4516 RTL Absolute
4517 tmp = hi (symbol_ref); adrp x1, foo
4518 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
4521 PIC TLS
4522 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4523 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4524 bl __tls_get_addr
4527 Load TLS symbol, depending on TLS mechanism and TLS access model.
4529 Global Dynamic - Traditional TLS:
4530 adrp tmp, :tlsgd:imm
4531 add dest, tmp, #:tlsgd_lo12:imm
4532 bl __tls_get_addr
4534 Global Dynamic - TLS Descriptors:
4535 adrp dest, :tlsdesc:imm
4536 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4537 add dest, dest, #:tlsdesc_lo12:imm
4538 blr tmp
4539 mrs tp, tpidr_el0
4540 add dest, dest, tp
4542 Initial Exec:
4543 mrs tp, tpidr_el0
4544 adrp tmp, :gottprel:imm
4545 ldr dest, [tmp, #:gottprel_lo12:imm]
4546 add dest, dest, tp
4548 Local Exec:
4549 mrs tp, tpidr_el0
4550 add t0, tp, #:tprel_hi12:imm, lsl #12
4551 add t0, t0, #:tprel_lo12_nc:imm
4554 static void
4555 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4556 enum aarch64_symbol_type type)
4558 switch (type)
4560 case SYMBOL_SMALL_ABSOLUTE:
4562 /* In ILP32, the mode of dest can be either SImode or DImode. */
4563 rtx tmp_reg = dest;
4564 machine_mode mode = GET_MODE (dest);
4566 gcc_assert (mode == Pmode || mode == ptr_mode);
4568 if (can_create_pseudo_p ())
4569 tmp_reg = gen_reg_rtx (mode);
4571 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4572 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4573 return;
4576 case SYMBOL_TINY_ABSOLUTE:
4577 emit_insn (gen_rtx_SET (dest, imm));
4578 return;
4580 case SYMBOL_SMALL_GOT_28K:
4582 machine_mode mode = GET_MODE (dest);
4583 rtx gp_rtx = pic_offset_table_rtx;
4584 rtx insn;
4585 rtx mem;
4587 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4588 here before rtl expand. Tree IVOPT will generate rtl pattern to
4589 decide rtx costs, in which case pic_offset_table_rtx is not
4590 initialized. For that case no need to generate the first adrp
4591 instruction as the final cost for global variable access is
4592 one instruction. */
4593 if (gp_rtx != NULL)
4595 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
4596 use the page base as the GOT base, the first page may be wasted;
4597 in the worst case there is only 28K of space for the GOT).
4599 The instruction sequence generated for accessing a global variable is:
4602 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4604 Only one instruction is needed, but we must initialize
4605 pic_offset_table_rtx properly. We generate an initialization insn for
4606 every global access, and let CSE remove all the redundant copies.
4608 The final instruction sequence will look like the following
4609 for multiple global variable accesses.
4611 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4613 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4614 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4615 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4616 ... */
4618 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4619 crtl->uses_pic_offset_table = 1;
4620 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4622 if (mode != GET_MODE (gp_rtx))
4623 gp_rtx = gen_lowpart (mode, gp_rtx);
4627 if (mode == ptr_mode)
4629 if (mode == DImode)
4630 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4631 else
4632 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4634 mem = XVECEXP (SET_SRC (insn), 0, 0);
4636 else
4638 gcc_assert (mode == Pmode);
4640 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4641 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4644 /* The operand is expected to be MEM. Whenever the related insn
4645 pattern changed, above code which calculate mem should be
4646 updated. */
4647 gcc_assert (MEM_P (mem));
4648 MEM_READONLY_P (mem) = 1;
4649 MEM_NOTRAP_P (mem) = 1;
4650 emit_insn (insn);
4651 return;
4654 case SYMBOL_SMALL_GOT_4G:
4655 emit_insn (gen_rtx_SET (dest, imm));
4656 return;
4658 case SYMBOL_SMALL_TLSGD:
4660 rtx_insn *insns;
4661 /* The return type of __tls_get_addr is the C pointer type
4662 so use ptr_mode. */
4663 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4664 rtx tmp_reg = dest;
4666 if (GET_MODE (dest) != ptr_mode)
4667 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4669 start_sequence ();
4670 if (ptr_mode == SImode)
4671 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4672 else
4673 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4674 insns = get_insns ();
4675 end_sequence ();
4677 RTL_CONST_CALL_P (insns) = 1;
4678 emit_libcall_block (insns, tmp_reg, result, imm);
4679 /* Convert back to the mode of the dest adding a zero_extend
4680 from SImode (ptr_mode) to DImode (Pmode). */
4681 if (dest != tmp_reg)
4682 convert_move (dest, tmp_reg, true);
4683 return;
4686 case SYMBOL_SMALL_TLSDESC:
4688 machine_mode mode = GET_MODE (dest);
4689 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4690 rtx tp;
4692 gcc_assert (mode == Pmode || mode == ptr_mode);
4694 /* In ILP32, the got entry is always of SImode size. Unlike
4695 small GOT, the dest is fixed at reg 0. */
4696 if (TARGET_ILP32)
4697 emit_insn (gen_tlsdesc_small_si (imm));
4698 else
4699 emit_insn (gen_tlsdesc_small_di (imm));
4700 tp = aarch64_load_tp (NULL);
4702 if (mode != Pmode)
4703 tp = gen_lowpart (mode, tp);
4705 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4706 if (REG_P (dest))
4707 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4708 return;
4711 case SYMBOL_SMALL_TLSIE:
4713 /* In ILP32, the mode of dest can be either SImode or DImode,
4714 while the got entry is always of SImode size. The mode of
4715 dest depends on how dest is used: if dest is assigned to a
4716 pointer (e.g. in the memory), it has SImode; it may have
4717 DImode if dest is dereferenced to access the memory.
4718 This is why we have to handle three different tlsie_small
4719 patterns here (two patterns for ILP32). */
4720 machine_mode mode = GET_MODE (dest);
4721 rtx tmp_reg = gen_reg_rtx (mode);
4722 rtx tp = aarch64_load_tp (NULL);
4724 if (mode == ptr_mode)
4726 if (mode == DImode)
4727 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4728 else
4730 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4731 tp = gen_lowpart (mode, tp);
4734 else
4736 gcc_assert (mode == Pmode);
4737 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4740 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4741 if (REG_P (dest))
4742 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4743 return;
4746 case SYMBOL_TLSLE12:
4747 case SYMBOL_TLSLE24:
4748 case SYMBOL_TLSLE32:
4749 case SYMBOL_TLSLE48:
4751 machine_mode mode = GET_MODE (dest);
4752 rtx tp = aarch64_load_tp (NULL);
4754 if (mode != Pmode)
4755 tp = gen_lowpart (mode, tp);
4757 switch (type)
4759 case SYMBOL_TLSLE12:
4760 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4761 (dest, tp, imm));
4762 break;
4763 case SYMBOL_TLSLE24:
4764 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4765 (dest, tp, imm));
4766 break;
4767 case SYMBOL_TLSLE32:
4768 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4769 (dest, imm));
4770 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4771 (dest, dest, tp));
4772 break;
4773 case SYMBOL_TLSLE48:
4774 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4775 (dest, imm));
4776 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4777 (dest, dest, tp));
4778 break;
4779 default:
4780 gcc_unreachable ();
4783 if (REG_P (dest))
4784 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4785 return;
4788 case SYMBOL_TINY_GOT:
4790 rtx insn;
4791 machine_mode mode = GET_MODE (dest);
4793 if (mode == ptr_mode)
4794 insn = gen_ldr_got_tiny (mode, dest, imm);
4795 else
4797 gcc_assert (mode == Pmode);
4798 insn = gen_ldr_got_tiny_sidi (dest, imm);
4801 emit_insn (insn);
4802 return;
4805 case SYMBOL_TINY_TLSIE:
4807 machine_mode mode = GET_MODE (dest);
4808 rtx tp = aarch64_load_tp (NULL);
4810 if (mode == ptr_mode)
4812 if (mode == DImode)
4813 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4814 else
4816 tp = gen_lowpart (mode, tp);
4817 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4820 else
4822 gcc_assert (mode == Pmode);
4823 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4826 if (REG_P (dest))
4827 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4828 return;
4831 default:
4832 gcc_unreachable ();
4836 /* Emit a move from SRC to DEST. Assume that the move expanders can
4837 handle all moves if !can_create_pseudo_p (). The distinction is
4838 important because, unlike emit_move_insn, the move expanders know
4839 how to force Pmode objects into the constant pool even when the
4840 constant pool address is not itself legitimate. */
4841 static rtx
4842 aarch64_emit_move (rtx dest, rtx src)
4844 return (can_create_pseudo_p ()
4845 ? emit_move_insn (dest, src)
4846 : emit_move_insn_1 (dest, src));
4849 /* Apply UNOPTAB to OP and store the result in DEST. */
4851 static void
4852 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4854 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4855 if (dest != tmp)
4856 emit_move_insn (dest, tmp);
4859 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4861 static void
4862 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4864 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4865 OPTAB_DIRECT);
4866 if (dest != tmp)
4867 emit_move_insn (dest, tmp);
4870 /* Split a 128-bit move operation into two 64-bit move operations,
4871 taking care to handle partial overlap of register to register
4872 copies. Special cases are needed when moving between GP regs and
4873 FP regs. SRC can be a register, constant or memory; DST a register
4874 or memory. If either operand is memory it must not have any side
4875 effects. */
4876 void
4877 aarch64_split_128bit_move (rtx dst, rtx src)
4879 rtx dst_lo, dst_hi;
4880 rtx src_lo, src_hi;
4882 machine_mode mode = GET_MODE (dst);
4884 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
4885 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4886 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4888 if (REG_P (dst) && REG_P (src))
4890 int src_regno = REGNO (src);
4891 int dst_regno = REGNO (dst);
4893 /* Handle FP <-> GP regs. */
4894 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4896 src_lo = gen_lowpart (word_mode, src);
4897 src_hi = gen_highpart (word_mode, src);
4899 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4900 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4901 return;
4903 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4905 dst_lo = gen_lowpart (word_mode, dst);
4906 dst_hi = gen_highpart (word_mode, dst);
4908 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4909 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4910 return;
4914 dst_lo = gen_lowpart (word_mode, dst);
4915 dst_hi = gen_highpart (word_mode, dst);
4916 src_lo = gen_lowpart (word_mode, src);
4917 src_hi = gen_highpart_mode (word_mode, mode, src);
4919 /* At most one pairing may overlap. */
4920 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4922 aarch64_emit_move (dst_hi, src_hi);
4923 aarch64_emit_move (dst_lo, src_lo);
4925 else
4927 aarch64_emit_move (dst_lo, src_lo);
4928 aarch64_emit_move (dst_hi, src_hi);
4932 /* Return true if we should split a move from 128-bit value SRC
4933 to 128-bit register DEST. */
4935 bool
4936 aarch64_split_128bit_move_p (rtx dst, rtx src)
4938 if (FP_REGNUM_P (REGNO (dst)))
4939 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4940 /* All moves to GPRs need to be split. */
4941 return true;
4944 /* Split a complex SIMD move. */
4946 void
4947 aarch64_split_simd_move (rtx dst, rtx src)
4949 machine_mode src_mode = GET_MODE (src);
4950 machine_mode dst_mode = GET_MODE (dst);
4952 gcc_assert (VECTOR_MODE_P (dst_mode));
4954 if (REG_P (dst) && REG_P (src))
4956 gcc_assert (VECTOR_MODE_P (src_mode));
4957 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
4961 bool
4962 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4963 machine_mode ymode, rtx y)
4965 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4966 gcc_assert (r != NULL);
4967 return rtx_equal_p (x, r);
4970 /* Return TARGET if it is nonnull and a register of mode MODE.
4971 Otherwise, return a fresh register of mode MODE if we can,
4972 or TARGET reinterpreted as MODE if we can't. */
4974 static rtx
4975 aarch64_target_reg (rtx target, machine_mode mode)
4977 if (target && REG_P (target) && GET_MODE (target) == mode)
4978 return target;
4979 if (!can_create_pseudo_p ())
4981 gcc_assert (target);
4982 return gen_lowpart (mode, target);
4984 return gen_reg_rtx (mode);
4987 /* Return a register that contains the constant in BUILDER, given that
4988 the constant is a legitimate move operand. Use TARGET as the register
4989 if it is nonnull and convenient. */
4991 static rtx
4992 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4994 rtx src = builder.build ();
4995 target = aarch64_target_reg (target, GET_MODE (src));
4996 emit_insn (gen_rtx_SET (target, src));
4997 return target;
5000 static rtx
5001 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
5003 if (can_create_pseudo_p ())
5004 return force_reg (mode, value);
5005 else
5007 gcc_assert (x);
5008 aarch64_emit_move (x, value);
5009 return x;
5013 /* Return true if predicate value X is a constant in which every element
5014 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
5015 value, i.e. as a predicate in which all bits are significant. */
5017 static bool
5018 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
5020 if (!CONST_VECTOR_P (x))
5021 return false;
5023 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
5024 GET_MODE_NUNITS (GET_MODE (x)));
5025 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
5026 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
5027 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
5029 unsigned int nelts = const_vector_encoded_nelts (x);
5030 for (unsigned int i = 0; i < nelts; ++i)
5032 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
5033 if (!CONST_INT_P (elt))
5034 return false;
5036 builder.quick_push (elt);
5037 for (unsigned int j = 1; j < factor; ++j)
5038 builder.quick_push (const0_rtx);
5040 builder.finalize ();
5041 return true;
5044 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
5045 widest predicate element size it can have (that is, the largest size
5046 for which each element would still be 0 or 1). */
5048 unsigned int
5049 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
5051 /* Start with the most optimistic assumption: that we only need
5052 one bit per pattern. This is what we will use if only the first
5053 bit in each pattern is ever set. */
5054 unsigned int mask = GET_MODE_SIZE (DImode);
5055 mask |= builder.npatterns ();
5057 /* Look for set bits. */
5058 unsigned int nelts = builder.encoded_nelts ();
5059 for (unsigned int i = 1; i < nelts; ++i)
5060 if (INTVAL (builder.elt (i)) != 0)
5062 if (i & 1)
5063 return 1;
5064 mask |= i;
5066 return mask & -mask;
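/* Worked example: for the repeating constant { 1, 0, 0, 0, 0, 0, 0, 0 }
   (npatterns == 8, one element per pattern) no bit at a nonzero index is
   set, so the result is 8 & -8 == 8 and the constant can be used as a .D
   predicate.  If the element at index 4 were also set, the mask would
   become 12 and the result 4, i.e. a .S predicate at widest.  */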
5069 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5070 return that predicate mode, otherwise return opt_machine_mode (). */
5072 opt_machine_mode
5073 aarch64_ptrue_all_mode (rtx x)
5075 gcc_assert (GET_MODE (x) == VNx16BImode);
5076 if (!CONST_VECTOR_P (x)
5077 || !CONST_VECTOR_DUPLICATE_P (x)
5078 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5079 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5080 return opt_machine_mode ();
5082 unsigned int nelts = const_vector_encoded_nelts (x);
5083 for (unsigned int i = 1; i < nelts; ++i)
5084 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5085 return opt_machine_mode ();
5087 return aarch64_sve_pred_mode (nelts);
5090 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5091 that the constant would have with predicate element size ELT_SIZE
5092 (ignoring the upper bits in each element) and return:
5094 * -1 if all bits are set
5095 * N if the predicate has N leading set bits followed by all clear bits
5096 * 0 if the predicate does not have any of these forms. */
5099 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5100 unsigned int elt_size)
5102 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5103 followed by set bits. */
5104 if (builder.nelts_per_pattern () == 3)
5105 return 0;
5107 /* Skip over leading set bits. */
5108 unsigned int nelts = builder.encoded_nelts ();
5109 unsigned int i = 0;
5110 for (; i < nelts; i += elt_size)
5111 if (INTVAL (builder.elt (i)) == 0)
5112 break;
5113 unsigned int vl = i / elt_size;
5115 /* Check for the all-true case. */
5116 if (i == nelts)
5117 return -1;
5119 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5120 repeating pattern of set bits followed by clear bits. */
5121 if (builder.nelts_per_pattern () != 2)
5122 return 0;
5124 /* We have a "foreground" value and a duplicated "background" value.
5125 If the background might repeat and the last set bit belongs to it,
5126 we might have set bits followed by clear bits followed by set bits. */
5127 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5128 return 0;
5130 /* Make sure that the rest are all clear. */
5131 for (; i < nelts; i += elt_size)
5132 if (INTVAL (builder.elt (i)) != 0)
5133 return 0;
5135 return vl;
5138 /* See if there is an svpattern that encodes an SVE predicate of mode
5139 PRED_MODE in which the first VL bits are set and the rest are clear.
5140 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5141 A VL of -1 indicates an all-true vector. */
5143 aarch64_svpattern
5144 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5146 if (vl < 0)
5147 return AARCH64_SV_ALL;
5149 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5150 return AARCH64_NUM_SVPATTERNS;
5152 if (vl >= 1 && vl <= 8)
5153 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5155 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5156 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5158 int max_vl;
5159 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5161 if (vl == (max_vl / 3) * 3)
5162 return AARCH64_SV_MUL3;
5163 /* These would only trigger for non-power-of-2 lengths. */
5164 if (vl == (max_vl & -4))
5165 return AARCH64_SV_MUL4;
5166 if (vl == (1 << floor_log2 (max_vl)))
5167 return AARCH64_SV_POW2;
5168 if (vl == max_vl)
5169 return AARCH64_SV_ALL;
5171 return AARCH64_NUM_SVPATTERNS;
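/* For example, a VL of 3 maps to AARCH64_SV_VL3, a VL of 64 maps to
   AARCH64_SV_VL64 and a VL of -1 maps to AARCH64_SV_ALL, whereas a VL
   that might exceed the number of elements in PRED_MODE has no
   corresponding pattern and yields AARCH64_NUM_SVPATTERNS.  */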
5174 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5175 bits has the lowest bit set and the upper bits clear. This is the
5176 VNx16BImode equivalent of a PTRUE for controlling elements of
5177 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5178 all bits are significant, even the upper zeros. */
5181 aarch64_ptrue_all (unsigned int elt_size)
5183 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5184 builder.quick_push (const1_rtx);
5185 for (unsigned int i = 1; i < elt_size; ++i)
5186 builder.quick_push (const0_rtx);
5187 return builder.build ();
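/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BImode
   pattern { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the canonical all-true
   predicate for .S (4-byte) elements with every upper bit explicitly
   zero.  */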
5190 /* Return an all-true predicate register of mode MODE. */
5193 aarch64_ptrue_reg (machine_mode mode)
5195 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5196 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5197 return gen_lowpart (mode, reg);
5200 /* Return an all-false predicate register of mode MODE. */
5203 aarch64_pfalse_reg (machine_mode mode)
5205 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5206 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5207 return gen_lowpart (mode, reg);
5210 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5211 for it. PRED2[0] is the predicate for the instruction whose result
5212 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5213 for it. Return true if we can prove that the two predicates are
5214 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5215 with PRED1[0] without changing behavior. */
5217 bool
5218 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5220 machine_mode mode = GET_MODE (pred1[0]);
5221 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5222 && mode == GET_MODE (pred2[0])
5223 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5224 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5226 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5227 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5228 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5229 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5230 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5233 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
5234 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5235 Use TARGET as the target register if nonnull and convenient. */
5237 static rtx
5238 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5239 machine_mode data_mode, rtx op1, rtx op2)
5241 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5242 expand_operand ops[5];
5243 create_output_operand (&ops[0], target, pred_mode);
5244 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5245 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5246 create_input_operand (&ops[3], op1, data_mode);
5247 create_input_operand (&ops[4], op2, data_mode);
5248 expand_insn (icode, 5, ops);
5249 return ops[0].value;
5252 /* Use a comparison to convert integer vector SRC into MODE, which is
5253 the corresponding SVE predicate mode. Use TARGET for the result
5254 if it's nonnull and convenient. */
5257 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5259 machine_mode src_mode = GET_MODE (src);
5260 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5261 src, CONST0_RTX (src_mode));
5264 /* Return the assembly token for svprfop value PRFOP. */
5266 static const char *
5267 svprfop_token (enum aarch64_svprfop prfop)
5269 switch (prfop)
5271 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5272 AARCH64_FOR_SVPRFOP (CASE)
5273 #undef CASE
5274 case AARCH64_NUM_SVPRFOPS:
5275 break;
5277 gcc_unreachable ();
5280 /* Return the assembly string for an SVE prefetch operation with
5281 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5282 and that SUFFIX is the format for the remaining operands. */
5284 char *
5285 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5286 const char *suffix)
5288 static char buffer[128];
5289 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5290 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5291 mnemonic, svprfop_token (prfop), suffix);
5292 gcc_assert (written < sizeof (buffer));
5293 return buffer;
5296 /* Check whether we can calculate the number of elements in PATTERN
5297 at compile time, given that there are NELTS_PER_VQ elements per
5298 128-bit block. Return the value if so, otherwise return -1. */
5300 HOST_WIDE_INT
5301 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5303 unsigned int vl, const_vg;
5304 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5305 vl = 1 + (pattern - AARCH64_SV_VL1);
5306 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5307 vl = 16 << (pattern - AARCH64_SV_VL16);
5308 else if (aarch64_sve_vg.is_constant (&const_vg))
5310 /* There are two vector granules per quadword. */
5311 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5312 switch (pattern)
5314 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5315 case AARCH64_SV_MUL4: return nelts & -4;
5316 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5317 case AARCH64_SV_ALL: return nelts;
5318 default: gcc_unreachable ();
5321 else
5322 return -1;
5324 /* There are two vector granules per quadword. */
5325 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5326 if (known_le (vl, nelts_all))
5327 return vl;
5329 /* Requesting more elements than are available results in a PFALSE. */
5330 if (known_gt (vl, nelts_all))
5331 return 0;
5333 return -1;
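/* Worked examples: for CNTD (NELTS_PER_VQ == 2), pattern VL2 always folds
   to 2, since even the minimum 128-bit vector provides two doublewords,
   whereas VL4 folds to -1 when the vector length is not known at compile
   time.  With -msve-vector-bits=256 (const_vg == 4), CNTW folds to 8 for
   pattern ALL and to 6 for pattern MUL3.  */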
5336 /* Return true if we can move VALUE into a register using a single
5337 CNT[BHWD] instruction. */
5339 static bool
5340 aarch64_sve_cnt_immediate_p (poly_int64 value)
5342 HOST_WIDE_INT factor = value.coeffs[0];
5343 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5344 return (value.coeffs[1] == factor
5345 && IN_RANGE (factor, 2, 16 * 16)
5346 && (factor & 1) == 0
5347 && factor <= 16 * (factor & -factor));
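/* For example, the poly_int64 value (2, 2), i.e. the number of
   doublewords in a vector, is matched by CNTD, and (48, 48) by CNTB
   with a multiplier (48 == 16 * 3).  (34, 34) is rejected because
   34 == 2 * 17 would need a multiplier of 17, and any value whose two
   coefficients differ is rejected outright.  */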
5350 /* Likewise for rtx X. */
5352 bool
5353 aarch64_sve_cnt_immediate_p (rtx x)
5355 poly_int64 value;
5356 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
5359 /* Return the asm string for an instruction with a CNT-like vector size
5360 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5361 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5362 first part of the operands template (the part that comes before the
5363 vector size itself). PATTERN is the pattern to use. FACTOR is the
5364 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5365 in each quadword. If it is zero, we can use any element size. */
5367 static char *
5368 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5369 aarch64_svpattern pattern,
5370 unsigned int factor,
5371 unsigned int nelts_per_vq)
5373 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5375 if (nelts_per_vq == 0)
5376 /* There is some overlap in the ranges of the four CNT instructions.
5377 Here we always use the smallest possible element size, so that the
5378 multiplier is 1 wherever possible. */
5379 nelts_per_vq = factor & -factor;
5380 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5381 gcc_assert (IN_RANGE (shift, 1, 4));
5382 char suffix = "dwhb"[shift - 1];
5384 factor >>= shift;
5385 unsigned int written;
5386 if (pattern == AARCH64_SV_ALL && factor == 1)
5387 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5388 prefix, suffix, operands);
5389 else if (factor == 1)
5390 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5391 prefix, suffix, operands, svpattern_token (pattern));
5392 else
5393 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5394 prefix, suffix, operands, svpattern_token (pattern),
5395 factor);
5396 gcc_assert (written < sizeof (buffer));
5397 return buffer;
5400 /* Return the asm string for an instruction with a CNT-like vector size
5401 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5402 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5403 first part of the operands template (the part that comes before the
5404 vector size itself). X is the value of the vector size operand,
5405 as a polynomial integer rtx; we need to convert this into an "all"
5406 pattern with a multiplier. */
5408 char *
5409 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5410 rtx x)
5412 poly_int64 value = rtx_to_poly_int64 (x);
5413 gcc_assert (aarch64_sve_cnt_immediate_p (value));
5414 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5415 value.coeffs[1], 0);
5418 /* Return the asm string for an instruction with a CNT-like vector size
5419 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5420 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5421 first part of the operands template (the part that comes before the
5422 vector size itself). CNT_PAT[0..2] are the operands of the
5423 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5425 char *
5426 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5427 const char *operands, rtx *cnt_pat)
5429 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5430 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5431 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5432 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5433 factor, nelts_per_vq);
5436 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5438 bool
5439 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5441 poly_int64 value;
5442 return (poly_int_rtx_p (x, &value)
5443 && (aarch64_sve_cnt_immediate_p (value)
5444 || aarch64_sve_cnt_immediate_p (-value)));
5447 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5448 operand 0. */
5450 char *
5451 aarch64_output_sve_scalar_inc_dec (rtx offset)
5453 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5454 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5455 if (offset_value.coeffs[1] > 0)
5456 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5457 offset_value.coeffs[1], 0);
5458 else
5459 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5460 -offset_value.coeffs[1], 0);
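/* For example, an OFFSET equal to one vector length (coefficients 16, 16)
   produces "incb\t%x0", while an offset of minus two vector lengths
   produces "decb\t%x0, all, mul #2".  */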
5463 /* Return true if we can add VALUE to a register using a single ADDVL
5464 or ADDPL instruction. */
5466 static bool
5467 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5469 HOST_WIDE_INT factor = value.coeffs[0];
5470 if (factor == 0 || value.coeffs[1] != factor)
5471 return false;
5472 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5473 and a value of 16 is one vector width. */
5474 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5475 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5478 /* Likewise for rtx X. */
5480 bool
5481 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5483 poly_int64 value;
5484 return (poly_int_rtx_p (x, &value)
5485 && aarch64_sve_addvl_addpl_immediate_p (value));
5488 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5489 to operand 1 and storing the result in operand 0. */
5491 char *
5492 aarch64_output_sve_addvl_addpl (rtx offset)
5494 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5495 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5496 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5498 int factor = offset_value.coeffs[1];
5499 if ((factor & 15) == 0)
5500 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5501 else
5502 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5503 return buffer;
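/* For example, an OFFSET of one vector length (coefficients 16, 16)
   produces "addvl\t%x0, %x1, #1", while an offset of minus one predicate
   length (coefficients -2, -2) produces "addpl\t%x0, %x1, #-1".  */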
5506 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5507 instruction. If it is, store the number of elements in each vector
5508 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5509 factor in *FACTOR_OUT (if nonnull). */
5511 bool
5512 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5513 unsigned int *nelts_per_vq_out)
5515 rtx elt;
5516 poly_int64 value;
5518 if (!const_vec_duplicate_p (x, &elt)
5519 || !poly_int_rtx_p (elt, &value))
5520 return false;
5522 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5523 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5524 /* There's no vector INCB. */
5525 return false;
5527 HOST_WIDE_INT factor = value.coeffs[0];
5528 if (value.coeffs[1] != factor)
5529 return false;
5531 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5532 if ((factor % nelts_per_vq) != 0
5533 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5534 return false;
5536 if (factor_out)
5537 *factor_out = factor;
5538 if (nelts_per_vq_out)
5539 *nelts_per_vq_out = nelts_per_vq;
5540 return true;
5543 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5544 instruction. */
5546 bool
5547 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5549 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5552 /* Return the asm template for an SVE vector INC or DEC instruction.
5553 OPERANDS gives the operands before the vector count and X is the
5554 value of the vector count operand itself. */
5556 char *
5557 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5559 int factor;
5560 unsigned int nelts_per_vq;
5561 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5562 gcc_unreachable ();
5563 if (factor < 0)
5564 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5565 -factor, nelts_per_vq);
5566 else
5567 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5568 factor, nelts_per_vq);
5571 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5573 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5575 0x0000000100000001ull,
5576 0x0001000100010001ull,
5577 0x0101010101010101ull,
5578 0x1111111111111111ull,
5579 0x5555555555555555ull,
5584 /* Return true if 64-bit VAL is a valid bitmask immediate. */
5585 static bool
5586 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
5588 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
5589 int bits;
5591 /* Check for a single sequence of one bits and return quickly if so.
5592 The special cases of all ones and all zeroes return false. */
5593 tmp = val + (val & -val);
5595 if (tmp == (tmp & -tmp))
5596 return (val + 1) > 1;
5598 /* Invert if the immediate doesn't start with a zero bit - this means we
5599 only need to search for sequences of one bits. */
5600 if (val & 1)
5601 val = ~val;
5603 /* Find the first set bit and set tmp to val with the first sequence of one
5604 bits removed. Return success if there is a single sequence of ones. */
5605 first_one = val & -val;
5606 tmp = val & (val + first_one);
5608 if (tmp == 0)
5609 return true;
5611 /* Find the next set bit and compute the difference in bit position. */
5612 next_one = tmp & -tmp;
5613 bits = clz_hwi (first_one) - clz_hwi (next_one);
5614 mask = val ^ tmp;
5616 /* Check the bit position difference is a power of 2, and that the first
5617 sequence of one bits fits within 'bits' bits. */
5618 if ((mask >> bits) != 0 || bits != (bits & -bits))
5619 return false;
5621 /* Check the sequence of one bits is repeated 64/bits times. */
5622 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
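/* Illustrative sketch (not part of aarch64.cc): a slow reference definition
   of an AArch64 bitmask (logical) immediate, useful for cross-checking the
   fast test above.  A value is valid iff it is a repeated 2/4/8/16/32/64-bit
   element whose element is a rotated run of 1 to size-1 set bits, which
   excludes 0 and ~0.  Assumes only standard C++.  */
#include <cstdint>

static bool
bitmask_imm_reference (uint64_t val)
{
  for (unsigned size = 2; size <= 64; size *= 2)
    {
      uint64_t size_mask = size == 64 ? ~0ull : (1ull << size) - 1;
      for (unsigned ones = 1; ones < size; ones++)
        for (unsigned rot = 0; rot < size; rot++)
          {
            uint64_t elt = (1ull << ones) - 1;          /* run of ONES bits */
            if (rot)                                    /* rotate within SIZE */
              elt = ((elt << rot) | (elt >> (size - rot))) & size_mask;
            uint64_t rep = 0;                           /* replicate to 64 bits */
            for (unsigned i = 0; i < 64; i += size)
              rep |= elt << i;
            if (rep == val)
              return true;
          }
    }
  return false;
}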
5626 /* Return true if VAL is a valid bitmask immediate for MODE. */
5627 bool
5628 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5630 if (mode == DImode)
5631 return aarch64_bitmask_imm (val);
5633 if (mode == SImode)
5634 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
5636 /* Replicate small immediates to fit 64 bits. */
5637 int size = GET_MODE_UNIT_PRECISION (mode);
5638 val &= (HOST_WIDE_INT_1U << size) - 1;
5639 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
5641 return aarch64_bitmask_imm (val);
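/* Illustrative check (not part of aarch64.cc) of the table indexing used
   above: for a 32-bit int, __builtin_clz maps element sizes 32/16/8/4/2 to
   indices 0..4, selecting the multiplier that replicates a SIZE-bit value
   across all 64 bits.  Assumes GCC/Clang for __builtin_clz.  */
#include <cassert>
#include <cstdint>

static void
check_bitmask_imm_mul_indexing ()
{
  static const uint64_t mul[] = {
    0x0000000100000001ull,     /* size 32: clz (32) - 26 == 0 */
    0x0001000100010001ull,     /* size 16: clz (16) - 26 == 1 */
    0x0101010101010101ull,     /* size  8: clz (8)  - 26 == 2 */
    0x1111111111111111ull,     /* size  4: clz (4)  - 26 == 3 */
    0x5555555555555555ull,     /* size  2: clz (2)  - 26 == 4 */
  };
  /* Replicating the 8-bit value 0x3c fills every byte.  */
  assert (0x3cull * mul[__builtin_clz (8) - 26] == 0x3c3c3c3c3c3c3c3cull);
  /* Replicating the 16-bit value 0x00ff fills every halfword.  */
  assert (0xffull * mul[__builtin_clz (16) - 26] == 0x00ff00ff00ff00ffull);
}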
5645 /* Return true if the immediate VAL can be a bitmask immediate
5646 by changing the given MASK bits in VAL to zeroes, ones or bits
5647 from the other half of VAL. Return the new immediate in VAL2. */
5648 static inline bool
5649 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
5650 unsigned HOST_WIDE_INT &val2,
5651 unsigned HOST_WIDE_INT mask)
5653 val2 = val & ~mask;
5654 if (val2 != val && aarch64_bitmask_imm (val2))
5655 return true;
5656 val2 = val | mask;
5657 if (val2 != val && aarch64_bitmask_imm (val2))
5658 return true;
5659 val = val & ~mask;
5660 val2 = val | (((val >> 32) | (val << 32)) & mask);
5661 if (val2 != val && aarch64_bitmask_imm (val2))
5662 return true;
5663 val2 = val | (((val >> 16) | (val << 48)) & mask);
5664 if (val2 != val && aarch64_bitmask_imm (val2))
5665 return true;
5666 return false;
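/* Illustrative example (not part of aarch64.cc): a constant for which the
   helper above finds a two-instruction sequence.  0x0ffffff000001234 is not
   directly encodable, but clearing its low 16-bit chunk leaves the bitmask
   immediate 0x0ffffff000000000 (24 consecutive ones), so the value can be
   built as a bitmask MOV followed by one MOVK.  Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static void
check_bitmask_example ()
{
  uint64_t val = 0x0ffffff000001234ull;
  uint64_t val2 = val & ~0xffffull;              /* candidate for the bitmask MOV */
  assert (val2 == 0x0ffffff000000000ull);
  uint64_t rebuilt = val2 | (val & 0xffffull);   /* effect of MOVK #0x1234 */
  assert (rebuilt == val);
}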
5670 /* Return true if VAL is a valid MOVZ immediate. */
5671 static inline bool
5672 aarch64_is_movz (unsigned HOST_WIDE_INT val)
5674 return (val >> (ctz_hwi (val) & 48)) < 65536;
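/* Illustrative sketch (not part of aarch64.cc): an equivalent, more explicit
   form of the test above - VAL is a MOVZ immediate iff it is a single 16-bit
   field placed at bit 0, 16, 32 or 48.  Assumes only standard C++.  */
#include <cstdint>

static bool
movz_reference (uint64_t val)
{
  for (unsigned shift = 0; shift < 64; shift += 16)
    if ((val & ~(0xffffull << shift)) == 0)
      return true;
  return false;
}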
5678 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
5679 bool
5680 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
5682 return aarch64_is_movz (val) || aarch64_is_movz (~val)
5683 || aarch64_bitmask_imm (val);
5687 /* Return true if VAL is an immediate that can be created by a single
5688 MOV instruction. */
5689 bool
5690 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5692 gcc_assert (mode == SImode || mode == DImode);
5694 if (val < 65536)
5695 return true;
5697 unsigned HOST_WIDE_INT mask =
5698 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
5700 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
5701 return true;
5703 val = (val & mask) | ((val << 32) & ~mask);
5704 return aarch64_bitmask_imm (val);
5708 static int
5709 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5710 machine_mode mode)
5712 int i;
5713 unsigned HOST_WIDE_INT val, val2, mask;
5714 int one_match, zero_match;
5715 int num_insns;
5717 gcc_assert (mode == SImode || mode == DImode);
5719 val = INTVAL (imm);
5721 if (aarch64_move_imm (val, mode))
5723 if (generate)
5724 emit_insn (gen_rtx_SET (dest, imm));
5725 return 1;
5728 if ((val >> 32) == 0 || mode == SImode)
5730 if (generate)
5732 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5733 if (mode == SImode)
5734 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5735 GEN_INT ((val >> 16) & 0xffff)));
5736 else
5737 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5738 GEN_INT ((val >> 16) & 0xffff)));
5740 return 2;
5743 /* Remaining cases are all for DImode. */
5745 mask = 0xffff;
5746 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5747 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5748 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5749 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5751 /* Try a bitmask immediate and a movk to generate the immediate
5752 in 2 instructions. */
5754 if (zero_match < 2 && one_match < 2)
5756 for (i = 0; i < 64; i += 16)
5758 if (aarch64_check_bitmask (val, val2, mask << i))
5759 break;
5761 val2 = val & ~(mask << i);
5762 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
5763 break;
5766 if (i != 64)
5768 if (generate)
5770 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5771 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5772 GEN_INT ((val >> i) & 0xffff)));
5774 return 2;
5778 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
5779 if (zero_match + one_match == 0)
5781 for (i = 0; i < 48; i += 16)
5782 for (int j = i + 16; j < 64; j += 16)
5783 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
5785 if (generate)
5787 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5788 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5789 GEN_INT ((val >> i) & 0xffff)));
5790 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
5791 GEN_INT ((val >> j) & 0xffff)));
5793 return 3;
5797 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5798 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5799 otherwise skip zero bits. */
5801 num_insns = 1;
5802 mask = 0xffff;
5803 val2 = one_match > zero_match ? ~val : val;
5804 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5806 if (generate)
5807 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5808 ? (val | ~(mask << i))
5809 : (val & (mask << i)))));
5810 for (i += 16; i < 64; i += 16)
5812 if ((val2 & (mask << i)) == 0)
5813 continue;
5814 if (generate)
5815 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5816 GEN_INT ((val >> i) & 0xffff)));
5817 num_insns ++;
5820 return num_insns;
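/* Illustrative sketch (not part of aarch64.cc): the instruction count of the
   final MOVZ/MOVN + MOVK fallback above, computed standalone for a DImode
   value.  Constants that hit the earlier single-MOV or bitmask+MOVK paths may
   need fewer instructions, so this is only an upper bound on the function's
   return value.  Assumes only standard C++.  */
#include <cstdint>

static int
movz_movk_insn_count (uint64_t val)
{
  int zero_chunks = 0, ffff_chunks = 0;
  for (int i = 0; i < 64; i += 16)
    {
      uint16_t chunk = (uint16_t) (val >> i);
      zero_chunks += (chunk == 0);
      ffff_chunks += (chunk == 0xffff);
    }
  /* One MOVZ (or MOVN) covers the first significant chunk plus every chunk
     that is all zeros (or all ones); each remaining chunk costs one MOVK.  */
  int skipped = zero_chunks > ffff_chunks ? zero_chunks : ffff_chunks;
  return skipped == 4 ? 1 : 4 - skipped;
}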
5823 /* Return whether imm is a 128-bit immediate which is simple enough to
5824 expand inline. */
5825 bool
5826 aarch64_mov128_immediate (rtx imm)
5828 if (CONST_INT_P (imm))
5829 return true;
5831 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5833 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5834 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5836 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5837 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5841 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5842 a left shift of 0 or 12 bits. */
5843 bool
5844 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
5846 return val < 4096 || (val & 0xfff000) == val;
5849 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5850 that can be created with a left shift of 0 or 12. */
5851 static HOST_WIDE_INT
5852 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
5854 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5855 handle correctly. */
5856 gcc_assert (val < 0x1000000);
5858 if (val < 4096)
5859 return val;
5861 return val & 0xfff000;
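/* Illustrative example (not part of aarch64.cc): what the clamp above
   returns.  The result is always encodable as a 12-bit immediate shifted by
   0 or 12, and the remainder is itself a valid 12-bit immediate.  Assumes
   only standard C++.  */
#include <cassert>
#include <cstdint>

static void
clamp_to_uimm12_shift_example ()
{
  uint64_t val = 0x123456;                 /* below the 24-bit limit */
  uint64_t clamped = val < 4096 ? val : (val & 0xfff000);
  assert (clamped == 0x123000);
  assert (val - clamped == 0x456 && val - clamped < 4096);
}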
5865 /* Test whether:
5867 X = (X & AND_VAL) | IOR_VAL;
5869 can be implemented using:
5871 MOVK X, #(IOR_VAL >> shift), LSL #shift
5873 Return the shift if so, otherwise return -1. */
5874 int
5875 aarch64_movk_shift (const wide_int_ref &and_val,
5876 const wide_int_ref &ior_val)
5878 unsigned int precision = and_val.get_precision ();
5879 unsigned HOST_WIDE_INT mask = 0xffff;
5880 for (unsigned int shift = 0; shift < precision; shift += 16)
5882 if (and_val == ~mask && (ior_val & mask) == ior_val)
5883 return shift;
5884 mask <<= 16;
5886 return -1;
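/* Illustrative sketch (not part of aarch64.cc): the same test on plain
   64-bit masks instead of wide_int_refs.  A single MOVK at shift S
   implements X = (X & AND_VAL) | IOR_VAL exactly when AND_VAL clears one
   aligned 16-bit field and IOR_VAL lies entirely within that field, e.g.
   and_val 0xffffffff0000ffff with ior_val 0x12340000 gives shift 16
   ("movk x0, #0x1234, lsl #16").  Assumes only standard C++.  */
#include <cstdint>

static int
movk_shift_u64 (uint64_t and_val, uint64_t ior_val)
{
  for (unsigned shift = 0; shift < 64; shift += 16)
    {
      uint64_t field = 0xffffull << shift;
      if (and_val == ~field && (ior_val & field) == ior_val)
        return (int) shift;
    }
  return -1;
}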
5889 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5890 Assumed precondition: VAL_IN is not zero. */
5892 unsigned HOST_WIDE_INT
5893 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5895 int lowest_bit_set = ctz_hwi (val_in);
5896 int highest_bit_set = floor_log2 (val_in);
5897 gcc_assert (val_in != 0);
5899 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5900 (HOST_WIDE_INT_1U << lowest_bit_set));
5903 /* Create constant where bits outside of lowest bit set to highest bit set
5904 are set to 1. */
5906 unsigned HOST_WIDE_INT
5907 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5909 return val_in | ~aarch64_and_split_imm1 (val_in);
5912 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5914 bool
5915 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5917 scalar_int_mode int_mode;
5918 if (!is_a <scalar_int_mode> (mode, &int_mode))
5919 return false;
5921 if (aarch64_bitmask_imm (val_in, int_mode))
5922 return false;
5924 if (aarch64_move_imm (val_in, int_mode))
5925 return false;
5927 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5929 return aarch64_bitmask_imm (imm2, int_mode);
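/* Illustrative example (not part of aarch64.cc): a value accepted by the
   predicate above.  0x0ff000000000fff0 is neither a bitmask immediate nor a
   MOV immediate, but IMM1 (the contiguous cover of its set bits) and
   IMM2 = VAL | ~IMM1 are both bitmask immediates, so an AND with VAL can be
   split into two ANDs.  Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static void
and_split_example ()
{
  uint64_t val = 0x0ff000000000fff0ull;
  uint64_t imm1 = 0x0ffffffffffffff0ull;   /* lowest..highest set bit, all ones */
  uint64_t imm2 = val | ~imm1;             /* ones outside that range */
  assert (imm2 == 0xfff000000000ffffull);  /* ~imm2 is one run of 36 ones */
  assert ((imm1 & imm2) == val);           /* AND imm1 then AND imm2 == AND val */
}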
5932 /* Return the number of temporary registers that aarch64_add_offset_1
5933 would need to add OFFSET to a register. */
5935 static unsigned int
5936 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5938 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5941 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5942 a non-polynomial OFFSET. MODE is the mode of the addition.
5943 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5944 be set and CFA adjustments added to the generated instructions.
5946 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5947 temporary if register allocation is already complete. This temporary
5948 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5949 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5950 the immediate again.
5952 Since this function may be used to adjust the stack pointer, we must
5953 ensure that it cannot cause transient stack deallocation (for example
5954 by first incrementing SP and then decrementing when adjusting by a
5955 large immediate). */
5957 static void
5958 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5959 rtx src, HOST_WIDE_INT offset, rtx temp1,
5960 bool frame_related_p, bool emit_move_imm)
5962 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5963 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5965 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5966 rtx_insn *insn;
5968 if (!moffset)
5970 if (!rtx_equal_p (dest, src))
5972 insn = emit_insn (gen_rtx_SET (dest, src));
5973 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5975 return;
5978 /* Single instruction adjustment. */
5979 if (aarch64_uimm12_shift (moffset))
5981 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5982 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5983 return;
5986 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
5987 and either:
5989 a) the offset cannot be loaded by a 16-bit move or
5990 b) there is no spare register into which we can move it. */
5991 if (moffset < 0x1000000
5992 && ((!temp1 && !can_create_pseudo_p ())
5993 || !aarch64_move_imm (moffset, mode)))
5995 HOST_WIDE_INT low_off = moffset & 0xfff;
5997 low_off = offset < 0 ? -low_off : low_off;
5998 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
5999 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6000 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
6001 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6002 return;
6005 /* Emit a move immediate if required and an addition/subtraction. */
6006 if (emit_move_imm)
6008 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
6009 temp1 = aarch64_force_temporary (mode, temp1,
6010 gen_int_mode (moffset, mode));
6012 insn = emit_insn (offset < 0
6013 ? gen_sub3_insn (dest, src, temp1)
6014 : gen_add3_insn (dest, src, temp1));
6015 if (frame_related_p)
6017 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6018 rtx adj = plus_constant (mode, src, offset);
6019 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
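/* Illustrative example (not part of aarch64.cc): the sub-24-bit two-step
   adjustment used above.  Both steps move in the same direction, which is
   what keeps stack-pointer adjustments free of transient deallocation, and
   both magnitudes are valid 12-bit immediates (unshifted and shifted).
   Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static void
add_offset_split_example ()
{
  int64_t offset = -0x123456;                     /* |offset| < 0x1000000 */
  uint64_t moffset = (uint64_t) (-offset);
  int64_t low_off = (int64_t) (moffset & 0xfff);
  low_off = offset < 0 ? -low_off : low_off;      /* first step: -0x456 */
  int64_t rest = offset - low_off;                /* second step: -0x123000 */
  assert (low_off == -0x456 && rest == -0x123000);
  assert ((uint64_t) (-low_off) < 4096);
  assert (((uint64_t) (-rest) & 0xfff000) == (uint64_t) (-rest));
}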
6023 /* Return the number of temporary registers that aarch64_add_offset
6024 would need to move OFFSET into a register or add OFFSET to a register;
6025 ADD_P is true if we want the latter rather than the former. */
6027 static unsigned int
6028 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
6030 /* This follows the same structure as aarch64_add_offset. */
6031 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
6032 return 0;
6034 unsigned int count = 0;
6035 HOST_WIDE_INT factor = offset.coeffs[1];
6036 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6037 poly_int64 poly_offset (factor, factor);
6038 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6039 /* Need one register for the ADDVL/ADDPL result. */
6040 count += 1;
6041 else if (factor != 0)
6043 factor = abs (factor);
6044 if (factor > 16 * (factor & -factor))
6045 /* Need one register for the CNT result and one for the multiplication
6046 factor. If necessary, the second temporary can be reused for the
6047 constant part of the offset. */
6048 return 2;
6049 /* Need one register for the CNT result (which might then
6050 be shifted). */
6051 count += 1;
6053 return count + aarch64_add_offset_1_temporaries (constant);
6056 /* If X can be represented as a poly_int64, return the number
6057 of temporaries that are required to add it to a register.
6058 Return -1 otherwise. */
6060 int
6061 aarch64_add_offset_temporaries (rtx x)
6063 poly_int64 offset;
6064 if (!poly_int_rtx_p (x, &offset))
6065 return -1;
6066 return aarch64_offset_temporaries (true, offset);
6069 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
6070 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
6071 be set and CFA adjustments added to the generated instructions.
6073 TEMP1, if nonnull, is a register of mode MODE that can be used as a
6074 temporary if register allocation is already complete. This temporary
6075 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
6076 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
6077 false to avoid emitting the immediate again.
6079 TEMP2, if nonnull, is a second temporary register that doesn't
6080 overlap either DEST or REG.
6082 Since this function may be used to adjust the stack pointer, we must
6083 ensure that it cannot cause transient stack deallocation (for example
6084 by first incrementing SP and then decrementing when adjusting by a
6085 large immediate). */
6087 static void
6088 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6089 poly_int64 offset, rtx temp1, rtx temp2,
6090 bool frame_related_p, bool emit_move_imm = true)
6092 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
6093 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
6094 gcc_assert (temp1 == NULL_RTX
6095 || !frame_related_p
6096 || !reg_overlap_mentioned_p (temp1, dest));
6097 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
6099 /* Try using ADDVL or ADDPL to add the whole value. */
6100 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
6102 rtx offset_rtx = gen_int_mode (offset, mode);
6103 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6104 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6105 return;
6108 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
6109 SVE vector register, over and above the minimum size of 128 bits.
6110 This is equivalent to half the value returned by CNTD with a
6111 vector shape of ALL. */
6112 HOST_WIDE_INT factor = offset.coeffs[1];
6113 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6115 /* Try using ADDVL or ADDPL to add the VG-based part. */
6116 poly_int64 poly_offset (factor, factor);
6117 if (src != const0_rtx
6118 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6120 rtx offset_rtx = gen_int_mode (poly_offset, mode);
6121 if (frame_related_p)
6123 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6124 RTX_FRAME_RELATED_P (insn) = true;
6125 src = dest;
6127 else
6129 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
6130 src = aarch64_force_temporary (mode, temp1, addr);
6131 temp1 = temp2;
6132 temp2 = NULL_RTX;
6135 /* Otherwise use a CNT-based sequence. */
6136 else if (factor != 0)
6138 /* Use a subtraction if we have a negative factor. */
6139 rtx_code code = PLUS;
6140 if (factor < 0)
6142 factor = -factor;
6143 code = MINUS;
6146 /* Calculate CNTD * FACTOR / 2. First try to fold the division
6147 into the multiplication. */
6148 rtx val;
6149 int shift = 0;
6150 if (factor & 1)
6151 /* Use a right shift by 1. */
6152 shift = -1;
6153 else
6154 factor /= 2;
6155 HOST_WIDE_INT low_bit = factor & -factor;
6156 if (factor <= 16 * low_bit)
6158 if (factor > 16 * 8)
6160 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
6161 the value with the minimum multiplier and shift it into
6162 position. */
6163 int extra_shift = exact_log2 (low_bit);
6164 shift += extra_shift;
6165 factor >>= extra_shift;
6167 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
6169 else
6171 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
6172 directly, since that should increase the chances of being
6173 able to use a shift and add sequence. If LOW_BIT itself
6174 is out of range, just use CNTD. */
6175 if (low_bit <= 16 * 8)
6176 factor /= low_bit;
6177 else
6178 low_bit = 1;
6180 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
6181 val = aarch64_force_temporary (mode, temp1, val);
6183 if (can_create_pseudo_p ())
6185 rtx coeff1 = gen_int_mode (factor, mode);
6186 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
6188 else
6190 /* Go back to using a negative multiplication factor if we have
6191 no register from which to subtract. */
6192 if (code == MINUS && src == const0_rtx)
6194 factor = -factor;
6195 code = PLUS;
6197 rtx coeff1 = gen_int_mode (factor, mode);
6198 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
6199 val = gen_rtx_MULT (mode, val, coeff1);
6203 if (shift > 0)
6205 /* Multiply by 1 << SHIFT. */
6206 val = aarch64_force_temporary (mode, temp1, val);
6207 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
6209 else if (shift == -1)
6211 /* Divide by 2. */
6212 val = aarch64_force_temporary (mode, temp1, val);
6213 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
6216 /* Calculate SRC +/- CNTD * FACTOR / 2. */
6217 if (src != const0_rtx)
6219 val = aarch64_force_temporary (mode, temp1, val);
6220 val = gen_rtx_fmt_ee (code, mode, src, val);
6222 else if (code == MINUS)
6224 val = aarch64_force_temporary (mode, temp1, val);
6225 val = gen_rtx_NEG (mode, val);
6228 if (constant == 0 || frame_related_p)
6230 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
6231 if (frame_related_p)
6233 RTX_FRAME_RELATED_P (insn) = true;
6234 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6235 gen_rtx_SET (dest, plus_constant (Pmode, src,
6236 poly_offset)));
6238 src = dest;
6239 if (constant == 0)
6240 return;
6242 else
6244 src = aarch64_force_temporary (mode, temp1, val);
6245 temp1 = temp2;
6246 temp2 = NULL_RTX;
6249 emit_move_imm = true;
6252 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
6253 frame_related_p, emit_move_imm);
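/* Illustrative example (not part of aarch64.cc), assuming the usual GCC
   convention that an SVE poly_int64 (C0, C1) evaluates to C0 + C1 * (VQ - 1)
   for a vector length of VQ quadwords: rewriting the offset as
   (C0 - C1) + C1 * (CNTD / 2) is what makes the CNT-based sequence above
   work, since CNTD returns 2 * VQ.  Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static void
poly_offset_example ()
{
  int64_t c0 = 80, c1 = 64;                /* offset (80, 64) in bytes */
  for (int64_t vq = 1; vq <= 16; ++vq)     /* 128-bit to 2048-bit vectors */
    {
      int64_t runtime_offset = c0 + c1 * (vq - 1);
      int64_t constant = c0 - c1;          /* handled by ADD immediates */
      int64_t cntd = 2 * vq;               /* what "CNTD Xn, ALL" returns */
      assert (constant + c1 * cntd / 2 == runtime_offset);
    }
}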
6256 /* Like aarch64_add_offset, but the offset is given as an rtx rather
6257 than a poly_int64. */
6259 void
6260 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6261 rtx offset_rtx, rtx temp1, rtx temp2)
6263 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
6264 temp1, temp2, false);
6267 /* Add DELTA to the stack pointer, marking the instructions frame-related.
6268 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
6269 if TEMP1 already contains abs (DELTA). */
6271 static inline void
6272 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
6274 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
6275 temp1, temp2, true, emit_move_imm);
6278 /* Subtract DELTA from the stack pointer, marking the instructions
6279 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
6280 if nonnull. */
6282 static inline void
6283 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
6284 bool emit_move_imm = true)
6286 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
6287 temp1, temp2, frame_related_p, emit_move_imm);
6290 /* Set DEST to (vec_series BASE STEP). */
6292 static void
6293 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6295 machine_mode mode = GET_MODE (dest);
6296 scalar_mode inner = GET_MODE_INNER (mode);
6298 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6299 if (!aarch64_sve_index_immediate_p (base))
6300 base = force_reg (inner, base);
6301 if (!aarch64_sve_index_immediate_p (step))
6302 step = force_reg (inner, step);
6304 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
6307 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6308 register of mode MODE. Use TARGET for the result if it's nonnull
6309 and convenient.
6311 The two vector modes must have the same element mode. The behavior
6312 is to duplicate architectural lane N of SRC into architectural lanes
6313 N + I * STEP of the result. On big-endian targets, architectural
6314 lane 0 of an Advanced SIMD vector is the last element of the vector
6315 in memory layout, so for big-endian targets this operation has the
6316 effect of reversing SRC before duplicating it. Callers need to
6317 account for this. */
6319 rtx
6320 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6322 machine_mode src_mode = GET_MODE (src);
6323 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6324 insn_code icode = (BYTES_BIG_ENDIAN
6325 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6326 : code_for_aarch64_vec_duplicate_vq_le (mode));
6328 unsigned int i = 0;
6329 expand_operand ops[3];
6330 create_output_operand (&ops[i++], target, mode);
6331 create_output_operand (&ops[i++], src, src_mode);
6332 if (BYTES_BIG_ENDIAN)
6334 /* Create a PARALLEL describing the reversal of SRC. */
6335 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6336 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6337 nelts_per_vq - 1, -1);
6338 create_fixed_operand (&ops[i++], sel);
6340 expand_insn (icode, i, ops);
6341 return ops[0].value;
6344 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6345 the memory image into DEST. Return true on success. */
6347 static bool
6348 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6350 src = force_const_mem (GET_MODE (src), src);
6351 if (!src)
6352 return false;
6354 /* Make sure that the address is legitimate. */
6355 if (!aarch64_sve_ld1rq_operand_p (src))
6357 rtx addr = force_reg (Pmode, XEXP (src, 0));
6358 src = replace_equiv_address (src, addr);
6361 machine_mode mode = GET_MODE (dest);
6362 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6363 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6364 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6365 return true;
6368 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6369 by N "background" values. Try to move it into TARGET using:
6371 PTRUE PRED.<T>, VL<N>
6372 MOV TRUE.<T>, #<foreground>
6373 MOV FALSE.<T>, #<background>
6374 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6376 The PTRUE is always a single instruction but the MOVs might need a
6377 longer sequence. If the background value is zero (as it often is),
6378 the sequence can sometimes collapse to a PTRUE followed by a
6379 zero-predicated move.
6381 Return the target on success, otherwise return null. */
6383 static rtx
6384 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6386 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6388 /* Make sure that the PTRUE is valid. */
6389 machine_mode mode = GET_MODE (src);
6390 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6391 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6392 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6393 == AARCH64_NUM_SVPATTERNS)
6394 return NULL_RTX;
6396 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6397 rtx_vector_builder true_builder (mode, npatterns, 1);
6398 rtx_vector_builder false_builder (mode, npatterns, 1);
6399 for (unsigned int i = 0; i < npatterns; ++i)
6401 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6402 pred_builder.quick_push (CONST1_RTX (BImode));
6404 for (unsigned int i = 0; i < npatterns; ++i)
6406 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6407 pred_builder.quick_push (CONST0_RTX (BImode));
6409 expand_operand ops[4];
6410 create_output_operand (&ops[0], target, mode);
6411 create_input_operand (&ops[1], true_builder.build (), mode);
6412 create_input_operand (&ops[2], false_builder.build (), mode);
6413 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6414 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6415 return target;
6418 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6419 SVE data mode and isn't a legitimate constant. Use TARGET for the
6420 result if convenient.
6422 The returned register can have whatever mode seems most natural
6423 given the contents of SRC. */
6425 static rtx
6426 aarch64_expand_sve_const_vector (rtx target, rtx src)
6428 machine_mode mode = GET_MODE (src);
6429 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6430 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6431 scalar_mode elt_mode = GET_MODE_INNER (mode);
6432 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6433 unsigned int container_bits = aarch64_sve_container_bits (mode);
6434 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6436 if (nelts_per_pattern == 1
6437 && encoded_bits <= 128
6438 && container_bits != elt_bits)
6440 /* We have a partial vector mode and a constant whose full-vector
6441 equivalent would occupy a repeating 128-bit sequence. Build that
6442 full-vector equivalent instead, so that we have the option of
6443 using LD1RQ and Advanced SIMD operations. */
6444 unsigned int repeat = container_bits / elt_bits;
6445 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6446 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6447 for (unsigned int i = 0; i < npatterns; ++i)
6448 for (unsigned int j = 0; j < repeat; ++j)
6449 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6450 target = aarch64_target_reg (target, full_mode);
6451 return aarch64_expand_sve_const_vector (target, builder.build ());
6454 if (nelts_per_pattern == 1 && encoded_bits == 128)
6456 /* The constant is a duplicated quadword but can't be narrowed
6457 beyond a quadword. Get the memory image of the first quadword
6458 as a 128-bit vector and try using LD1RQ to load it from memory.
6460 The effect for both endiannesses is to load memory lane N into
6461 architectural lanes N + I * STEP of the result. On big-endian
6462 targets, the layout of the 128-bit vector in an Advanced SIMD
6463 register would be different from its layout in an SVE register,
6464 but this 128-bit vector is a memory value only. */
6465 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6466 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6467 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6468 return target;
6471 if (nelts_per_pattern == 1 && encoded_bits < 128)
6473 /* The vector is a repeating sequence of 64 bits or fewer.
6474 See if we can load them using an Advanced SIMD move and then
6475 duplicate it to fill a vector. This is better than using a GPR
6476 move because it keeps everything in the same register file. */
6477 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6478 rtx_vector_builder builder (vq_mode, npatterns, 1);
6479 for (unsigned int i = 0; i < npatterns; ++i)
6481 /* We want memory lane N to go into architectural lane N,
6482 so reverse for big-endian targets. The DUP .Q pattern
6483 has a compensating reverse built-in. */
6484 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6485 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6487 rtx vq_src = builder.build ();
6488 if (aarch64_simd_valid_immediate (vq_src, NULL))
6490 vq_src = force_reg (vq_mode, vq_src);
6491 return aarch64_expand_sve_dupq (target, mode, vq_src);
6494 /* Get an integer representation of the repeating part of Advanced
6495 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6496 which for big-endian targets is lane-swapped wrt a normal
6497 Advanced SIMD vector. This means that for both endiannesses,
6498 memory lane N of SVE vector SRC corresponds to architectural
6499 lane N of a register holding VQ_SRC. This in turn means that
6500 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6501 as a single 128-bit value) and thus that memory lane 0 of SRC is
6502 in the lsb of the integer. Duplicating the integer therefore
6503 ensures that memory lane N of SRC goes into architectural lane
6504 N + I * INDEX of the SVE register. */
6505 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6506 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6507 if (elt_value)
6509 /* Pretend that we had a vector of INT_MODE to start with. */
6510 elt_mode = int_mode;
6511 mode = aarch64_full_sve_mode (int_mode).require ();
6513 /* If the integer can be moved into a general register by a
6514 single instruction, do that and duplicate the result. */
6515 if (CONST_INT_P (elt_value)
6516 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
6518 elt_value = force_reg (elt_mode, elt_value);
6519 return expand_vector_broadcast (mode, elt_value);
6522 else if (npatterns == 1)
6523 /* We're duplicating a single value, but can't do better than
6524 force it to memory and load from there. This handles things
6525 like symbolic constants. */
6526 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6528 if (elt_value)
6530 /* Load the element from memory if we can, otherwise move it into
6531 a register and use a DUP. */
6532 rtx op = force_const_mem (elt_mode, elt_value);
6533 if (!op)
6534 op = force_reg (elt_mode, elt_value);
6535 return expand_vector_broadcast (mode, op);
6539 /* Try using INDEX. */
6540 rtx base, step;
6541 if (const_vec_series_p (src, &base, &step))
6543 aarch64_expand_vec_series (target, base, step);
6544 return target;
6547 /* From here on, it's better to force the whole constant to memory
6548 if we can. */
6549 if (GET_MODE_NUNITS (mode).is_constant ())
6550 return NULL_RTX;
6552 if (nelts_per_pattern == 2)
6553 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6554 return res;
6556 /* Expand each pattern individually. */
6557 gcc_assert (npatterns > 1);
6558 rtx_vector_builder builder;
6559 auto_vec<rtx, 16> vectors (npatterns);
6560 for (unsigned int i = 0; i < npatterns; ++i)
6562 builder.new_vector (mode, 1, nelts_per_pattern);
6563 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6564 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6565 vectors.quick_push (force_reg (mode, builder.build ()));
6568 /* Use permutes to interleave the separate vectors. */
6569 while (npatterns > 1)
6571 npatterns /= 2;
6572 for (unsigned int i = 0; i < npatterns; ++i)
6574 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6575 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6576 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6577 vectors[i] = tmp;
6580 gcc_assert (vectors[0] == target);
6581 return target;
6584 /* Use WHILE to set a predicate register of mode MODE in which the first
6585 VL bits are set and the rest are clear. Use TARGET for the register
6586 if it's nonnull and convenient. */
6588 static rtx
6589 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6590 unsigned int vl)
6592 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6593 target = aarch64_target_reg (target, mode);
6594 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6595 target, const0_rtx, limit));
6596 return target;
6599 static rtx
6600 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6602 /* BUILDER is a constant predicate in which the index of every set bit
6603 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6604 by inverting every element at a multiple of ELT_SIZE and EORing the
6605 result with an ELT_SIZE PTRUE.
6607 Return a register that contains the constant on success, otherwise
6608 return null. Use TARGET as the register if it is nonnull and
6609 convenient. */
6611 static rtx
6612 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6613 unsigned int elt_size)
6615 /* Invert every element at a multiple of ELT_SIZE, keeping the
6616 other bits zero. */
6617 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6618 builder.nelts_per_pattern ());
6619 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6620 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6621 inv_builder.quick_push (const1_rtx);
6622 else
6623 inv_builder.quick_push (const0_rtx);
6624 inv_builder.finalize ();
6626 /* See if we can load the constant cheaply. */
6627 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6628 if (!inv)
6629 return NULL_RTX;
6631 /* EOR the result with an ELT_SIZE PTRUE. */
6632 rtx mask = aarch64_ptrue_all (elt_size);
6633 mask = force_reg (VNx16BImode, mask);
6634 inv = gen_lowpart (VNx16BImode, inv);
6635 target = aarch64_target_reg (target, VNx16BImode);
6636 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6637 return target;
6640 /* BUILDER is a constant predicate in which the index of every set bit
6641 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6642 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6643 register on success, otherwise return null. Use TARGET as the register
6644 if nonnull and convenient. */
6646 static rtx
6647 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6648 unsigned int elt_size,
6649 unsigned int permute_size)
6651 /* We're going to split the constant into two new constants A and B,
6652 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6653 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6655 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6656 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6658 where _ indicates elements that will be discarded by the permute.
6660 First calculate the ELT_SIZEs for A and B. */
6661 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6662 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6663 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6664 if (INTVAL (builder.elt (i)) != 0)
6666 if (i & permute_size)
6667 b_elt_size |= i - permute_size;
6668 else
6669 a_elt_size |= i;
6671 a_elt_size &= -a_elt_size;
6672 b_elt_size &= -b_elt_size;
6674 /* Now construct the vectors themselves. */
6675 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6676 builder.nelts_per_pattern ());
6677 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6678 builder.nelts_per_pattern ());
6679 unsigned int nelts = builder.encoded_nelts ();
6680 for (unsigned int i = 0; i < nelts; ++i)
6681 if (i & (elt_size - 1))
6683 a_builder.quick_push (const0_rtx);
6684 b_builder.quick_push (const0_rtx);
6686 else if ((i & permute_size) == 0)
6688 /* The A and B elements are significant. */
6689 a_builder.quick_push (builder.elt (i));
6690 b_builder.quick_push (builder.elt (i + permute_size));
6692 else
6694 /* The A and B elements are going to be discarded, so pick whatever
6695 is likely to give a nice constant. We are targeting element
6696 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6697 with the aim of each being a sequence of ones followed by
6698 a sequence of zeros. So:
6700 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6701 duplicate the last X_ELT_SIZE element, to extend the
6702 current sequence of ones or zeros.
6704 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6705 zero, so that the constant really does have X_ELT_SIZE and
6706 not a smaller size. */
6707 if (a_elt_size > permute_size)
6708 a_builder.quick_push (const0_rtx);
6709 else
6710 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6711 if (b_elt_size > permute_size)
6712 b_builder.quick_push (const0_rtx);
6713 else
6714 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6716 a_builder.finalize ();
6717 b_builder.finalize ();
6719 /* Try loading A into a register. */
6720 rtx_insn *last = get_last_insn ();
6721 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6722 if (!a)
6723 return NULL_RTX;
6725 /* Try loading B into a register. */
6726 rtx b = a;
6727 if (a_builder != b_builder)
6729 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6730 if (!b)
6732 delete_insns_since (last);
6733 return NULL_RTX;
6737 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6738 operands but permutes them as though they had mode MODE. */
6739 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6740 target = aarch64_target_reg (target, GET_MODE (a));
6741 rtx type_reg = CONST0_RTX (mode);
6742 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6743 return target;
6746 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6747 constant in BUILDER into an SVE predicate register. Return the register
6748 on success, otherwise return null. Use TARGET for the register if
6749 nonnull and convenient.
6751 ALLOW_RECURSE_P is true if we can use methods that would call this
6752 function recursively. */
6754 static rtx
6755 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6756 bool allow_recurse_p)
6758 if (builder.encoded_nelts () == 1)
6759 /* A PFALSE or a PTRUE .B ALL. */
6760 return aarch64_emit_set_immediate (target, builder);
6762 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6763 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6765 /* If we can load the constant using PTRUE, use it as-is. */
6766 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6767 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6768 return aarch64_emit_set_immediate (target, builder);
6770 /* Otherwise use WHILE to set the first VL bits. */
6771 return aarch64_sve_move_pred_via_while (target, mode, vl);
6774 if (!allow_recurse_p)
6775 return NULL_RTX;
6777 /* Try inverting the vector in element size ELT_SIZE and then EORing
6778 the result with an ELT_SIZE PTRUE. */
6779 if (INTVAL (builder.elt (0)) == 0)
6780 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6781 elt_size))
6782 return res;
6784 /* Try using TRN1 to permute two simpler constants. */
6785 for (unsigned int i = elt_size; i <= 8; i *= 2)
6786 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6787 elt_size, i))
6788 return res;
6790 return NULL_RTX;
6793 /* Return an SVE predicate register that contains the VNx16BImode
6794 constant in BUILDER, without going through the move expanders.
6796 The returned register can have whatever mode seems most natural
6797 given the contents of BUILDER. Use TARGET for the result if
6798 convenient. */
6800 static rtx
6801 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6803 /* Try loading the constant using pure predicate operations. */
6804 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6805 return res;
6807 /* Try forcing the constant to memory. */
6808 if (builder.full_nelts ().is_constant ())
6809 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6811 target = aarch64_target_reg (target, VNx16BImode);
6812 emit_move_insn (target, mem);
6813 return target;
6816 /* The last resort is to load the constant as an integer and then
6817 compare it against zero. Use -1 for set bits in order to increase
6818 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6819 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6820 builder.nelts_per_pattern ());
6821 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6822 int_builder.quick_push (INTVAL (builder.elt (i))
6823 ? constm1_rtx : const0_rtx);
6824 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6825 int_builder.build ());
6828 /* Set DEST to immediate IMM. */
6830 void
6831 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6833 machine_mode mode = GET_MODE (dest);
6835 /* Check on what type of symbol it is. */
6836 scalar_int_mode int_mode;
6837 if ((SYMBOL_REF_P (imm)
6838 || LABEL_REF_P (imm)
6839 || GET_CODE (imm) == CONST
6840 || GET_CODE (imm) == CONST_POLY_INT)
6841 && is_a <scalar_int_mode> (mode, &int_mode))
6843 rtx mem;
6844 poly_int64 offset;
6845 HOST_WIDE_INT const_offset;
6846 enum aarch64_symbol_type sty;
6848 /* If we have (const (plus symbol offset)), separate out the offset
6849 before we start classifying the symbol. */
6850 rtx base = strip_offset (imm, &offset);
6852 /* We must always add an offset involving VL separately, rather than
6853 folding it into the relocation. */
6854 if (!offset.is_constant (&const_offset))
6856 if (!TARGET_SVE)
6858 aarch64_report_sve_required ();
6859 return;
6861 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6862 emit_insn (gen_rtx_SET (dest, imm));
6863 else
6865 /* Do arithmetic on 32-bit values if the result is smaller
6866 than that. */
6867 if (partial_subreg_p (int_mode, SImode))
6869 /* It is invalid to do symbol calculations in modes
6870 narrower than SImode. */
6871 gcc_assert (base == const0_rtx);
6872 dest = gen_lowpart (SImode, dest);
6873 int_mode = SImode;
6875 if (base != const0_rtx)
6877 base = aarch64_force_temporary (int_mode, dest, base);
6878 aarch64_add_offset (int_mode, dest, base, offset,
6879 NULL_RTX, NULL_RTX, false);
6881 else
6882 aarch64_add_offset (int_mode, dest, base, offset,
6883 dest, NULL_RTX, false);
6885 return;
6888 sty = aarch64_classify_symbol (base, const_offset);
6889 switch (sty)
6891 case SYMBOL_FORCE_TO_MEM:
6892 if (int_mode != ptr_mode)
6893 imm = convert_memory_address (ptr_mode, imm);
6895 if (const_offset != 0
6896 && targetm.cannot_force_const_mem (ptr_mode, imm))
6898 gcc_assert (can_create_pseudo_p ());
6899 base = aarch64_force_temporary (int_mode, dest, base);
6900 aarch64_add_offset (int_mode, dest, base, const_offset,
6901 NULL_RTX, NULL_RTX, false);
6902 return;
6905 mem = force_const_mem (ptr_mode, imm);
6906 gcc_assert (mem);
6908 /* If we aren't generating PC relative literals, then
6909 we need to expand the literal pool access carefully.
6910 This is something that needs to be done in a number
6911 of places, so could well live as a separate function. */
6912 if (!aarch64_pcrelative_literal_loads)
6914 gcc_assert (can_create_pseudo_p ());
6915 base = gen_reg_rtx (ptr_mode);
6916 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6917 if (ptr_mode != Pmode)
6918 base = convert_memory_address (Pmode, base);
6919 mem = gen_rtx_MEM (ptr_mode, base);
6922 if (int_mode != ptr_mode)
6923 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6925 emit_insn (gen_rtx_SET (dest, mem));
6927 return;
6929 case SYMBOL_SMALL_TLSGD:
6930 case SYMBOL_SMALL_TLSDESC:
6931 case SYMBOL_SMALL_TLSIE:
6932 case SYMBOL_SMALL_GOT_28K:
6933 case SYMBOL_SMALL_GOT_4G:
6934 case SYMBOL_TINY_GOT:
6935 case SYMBOL_TINY_TLSIE:
6936 if (const_offset != 0)
6938 gcc_assert (can_create_pseudo_p ());
6939 base = aarch64_force_temporary (int_mode, dest, base);
6940 aarch64_add_offset (int_mode, dest, base, const_offset,
6941 NULL_RTX, NULL_RTX, false);
6942 return;
6944 /* FALLTHRU */
6946 case SYMBOL_SMALL_ABSOLUTE:
6947 case SYMBOL_TINY_ABSOLUTE:
6948 case SYMBOL_TLSLE12:
6949 case SYMBOL_TLSLE24:
6950 case SYMBOL_TLSLE32:
6951 case SYMBOL_TLSLE48:
6952 aarch64_load_symref_appropriately (dest, imm, sty);
6953 return;
6955 default:
6956 gcc_unreachable ();
6960 if (!CONST_INT_P (imm))
6962 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6964 /* Only the low bit of each .H, .S and .D element is defined,
6965 so we can set the upper bits to whatever we like. If the
6966 predicate is all-true in MODE, prefer to set all the undefined
6967 bits as well, so that we can share a single .B predicate for
6968 all modes. */
6969 if (imm == CONSTM1_RTX (mode))
6970 imm = CONSTM1_RTX (VNx16BImode);
6972 /* All methods for constructing predicate modes wider than VNx16BI
6973 will set the upper bits of each element to zero. Expose this
6974 by moving such constants as a VNx16BI, so that all bits are
6975 significant and so that constants for different modes can be
6976 shared. The wider constant will still be available as a
6977 REG_EQUAL note. */
6978 rtx_vector_builder builder;
6979 if (aarch64_get_sve_pred_bits (builder, imm))
6981 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6982 if (dest != res)
6983 emit_move_insn (dest, gen_lowpart (mode, res));
6984 return;
6988 if (GET_CODE (imm) == HIGH
6989 || aarch64_simd_valid_immediate (imm, NULL))
6991 emit_insn (gen_rtx_SET (dest, imm));
6992 return;
6995 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6996 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6998 if (dest != res)
6999 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
7000 return;
7003 rtx mem = force_const_mem (mode, imm);
7004 gcc_assert (mem);
7005 emit_move_insn (dest, mem);
7006 return;
7009 aarch64_internal_mov_immediate (dest, imm, true, mode);
7012 /* Return the MEM rtx that provides the canary value that should be used
7013 for stack-smashing protection. MODE is the mode of the memory.
7014 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
7015 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
7016 indicates whether the caller is performing a SET or a TEST operation. */
7018 rtx
7019 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
7020 aarch64_salt_type salt_type)
7022 rtx addr;
7023 if (aarch64_stack_protector_guard == SSP_GLOBAL)
7025 gcc_assert (MEM_P (decl_rtl));
7026 addr = XEXP (decl_rtl, 0);
7027 poly_int64 offset;
7028 rtx base = strip_offset_and_salt (addr, &offset);
7029 if (!SYMBOL_REF_P (base))
7030 return decl_rtl;
7032 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
7033 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
7034 addr = gen_rtx_CONST (Pmode, addr);
7035 addr = plus_constant (Pmode, addr, offset);
7037 else
7039 /* Calculate the address from the system register. */
7040 rtx salt = GEN_INT (salt_type);
7041 addr = gen_reg_rtx (mode);
7042 if (mode == DImode)
7043 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
7044 else
7046 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
7047 addr = convert_memory_address (Pmode, addr);
7049 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
7051 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
7054 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
7055 that is known to contain PTRUE. */
7057 void
7058 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
7060 expand_operand ops[3];
7061 machine_mode mode = GET_MODE (dest);
7062 create_output_operand (&ops[0], dest, mode);
7063 create_input_operand (&ops[1], pred, GET_MODE (pred));
7064 create_input_operand (&ops[2], src, mode);
7065 temporary_volatile_ok v (true);
7066 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
7069 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
7070 operand is in memory. In this case we need to use the predicated LD1
7071 and ST1 instead of LDR and STR, both for correctness on big-endian
7072 targets and because LD1 and ST1 support a wider range of addressing modes.
7073 PRED_MODE is the mode of the predicate.
7075 See the comment at the head of aarch64-sve.md for details about the
7076 big-endian handling. */
7078 void
7079 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
7081 machine_mode mode = GET_MODE (dest);
7082 rtx ptrue = aarch64_ptrue_reg (pred_mode);
7083 if (!register_operand (src, mode)
7084 && !register_operand (dest, mode))
7086 rtx tmp = gen_reg_rtx (mode);
7087 if (MEM_P (src))
7088 aarch64_emit_sve_pred_move (tmp, ptrue, src);
7089 else
7090 emit_move_insn (tmp, src);
7091 src = tmp;
7093 aarch64_emit_sve_pred_move (dest, ptrue, src);
7096 /* Called only on big-endian targets. See whether an SVE vector move
7097 from SRC to DEST is effectively a REV[BHW] instruction, because at
7098 least one operand is a subreg of an SVE vector that has wider or
7099 narrower elements. Return true and emit the instruction if so.
7101 For example:
7103 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
7105 represents a VIEW_CONVERT between the following vectors, viewed
7106 in memory order:
7108 R2: { [0].high, [0].low, [1].high, [1].low, ... }
7109 R1: { [0], [1], [2], [3], ... }
7111 The high part of lane X in R2 should therefore correspond to lane X*2
7112 of R1, but the register representations are:
7114 msb lsb
7115 R2: ...... [1].high [1].low [0].high [0].low
7116 R1: ...... [3] [2] [1] [0]
7118 where the low part of lane X in R2 corresponds to lane X*2 in R1.
7119 We therefore need a reverse operation to swap the high and low values
7120 around.
7122 This is purely an optimization. Without it we would spill the
7123 subreg operand to the stack in one mode and reload it in the
7124 other mode, which has the same effect as the REV. */
7126 bool
7127 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
7129 gcc_assert (BYTES_BIG_ENDIAN);
7131 /* Do not try to optimize subregs that LRA has created for matched
7132 reloads. These subregs only exist as a temporary measure to make
7133 the RTL well-formed, but they are exempt from the usual
7134 TARGET_CAN_CHANGE_MODE_CLASS rules.
7136 For example, if we have:
7138 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
7140 and the constraints require R1 and R2 to be in the same register,
7141 LRA may need to create RTL such as:
7143 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
7144 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
7145 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
7147 which forces both the input and output of the original instruction
7148 to use the same hard register. But for this to work, the normal
7149 rules have to be suppressed on the subreg input, otherwise LRA
7150 would need to reload that input too, meaning that the process
7151 would never terminate. To compensate for this, the normal rules
7152 are also suppressed for the subreg output of the first move.
7153 Ignoring the special case and handling the first move normally
7154 would therefore generate wrong code: we would reverse the elements
7155 for the first subreg but not reverse them back for the second subreg. */
7156 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
7157 dest = SUBREG_REG (dest);
7158 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
7159 src = SUBREG_REG (src);
7161 /* The optimization handles two single SVE REGs with different element
7162 sizes. */
7163 if (!REG_P (dest)
7164 || !REG_P (src)
7165 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
7166 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
7167 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
7168 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
7169 return false;
7171 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
7172 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
7173 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
7174 UNSPEC_REV_SUBREG);
7175 emit_insn (gen_rtx_SET (dest, unspec));
7176 return true;
7179 /* Return a copy of X with mode MODE, without changing its other
7180 attributes. Unlike gen_lowpart, this doesn't care whether the
7181 mode change is valid. */
7183 rtx
7184 aarch64_replace_reg_mode (rtx x, machine_mode mode)
7186 if (GET_MODE (x) == mode)
7187 return x;
7189 x = shallow_copy_rtx (x);
7190 set_mode_and_regno (x, mode, REGNO (x));
7191 return x;
7194 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
7195 stored in wider integer containers. */
7197 static unsigned int
7198 aarch64_sve_rev_unspec (machine_mode mode)
7200 switch (GET_MODE_UNIT_SIZE (mode))
7202 case 1: return UNSPEC_REVB;
7203 case 2: return UNSPEC_REVH;
7204 case 4: return UNSPEC_REVW;
7206 gcc_unreachable ();
7209 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
7210 operands. */
7212 void
7213 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
7215 /* Decide which REV operation we need. The mode with wider elements
7216 determines the mode of the operands and the mode with the narrower
7217 elements determines the reverse width. */
7218 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
7219 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
7220 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
7221 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
7222 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
7224 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
7225 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
7227 /* Get the operands in the appropriate modes and emit the instruction. */
7228 ptrue = gen_lowpart (pred_mode, ptrue);
7229 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
7230 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
7231 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
7232 dest, ptrue, src));
7235 static bool
7236 aarch64_function_ok_for_sibcall (tree, tree exp)
7238 if (crtl->abi->id () != expr_callee_abi (exp).id ())
7239 return false;
7241 return true;
7244 /* Subroutine of aarch64_pass_by_reference for arguments that are not
7245 passed in SVE registers. */
7247 static bool
7248 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
7249 const function_arg_info &arg)
7251 HOST_WIDE_INT size;
7252 machine_mode dummymode;
7253 int nregs;
7255 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
7256 if (arg.mode == BLKmode && arg.type)
7257 size = int_size_in_bytes (arg.type);
7258 else
7259 /* No frontends can create types with variable-sized modes, so we
7260 shouldn't be asked to pass or return them. */
7261 size = GET_MODE_SIZE (arg.mode).to_constant ();
7263 /* Aggregates are passed by reference based on their size. */
7264 if (arg.aggregate_type_p ())
7265 size = int_size_in_bytes (arg.type);
7267 /* Variable sized arguments are always returned by reference. */
7268 if (size < 0)
7269 return true;
7271 /* Can this be a candidate to be passed in fp/simd register(s)? */
7272 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
7273 &dummymode, &nregs, NULL,
7274 !pcum || pcum->silent_p))
7275 return false;
7277 /* Arguments which are variable sized or larger than 2 registers are
7278 passed by reference unless they are a homogeneous floating-point
7279 aggregate. */
7280 return size > 2 * UNITS_PER_WORD;
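/* Illustrative sketch (not part of aarch64.cc): the size rule above for an
   argument that is not an fp/simd candidate, with UNITS_PER_WORD taken to be
   8 as on a 64-bit target.  Anything larger than two registers, or of
   variable size, goes by reference.  Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static bool
passed_by_reference_size_rule (int64_t size_in_bytes)
{
  const int64_t units_per_word = 8;
  if (size_in_bytes < 0)                   /* variable-sized type */
    return true;
  return size_in_bytes > 2 * units_per_word;
}

static void
pass_by_reference_examples ()
{
  assert (!passed_by_reference_size_rule (16));   /* two doublewords: in regs */
  assert (passed_by_reference_size_rule (24));    /* three doublewords: by ref */
}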
7283 /* Implement TARGET_PASS_BY_REFERENCE. */
7285 static bool
7286 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7287 const function_arg_info &arg)
7289 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7291 if (!arg.type)
7292 return aarch64_pass_by_reference_1 (pcum, arg);
7294 pure_scalable_type_info pst_info;
7295 switch (pst_info.analyze (arg.type))
7297 case pure_scalable_type_info::IS_PST:
7298 if (pcum && !pcum->silent_p && !TARGET_SVE)
7299 /* We can't gracefully recover at this point, so make this a
7300 fatal error. */
7301 fatal_error (input_location, "arguments of type %qT require"
7302 " the SVE ISA extension", arg.type);
7304 /* Variadic SVE types are passed by reference. Normal non-variadic
7305 arguments are too if we've run out of registers. */
7306 return (!arg.named
7307 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7308 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7310 case pure_scalable_type_info::DOESNT_MATTER:
7311 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7312 return true;
7314 case pure_scalable_type_info::NO_ABI_IDENTITY:
7315 case pure_scalable_type_info::ISNT_PST:
7316 return aarch64_pass_by_reference_1 (pcum, arg);
7318 gcc_unreachable ();
7321 /* Return TRUE if VALTYPE is padded to its least significant bits. */
7322 static bool
7323 aarch64_return_in_msb (const_tree valtype)
7325 machine_mode dummy_mode;
7326 int dummy_int;
7328 /* Never happens in little-endian mode. */
7329 if (!BYTES_BIG_ENDIAN)
7330 return false;
7332 /* Only composite types smaller than or equal to 16 bytes can
7333 be potentially returned in registers. */
7334 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7335 || int_size_in_bytes (valtype) <= 0
7336 || int_size_in_bytes (valtype) > 16)
7337 return false;
7339 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7340 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7341 is always passed/returned in the least significant bits of fp/simd
7342 register(s). */
7343 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7344 &dummy_mode, &dummy_int, NULL,
7345 false))
7346 return false;
7348 /* Likewise pure scalable types for SVE vector and predicate registers. */
7349 pure_scalable_type_info pst_info;
7350 if (pst_info.analyze_registers (valtype))
7351 return false;
7353 return true;
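/* Illustrative note (not additional ABI text): on a big-endian target a
   6-byte composite that ends up being returned in x0 is therefore placed
   in the most significant 48 bits of the register, with the padding in
   the low bits.  An HFA such as

     struct hfa { float a, b; };

   is excluded by the check above and keeps each member in the least
   significant bits of s0 and s1.  */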
7356 /* Implement TARGET_FUNCTION_VALUE.
7357 Define how to find the value returned by a function. */
7359 static rtx
7360 aarch64_function_value (const_tree type, const_tree func,
7361 bool outgoing ATTRIBUTE_UNUSED)
7363 machine_mode mode;
7364 int unsignedp;
7366 mode = TYPE_MODE (type);
7367 if (INTEGRAL_TYPE_P (type))
7368 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7370 pure_scalable_type_info pst_info;
7371 if (type && pst_info.analyze_registers (type))
7372 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7374 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7375 are returned in memory, not by value. */
7376 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7377 bool sve_p = (vec_flags & VEC_ANY_SVE);
7379 if (aarch64_return_in_msb (type))
7381 HOST_WIDE_INT size = int_size_in_bytes (type);
7383 if (size % UNITS_PER_WORD != 0)
7385 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7386 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7390 int count;
7391 machine_mode ag_mode;
7392 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7393 NULL, false))
7395 gcc_assert (!sve_p);
7396 if (!aarch64_composite_type_p (type, mode))
7398 gcc_assert (count == 1 && mode == ag_mode);
7399 return gen_rtx_REG (mode, V0_REGNUM);
7401 else if (aarch64_advsimd_full_struct_mode_p (mode)
7402 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7403 return gen_rtx_REG (mode, V0_REGNUM);
7404 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7405 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7406 return gen_rtx_REG (mode, V0_REGNUM);
7407 else
7409 int i;
7410 rtx par;
7412 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7413 for (i = 0; i < count; i++)
7415 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7416 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7417 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7418 XVECEXP (par, 0, i) = tmp;
7420 return par;
7423 else
7425 if (sve_p)
7427 /* Vector types can acquire a partial SVE mode using things like
7428 __attribute__((vector_size(N))), and this is potentially useful.
7429 However, the choice of mode doesn't affect the type's ABI
7430 identity, so we should treat the types as though they had
7431 the associated integer mode, just like they did before SVE
7432 was introduced.
7434 We know that the vector must be 128 bits or smaller,
7435 otherwise we'd have returned it in memory instead. */
7436 gcc_assert (type
7437 && (aarch64_some_values_include_pst_objects_p (type)
7438 || (vec_flags & VEC_PARTIAL)));
7440 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7441 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7442 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7443 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7445 return gen_rtx_REG (mode, R0_REGNUM);
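/* A hedged example of the HFA path above: for

     struct hfa4 { float a, b, c, d; };

   aarch64_vfp_is_call_or_return_candidate reports count == 4 with
   ag_mode == SFmode, so the return value is described as a
   (parallel [...]) containing one SFmode register for each 4-byte offset,
   i.e. s0, s1, s2 and s3.  This merely restates what the code above
   constructs; it adds no new ABI rules.  */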
7449 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7450 Return true if REGNO is the number of a hard register in which the values
7451 of called function may come back. */
7453 static bool
7454 aarch64_function_value_regno_p (const unsigned int regno)
7456 /* A maximum of 16 bytes can be returned in the general registers. Examples
7457 of 16-byte return values are: 128-bit integers and 16-byte small
7458 structures (excluding homogeneous floating-point aggregates). */
7459 if (regno == R0_REGNUM || regno == R1_REGNUM)
7460 return true;
7462 /* Up to four fp/simd registers can return a function value, e.g. a
7463 homogeneous floating-point aggregate having four members. */
7464 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7465 return TARGET_FLOAT;
7467 return false;
7470 /* Subroutine for aarch64_return_in_memory for types that are not returned
7471 in SVE registers. */
7473 static bool
7474 aarch64_return_in_memory_1 (const_tree type)
7476 HOST_WIDE_INT size;
7477 machine_mode ag_mode;
7478 int count;
7480 if (!AGGREGATE_TYPE_P (type)
7481 && TREE_CODE (type) != COMPLEX_TYPE
7482 && TREE_CODE (type) != VECTOR_TYPE)
7483 /* Simple scalar types are always returned in registers. */
7484 return false;
7486 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7487 &ag_mode, &count, NULL, false))
7488 return false;
7490 /* Types larger than 2 registers are returned in memory. */
7491 size = int_size_in_bytes (type);
7492 return (size < 0 || size > 2 * UNITS_PER_WORD);
7495 /* Implement TARGET_RETURN_IN_MEMORY.
7497 If the type T of the result of a function is such that
7498 void func (T arg)
7499 would require that arg be passed as a value in a register (or set of
7500 registers) according to the parameter passing rules, then the result
7501 is returned in the same registers as would be used for such an
7502 argument. */
7504 static bool
7505 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7507 pure_scalable_type_info pst_info;
7508 switch (pst_info.analyze (type))
7510 case pure_scalable_type_info::IS_PST:
7511 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7512 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7514 case pure_scalable_type_info::DOESNT_MATTER:
7515 gcc_assert (aarch64_return_in_memory_1 (type));
7516 return true;
7518 case pure_scalable_type_info::NO_ABI_IDENTITY:
7519 case pure_scalable_type_info::ISNT_PST:
7520 return aarch64_return_in_memory_1 (type);
7522 gcc_unreachable ();
7525 static bool
7526 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7527 const_tree type, int *nregs)
7529 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7530 return aarch64_vfp_is_call_or_return_candidate (mode, type,
7531 &pcum->aapcs_vfp_rmode,
7532 nregs, NULL, pcum->silent_p);
7535 /* Given MODE and TYPE of a function argument, return the alignment in
7536 bits. The idea is to suppress any stronger alignment requested by
7537 the user and opt for the natural alignment (specified in AAPCS64 \S
7538 4.1). ABI_BREAK is set to the old alignment if the alignment was
7539 incorrectly calculated in versions of GCC prior to GCC-9, and to
7540 zero otherwise. This is a helper function for local use only. */
7542 static unsigned int
7543 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7544 unsigned int *abi_break)
7546 *abi_break = 0;
7547 if (!type)
7548 return GET_MODE_ALIGNMENT (mode);
7550 if (integer_zerop (TYPE_SIZE (type)))
7551 return 0;
7553 gcc_assert (TYPE_MODE (type) == mode);
7555 if (!AGGREGATE_TYPE_P (type))
7556 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
7558 if (TREE_CODE (type) == ARRAY_TYPE)
7559 return TYPE_ALIGN (TREE_TYPE (type));
7561 unsigned int alignment = 0;
7562 unsigned int bitfield_alignment = 0;
7563 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7564 if (TREE_CODE (field) == FIELD_DECL)
7566 /* Note that we explicitly consider zero-sized fields here,
7567 even though they don't map to AAPCS64 machine types.
7568 For example, in:
7570 struct __attribute__((aligned(8))) empty {};
7572 struct s {
7573 [[no_unique_address]] empty e;
7574 int x;
7577 "s" contains only one Fundamental Data Type (the int field)
7578 but gains 8-byte alignment and size thanks to "e". */
7579 alignment = std::max (alignment, DECL_ALIGN (field));
7580 if (DECL_BIT_FIELD_TYPE (field))
7581 bitfield_alignment
7582 = std::max (bitfield_alignment,
7583 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7586 if (bitfield_alignment > alignment)
7588 *abi_break = alignment;
7589 return bitfield_alignment;
7592 return alignment;
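/* A hedged illustration of the bit-field handling above (the exact
   front-end layout rules are not restated here): if a structure contains
   a bit-field declared with a 64-bit type while the fields' DECL_ALIGNs
   only reach 16 bits, then bitfield_alignment (64) exceeds alignment (16),
   so the function returns 64 and records 16 in *ABI_BREAK.  Callers such
   as aarch64_layout_arg use that value to emit the -Wpsabi note about the
   GCC 9.1 parameter-passing change.  */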
7595 /* Layout a function argument according to the AAPCS64 rules. The rule
7596 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7597 mode that was originally given to us by the target hook, whereas the
7598 mode in ARG might be the result of replacing partial SVE modes with
7599 the equivalent integer mode. */
7601 static void
7602 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7604 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7605 tree type = arg.type;
7606 machine_mode mode = arg.mode;
7607 int ncrn, nvrn, nregs;
7608 bool allocate_ncrn, allocate_nvrn;
7609 HOST_WIDE_INT size;
7610 unsigned int abi_break;
7612 /* We need to do this once per argument. */
7613 if (pcum->aapcs_arg_processed)
7614 return;
7616 pcum->aapcs_arg_processed = true;
7618 pure_scalable_type_info pst_info;
7619 if (type && pst_info.analyze_registers (type))
7621 /* The PCS says that it is invalid to pass an SVE value to an
7622 unprototyped function. There is no ABI-defined location we
7623 can return in this case, so we have no real choice but to raise
7624 an error immediately, even though this is only a query function. */
7625 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7627 gcc_assert (!pcum->silent_p);
7628 error ("SVE type %qT cannot be passed to an unprototyped function",
7629 arg.type);
7630 /* Avoid repeating the message, and avoid tripping the assert
7631 below. */
7632 pcum->pcs_variant = ARM_PCS_SVE;
7635 /* We would have converted the argument into pass-by-reference
7636 form if it didn't fit in registers. */
7637 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7638 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7639 gcc_assert (arg.named
7640 && pcum->pcs_variant == ARM_PCS_SVE
7641 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7642 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7643 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7644 P0_REGNUM + pcum->aapcs_nprn);
7645 return;
7648 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7649 are passed by reference, not by value. */
7650 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7651 bool sve_p = (vec_flags & VEC_ANY_SVE);
7652 if (sve_p)
7653 /* Vector types can acquire a partial SVE mode using things like
7654 __attribute__((vector_size(N))), and this is potentially useful.
7655 However, the choice of mode doesn't affect the type's ABI
7656 identity, so we should treat the types as though they had
7657 the associated integer mode, just like they did before SVE
7658 was introduced.
7660 We know that the vector must be 128 bits or smaller,
7661 otherwise we'd have passed it in memory instead. */
7662 gcc_assert (type
7663 && (aarch64_some_values_include_pst_objects_p (type)
7664 || (vec_flags & VEC_PARTIAL)));
7666 /* Size in bytes, rounded up to the next multiple of 8 bytes. */
7667 if (type)
7668 size = int_size_in_bytes (type);
7669 else
7670 /* No frontends can create types with variable-sized modes, so we
7671 shouldn't be asked to pass or return them. */
7672 size = GET_MODE_SIZE (mode).to_constant ();
7673 size = ROUND_UP (size, UNITS_PER_WORD);
7675 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7676 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7677 mode,
7678 type,
7679 &nregs);
7680 gcc_assert (!sve_p || !allocate_nvrn);
7682 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
7683 reliable. The following code thus handles passing by SIMD/FP registers first. */
7685 nvrn = pcum->aapcs_nvrn;
7687 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7688 and homogeneous short-vector aggregates (HVA). */
7689 if (allocate_nvrn)
7691 if (!pcum->silent_p && !TARGET_FLOAT)
7692 aarch64_err_no_fpadvsimd (mode);
7694 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7696 pcum->aapcs_nextnvrn = nvrn + nregs;
7697 if (!aarch64_composite_type_p (type, mode))
7699 gcc_assert (nregs == 1);
7700 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7702 else if (aarch64_advsimd_full_struct_mode_p (mode)
7703 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7704 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7705 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7706 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7707 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7708 else
7710 rtx par;
7711 int i;
7712 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7713 for (i = 0; i < nregs; i++)
7715 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7716 V0_REGNUM + nvrn + i);
7717 rtx offset = gen_int_mode
7718 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7719 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7720 XVECEXP (par, 0, i) = tmp;
7722 pcum->aapcs_reg = par;
7724 return;
7726 else
7728 /* C.3 NSRN is set to 8. */
7729 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7730 goto on_stack;
7734 ncrn = pcum->aapcs_ncrn;
7735 nregs = size / UNITS_PER_WORD;
7737 /* C6 - C9, though the sign and zero extension semantics are
7738 handled elsewhere. This is the case where the argument fits
7739 entirely in general registers. */
7740 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7742 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7744 /* C.8 if the argument has an alignment of 16 then the NGRN is
7745 rounded up to the next even number. */
7746 if (nregs == 2
7747 && ncrn % 2
7748 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7749 comparison is there because for > 16 * BITS_PER_UNIT
7750 alignment nregs should be > 2 and therefore it should be
7751 passed by reference rather than value. */
7752 && (aarch64_function_arg_alignment (mode, type, &abi_break)
7753 == 16 * BITS_PER_UNIT))
7755 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
7756 inform (input_location, "parameter passing for argument of type "
7757 "%qT changed in GCC 9.1", type);
7758 ++ncrn;
7759 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7762 /* If an argument with an SVE mode needs to be shifted up to the
7763 high part of the register, treat it as though it had an integer mode.
7764 Using the normal (parallel [...]) would suppress the shifting. */
7765 if (sve_p
7766 && BYTES_BIG_ENDIAN
7767 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7768 && aarch64_pad_reg_upward (mode, type, false))
7770 mode = int_mode_for_mode (mode).require ();
7771 sve_p = false;
7774 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7775 A reg is still generated for it, but the caller should be smart
7776 enough not to use it. */
7777 if (nregs == 0
7778 || (nregs == 1 && !sve_p)
7779 || GET_MODE_CLASS (mode) == MODE_INT)
7780 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7781 else
7783 rtx par;
7784 int i;
7786 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7787 for (i = 0; i < nregs; i++)
7789 scalar_int_mode reg_mode = word_mode;
7790 if (nregs == 1)
7791 reg_mode = int_mode_for_mode (mode).require ();
7792 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7793 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7794 GEN_INT (i * UNITS_PER_WORD));
7795 XVECEXP (par, 0, i) = tmp;
7797 pcum->aapcs_reg = par;
7800 pcum->aapcs_nextncrn = ncrn + nregs;
7801 return;
7804 /* C.11 */
7805 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7807 /* The argument is passed on the stack; record the number of words needed
7808 for this argument and align the total size if necessary. */
7809 on_stack:
7810 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7812 if (aarch64_function_arg_alignment (mode, type, &abi_break)
7813 == 16 * BITS_PER_UNIT)
7815 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7816 if (pcum->aapcs_stack_size != new_size)
7818 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
7819 inform (input_location, "parameter passing for argument of type "
7820 "%qT changed in GCC 9.1", type);
7821 pcum->aapcs_stack_size = new_size;
7824 return;
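/* A small illustration of rule C.8 above, with register numbers given
   purely as an example: for

     void f (int a, __int128 b);

   "a" occupies w0 and leaves NGRN == 1, but "b" has 16-byte alignment and
   needs two registers, so the NGRN is rounded up to the next even number
   and "b" is passed in x2/x3 rather than x1/x2.  */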
7827 /* Implement TARGET_FUNCTION_ARG. */
7829 static rtx
7830 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7832 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7833 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7834 || pcum->pcs_variant == ARM_PCS_SIMD
7835 || pcum->pcs_variant == ARM_PCS_SVE);
7837 if (arg.end_marker_p ())
7838 return gen_int_mode (pcum->pcs_variant, DImode);
7840 aarch64_layout_arg (pcum_v, arg);
7841 return pcum->aapcs_reg;
7844 void
7845 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7846 const_tree fntype,
7847 rtx libname ATTRIBUTE_UNUSED,
7848 const_tree fndecl ATTRIBUTE_UNUSED,
7849 unsigned n_named ATTRIBUTE_UNUSED,
7850 bool silent_p)
7852 pcum->aapcs_ncrn = 0;
7853 pcum->aapcs_nvrn = 0;
7854 pcum->aapcs_nprn = 0;
7855 pcum->aapcs_nextncrn = 0;
7856 pcum->aapcs_nextnvrn = 0;
7857 pcum->aapcs_nextnprn = 0;
7858 if (fntype)
7859 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7860 else
7861 pcum->pcs_variant = ARM_PCS_AAPCS64;
7862 pcum->aapcs_reg = NULL_RTX;
7863 pcum->aapcs_arg_processed = false;
7864 pcum->aapcs_stack_words = 0;
7865 pcum->aapcs_stack_size = 0;
7866 pcum->silent_p = silent_p;
7868 if (!silent_p
7869 && !TARGET_FLOAT
7870 && fntype && fntype != error_mark_node)
7872 const_tree type = TREE_TYPE (fntype);
7873 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7874 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7875 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7876 &mode, &nregs, NULL, false))
7877 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7880 if (!silent_p
7881 && !TARGET_SVE
7882 && pcum->pcs_variant == ARM_PCS_SVE)
7884 /* We can't gracefully recover at this point, so make this a
7885 fatal error. */
7886 if (fndecl)
7887 fatal_error (input_location, "%qE requires the SVE ISA extension",
7888 fndecl);
7889 else
7890 fatal_error (input_location, "calls to functions of type %qT require"
7891 " the SVE ISA extension", fntype);
7895 static void
7896 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7897 const function_arg_info &arg)
7899 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7900 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7901 || pcum->pcs_variant == ARM_PCS_SIMD
7902 || pcum->pcs_variant == ARM_PCS_SVE)
7904 aarch64_layout_arg (pcum_v, arg);
7905 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7906 != (pcum->aapcs_stack_words != 0));
7907 pcum->aapcs_arg_processed = false;
7908 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7909 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7910 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7912 pcum->aapcs_stack_words = 0;
7913 pcum->aapcs_reg = NULL_RTX;
7917 bool
7918 aarch64_function_arg_regno_p (unsigned regno)
7920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
7924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7925 PARM_BOUNDARY bits of alignment, but will be given anything up
7926 to STACK_BOUNDARY bits if the type requires it. This makes sure
7927 that both before and after the layout of each argument, the Next
7928 Stacked Argument Address (NSAA) will have a minimum alignment of
7929 8 bytes. */
7931 static unsigned int
7932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7934 unsigned int abi_break;
7935 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7936 &abi_break);
7937 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7938 if (abi_break && warn_psabi)
7940 abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
7941 if (alignment != abi_break)
7942 inform (input_location, "parameter passing for argument of type "
7943 "%qT changed in GCC 9.1", type);
7946 return alignment;
7949 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7951 static fixed_size_mode
7952 aarch64_get_reg_raw_mode (int regno)
7954 if (TARGET_SVE && FP_REGNUM_P (regno))
7955 /* Don't use the SVE part of the register for __builtin_apply and
7956 __builtin_return. The SVE registers aren't used by the normal PCS,
7957 so using them there would be a waste of time. The PCS extensions
7958 for SVE types are fundamentally incompatible with the
7959 __builtin_return/__builtin_apply interface. */
7960 return as_a <fixed_size_mode> (V16QImode);
7961 return default_get_reg_raw_mode (regno);
7964 /* Implement TARGET_FUNCTION_ARG_PADDING.
7966 Small aggregate types are placed in the lowest memory address.
7968 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7970 static pad_direction
7971 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7973 /* On little-endian targets, the least significant byte of every stack
7974 argument is passed at the lowest byte address of the stack slot. */
7975 if (!BYTES_BIG_ENDIAN)
7976 return PAD_UPWARD;
7978 /* Otherwise, integral, floating-point and pointer types are padded downward:
7979 the least significant byte of a stack argument is passed at the highest
7980 byte address of the stack slot. */
7981 if (type
7982 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7983 || POINTER_TYPE_P (type))
7984 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7985 return PAD_DOWNWARD;
7987 /* Everything else padded upward, i.e. data in first byte of stack slot. */
7988 return PAD_UPWARD;
7991 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7993 It specifies padding for the last (possibly the only) element
7994 of a block move between registers and memory. Assuming the
7995 block is in memory, padding upward means that the last element
7996 is padded after its most significant byte, while with downward
7997 padding the last element is padded on its least significant
7998 byte side.
8000 Small aggregates and small complex types are always padded
8001 upwards.
8003 We don't need to worry about homogeneous floating-point or
8004 short-vector aggregates; their move is not affected by the
8005 padding direction determined here. Regardless of endianness,
8006 each element of such an aggregate is put in the least
8007 significant bits of a fp/simd register.
8009 Return !BYTES_BIG_ENDIAN if the least significant byte of the
8010 register has useful data, and return the opposite if the most
8011 significant byte does. */
8013 bool
8014 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
8015 bool first ATTRIBUTE_UNUSED)
8018 /* Aside from pure scalable types, small composite types are always
8019 padded upward. */
8020 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
8022 HOST_WIDE_INT size;
8023 if (type)
8024 size = int_size_in_bytes (type);
8025 else
8026 /* No frontends can create types with variable-sized modes, so we
8027 shouldn't be asked to pass or return them. */
8028 size = GET_MODE_SIZE (mode).to_constant ();
8029 if (size < 2 * UNITS_PER_WORD)
8031 pure_scalable_type_info pst_info;
8032 if (pst_info.analyze_registers (type))
8033 return false;
8034 return true;
8038 /* Otherwise, use the default padding. */
8039 return !BYTES_BIG_ENDIAN;
8042 static scalar_int_mode
8043 aarch64_libgcc_cmp_return_mode (void)
8045 return SImode;
8048 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
8050 /* We use the 12-bit shifted immediate arithmetic instructions, so values
8051 must be a multiple of (1 << 12), i.e. 4096. */
8052 #define ARITH_FACTOR 4096
8054 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
8055 #error Cannot use simple address calculation for stack probing
8056 #endif
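/* For illustration: "sub sp, sp, #16" uses the unshifted 12-bit immediate
   form, while multiples of 4096 such as "sub sp, sp, #4096" are encoded
   with the same 12-bit immediate field and an LSL #12, which is why the
   probe offsets below are kept to multiples of ARITH_FACTOR.  */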
8058 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
8059 inclusive. These are offsets from the current stack pointer. */
8061 static void
8062 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
8064 HOST_WIDE_INT size;
8065 if (!poly_size.is_constant (&size))
8067 sorry ("stack probes for SVE frames");
8068 return;
8071 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
8073 /* See the same assertion on PROBE_INTERVAL above. */
8074 gcc_assert ((first % ARITH_FACTOR) == 0);
8076 /* See if we have a constant small number of probes to generate. If so,
8077 that's the easy case. */
8078 if (size <= PROBE_INTERVAL)
8080 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
8082 emit_set_insn (reg1,
8083 plus_constant (Pmode,
8084 stack_pointer_rtx, -(first + base)));
8085 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
8088 /* The run-time loop is made up of 8 insns in the generic case while the
8089 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
8090 else if (size <= 4 * PROBE_INTERVAL)
8092 HOST_WIDE_INT i, rem;
8094 emit_set_insn (reg1,
8095 plus_constant (Pmode,
8096 stack_pointer_rtx,
8097 -(first + PROBE_INTERVAL)));
8098 emit_stack_probe (reg1);
8100 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
8101 it exceeds SIZE. If only two probes are needed, this will not
8102 generate any code. Then probe at FIRST + SIZE. */
8103 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
8105 emit_set_insn (reg1,
8106 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
8107 emit_stack_probe (reg1);
8110 rem = size - (i - PROBE_INTERVAL);
8111 if (rem > 256)
8113 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8115 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
8116 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
8118 else
8119 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
8122 /* Otherwise, do the same as above, but in a loop. Note that we must be
8123 extra careful with variables wrapping around because we might be at
8124 the very top (or the very bottom) of the address space and we have
8125 to be able to handle this case properly; in particular, we use an
8126 equality test for the loop condition. */
8127 else
8129 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
8131 /* Step 1: round SIZE to the previous multiple of the interval. */
8133 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
8136 /* Step 2: compute initial and final value of the loop counter. */
8138 /* TEST_ADDR = SP + FIRST. */
8139 emit_set_insn (reg1,
8140 plus_constant (Pmode, stack_pointer_rtx, -first));
8142 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
8143 HOST_WIDE_INT adjustment = - (first + rounded_size);
8144 if (! aarch64_uimm12_shift (adjustment))
8146 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
8147 true, Pmode);
8148 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
8150 else
8151 emit_set_insn (reg2,
8152 plus_constant (Pmode, stack_pointer_rtx, adjustment));
8154 /* Step 3: the loop
8158 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
8159 probe at TEST_ADDR
8161 while (TEST_ADDR != LAST_ADDR)
8163 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
8164 until it is equal to ROUNDED_SIZE. */
8166 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
8169 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
8170 that SIZE is equal to ROUNDED_SIZE. */
8172 if (size != rounded_size)
8174 HOST_WIDE_INT rem = size - rounded_size;
8176 if (rem > 256)
8178 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8180 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
8181 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
8183 else
8184 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
8188 /* Make sure nothing is scheduled before we are done. */
8189 emit_insn (gen_blockage ());
8192 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
8193 absolute addresses. */
8195 const char *
8196 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
8198 static int labelno = 0;
8199 char loop_lab[32];
8200 rtx xops[2];
8202 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
8204 /* Loop. */
8205 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
8207 HOST_WIDE_INT stack_clash_probe_interval
8208 = 1 << param_stack_clash_protection_guard_size;
8210 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
8211 xops[0] = reg1;
8212 HOST_WIDE_INT interval;
8213 if (flag_stack_clash_protection)
8214 interval = stack_clash_probe_interval;
8215 else
8216 interval = PROBE_INTERVAL;
8218 gcc_assert (aarch64_uimm12_shift (interval));
8219 xops[1] = GEN_INT (interval);
8221 output_asm_insn ("sub\t%0, %0, %1", xops);
8223 /* If doing stack clash protection then we probe up by the ABI-specified
8224 amount. We do this because we're dropping full pages at a time in the
8225 loop. But if we're doing non-stack-clash probing, probe at SP + 0. */
8226 if (flag_stack_clash_protection)
8227 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
8228 else
8229 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
8231 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
8232 by this amount for each iteration. */
8233 output_asm_insn ("str\txzr, [%0, %1]", xops);
8235 /* Test if TEST_ADDR == LAST_ADDR. */
8236 xops[1] = reg2;
8237 output_asm_insn ("cmp\t%0, %1", xops);
8239 /* Branch. */
8240 fputs ("\tb.ne\t", asm_out_file);
8241 assemble_name_raw (asm_out_file, loop_lab);
8242 fputc ('\n', asm_out_file);
8244 return "";
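/* A hedged sketch of the output for the non-stack-clash case, assuming
   the default 4096-byte probe interval; the probe registers (shown here
   as x9 and x10) and the exact label spelling are illustrative only:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0  */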
8247 /* Emit the probe loop for doing stack clash probes and stack adjustments for
8248 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
8249 of GUARD_SIZE. When a probe is emitted it is done at most
8250 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
8251 at most MIN_PROBE_THRESHOLD. By the end of this function
8252 BASE = BASE - ADJUSTMENT. */
8254 const char *
8255 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
8256 rtx min_probe_threshold, rtx guard_size)
8258 /* This function is not allowed to use any instruction generation function
8259 like gen_ and friends. If you do you'll likely ICE during CFG validation,
8260 so instead emit the code you want using output_asm_insn. */
8261 gcc_assert (flag_stack_clash_protection);
8262 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
8263 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
8265 /* The minimum required allocation before the residual requires probing. */
8266 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
8268 /* Clamp the value down to the nearest value that can be used with a cmp. */
8269 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
8270 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
8272 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
8273 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
8275 static int labelno = 0;
8276 char loop_start_lab[32];
8277 char loop_end_lab[32];
8278 rtx xops[2];
8280 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
8281 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
8283 /* Emit loop start label. */
8284 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
8286 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8287 xops[0] = adjustment;
8288 xops[1] = probe_offset_value_rtx;
8289 output_asm_insn ("cmp\t%0, %1", xops);
8291 /* Branch to end if not enough adjustment to probe. */
8292 fputs ("\tb.lt\t", asm_out_file);
8293 assemble_name_raw (asm_out_file, loop_end_lab);
8294 fputc ('\n', asm_out_file);
8296 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8297 xops[0] = base;
8298 xops[1] = probe_offset_value_rtx;
8299 output_asm_insn ("sub\t%0, %0, %1", xops);
8301 /* Probe at BASE. */
8302 xops[1] = const0_rtx;
8303 output_asm_insn ("str\txzr, [%0, %1]", xops);
8305 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8306 xops[0] = adjustment;
8307 xops[1] = probe_offset_value_rtx;
8308 output_asm_insn ("sub\t%0, %0, %1", xops);
8310 /* Branch to start if still more bytes to allocate. */
8311 fputs ("\tb\t", asm_out_file);
8312 assemble_name_raw (asm_out_file, loop_start_lab);
8313 fputc ('\n', asm_out_file);
8316 /* No probe needed; leave the loop. */
8316 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8318 /* BASE = BASE - ADJUSTMENT. */
8319 xops[0] = base;
8320 xops[1] = adjustment;
8321 output_asm_insn ("sub\t%0, %0, %1", xops);
8322 return "";
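/* A hedged sketch of the emitted sequence, with illustrative register
   names (x10 for BASE, x11 for ADJUSTMENT) and RESIDUAL standing for the
   clamped residual_probe_guard value:

	.SVLPSPL0:
	cmp	x11, RESIDUAL
	b.lt	.SVLPEND0
	sub	x10, x10, RESIDUAL
	str	xzr, [x10, 0]
	sub	x11, x11, RESIDUAL
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11  */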
8325 /* Determine whether a frame chain needs to be generated. */
8326 static bool
8327 aarch64_needs_frame_chain (void)
8329 /* Force a frame chain for EH returns so the return address is at FP+8. */
8330 if (frame_pointer_needed || crtl->calls_eh_return)
8331 return true;
8333 /* A leaf function cannot have calls or write LR. */
8334 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8336 /* Don't use a frame chain in leaf functions if leaf frame pointers
8337 are disabled. */
8338 if (flag_omit_leaf_frame_pointer && is_leaf)
8339 return false;
8341 return aarch64_use_frame_pointer;
8344 /* Mark the registers that need to be saved by the callee and calculate
8345 the size of the callee-saved registers area and frame record (both FP
8346 and LR may be omitted). */
8347 static void
8348 aarch64_layout_frame (void)
8350 poly_int64 offset = 0;
8351 int regno, last_fp_reg = INVALID_REGNUM;
8352 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8353 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8354 bool frame_related_fp_reg_p = false;
8355 aarch64_frame &frame = cfun->machine->frame;
8357 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8359 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8360 the mid-end is doing. */
8361 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8363 #define SLOT_NOT_REQUIRED (-2)
8364 #define SLOT_REQUIRED (-1)
8366 frame.wb_push_candidate1 = INVALID_REGNUM;
8367 frame.wb_push_candidate2 = INVALID_REGNUM;
8368 frame.spare_pred_reg = INVALID_REGNUM;
8370 /* First mark all the registers that really need to be saved... */
8371 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8372 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8374 /* ... that includes the eh data registers (if needed)... */
8375 if (crtl->calls_eh_return)
8376 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8377 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8379 /* ... and any callee saved register that dataflow says is live. */
8380 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8381 if (df_regs_ever_live_p (regno)
8382 && !fixed_regs[regno]
8383 && (regno == R30_REGNUM
8384 || !crtl->abi->clobbers_full_reg_p (regno)))
8385 frame.reg_offset[regno] = SLOT_REQUIRED;
8387 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8388 if (df_regs_ever_live_p (regno)
8389 && !fixed_regs[regno]
8390 && !crtl->abi->clobbers_full_reg_p (regno))
8392 frame.reg_offset[regno] = SLOT_REQUIRED;
8393 last_fp_reg = regno;
8394 if (aarch64_emit_cfi_for_reg_p (regno))
8395 frame_related_fp_reg_p = true;
8398 /* Big-endian SVE frames need a spare predicate register in order
8399 to save Z8-Z15. Decide which register they should use. Prefer
8400 an unused argument register if possible, so that we don't force P4
8401 to be saved unnecessarily. */
8402 if (frame_related_fp_reg_p
8403 && crtl->abi->id () == ARM_PCS_SVE
8404 && BYTES_BIG_ENDIAN)
8406 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8407 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8408 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8409 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8410 break;
8411 gcc_assert (regno <= P7_REGNUM);
8412 frame.spare_pred_reg = regno;
8413 df_set_regs_ever_live (regno, true);
8416 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8417 if (df_regs_ever_live_p (regno)
8418 && !fixed_regs[regno]
8419 && !crtl->abi->clobbers_full_reg_p (regno))
8420 frame.reg_offset[regno] = SLOT_REQUIRED;
8422 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8423 LR counts as an implicit probe which allows us to maintain the invariant
8424 described in the comment at expand_prologue. */
8425 gcc_assert (crtl->is_leaf
8426 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8428 /* Now assign stack slots for the registers. Start with the predicate
8429 registers, since predicate LDR and STR have a relatively small
8430 offset range. These saves happen below the hard frame pointer. */
8431 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8432 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8434 frame.reg_offset[regno] = offset;
8435 offset += BYTES_PER_SVE_PRED;
8438 if (maybe_ne (offset, 0))
8440 /* If we have any vector registers to save above the predicate registers,
8441 the offset of the vector register save slots needs to be a multiple
8442 of the vector size. This lets us use the immediate forms of LDR/STR
8443 (or LD1/ST1 for big-endian).
8445 A vector register is 8 times the size of a predicate register,
8446 and we need to save a maximum of 12 predicate registers, so the
8447 first vector register will be at either #1, MUL VL or #2, MUL VL.
8449 If we don't have any vector registers to save, and we know how
8450 big the predicate save area is, we can just round it up to the
8451 next 16-byte boundary. */
8452 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8453 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8454 else
8456 if (known_le (offset, vector_save_size))
8457 offset = vector_save_size;
8458 else if (known_le (offset, vector_save_size * 2))
8459 offset = vector_save_size * 2;
8460 else
8461 gcc_unreachable ();
8465 /* If we need to save any SVE vector registers, add them next. */
8466 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8467 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8468 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8470 frame.reg_offset[regno] = offset;
8471 offset += vector_save_size;
8474 /* OFFSET is now the offset of the hard frame pointer from the bottom
8475 of the callee save area. */
8476 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8477 frame.below_hard_fp_saved_regs_size = offset;
8478 if (frame.emit_frame_chain)
8480 /* FP and LR are placed in the linkage record. */
8481 frame.reg_offset[R29_REGNUM] = offset;
8482 frame.wb_push_candidate1 = R29_REGNUM;
8483 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
8484 frame.wb_push_candidate2 = R30_REGNUM;
8485 offset += 2 * UNITS_PER_WORD;
8488 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8489 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8491 frame.reg_offset[regno] = offset;
8492 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8493 frame.wb_push_candidate1 = regno;
8494 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8495 frame.wb_push_candidate2 = regno;
8496 offset += UNITS_PER_WORD;
8499 poly_int64 max_int_offset = offset;
8500 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8501 bool has_align_gap = maybe_ne (offset, max_int_offset);
8503 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8504 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8506 /* If there is an alignment gap between integer and fp callee-saves,
8507 allocate the last fp register to it if possible. */
8508 if (regno == last_fp_reg
8509 && has_align_gap
8510 && known_eq (vector_save_size, 8)
8511 && multiple_p (offset, 16))
8513 frame.reg_offset[regno] = max_int_offset;
8514 break;
8517 frame.reg_offset[regno] = offset;
8518 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8519 frame.wb_push_candidate1 = regno;
8520 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8521 && frame.wb_push_candidate1 >= V0_REGNUM)
8522 frame.wb_push_candidate2 = regno;
8523 offset += vector_save_size;
8526 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8528 frame.saved_regs_size = offset;
8530 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
8532 poly_int64 above_outgoing_args
8533 = aligned_upper_bound (varargs_and_saved_regs_size
8534 + get_frame_size (),
8535 STACK_BOUNDARY / BITS_PER_UNIT);
8537 frame.hard_fp_offset
8538 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8540 /* Both these values are already aligned. */
8541 gcc_assert (multiple_p (crtl->outgoing_args_size,
8542 STACK_BOUNDARY / BITS_PER_UNIT));
8543 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
8545 frame.locals_offset = frame.saved_varargs_size;
8547 frame.initial_adjust = 0;
8548 frame.final_adjust = 0;
8549 frame.callee_adjust = 0;
8550 frame.sve_callee_adjust = 0;
8551 frame.callee_offset = 0;
8553 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8554 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8556 /* The shadow call stack is only used for functions that push LR onto
8557 the stack and that do not specify the "no_sanitize" attribute with
8558 the argument "shadow-call-stack". */
8559 frame.is_scs_enabled
8560 = (!crtl->calls_eh_return
8561 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8562 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8564 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8565 restore x30, and we don't need to pop x30 again in the traditional
8566 way. Pop candidates record the registers that need to be popped
8567 eventually. */
8568 if (frame.is_scs_enabled)
8570 if (frame.wb_pop_candidate2 == R30_REGNUM)
8571 frame.wb_pop_candidate2 = INVALID_REGNUM;
8572 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8573 frame.wb_pop_candidate1 = INVALID_REGNUM;
8576 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8577 256 to ensure that the offset meets the requirements of emit_move_insn.
8578 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8579 max_push_offset to 0, because no registers are popped at this time,
8580 so callee_adjust cannot be adjusted. */
8581 HOST_WIDE_INT max_push_offset = 0;
8582 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8583 max_push_offset = 512;
8584 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8585 max_push_offset = 256;
8587 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
8588 HOST_WIDE_INT const_saved_regs_size;
8589 if (frame.frame_size.is_constant (&const_size)
8590 && const_size < max_push_offset
8591 && known_eq (frame.hard_fp_offset, const_size))
8593 /* Simple, small frame with no outgoing arguments:
8595 stp reg1, reg2, [sp, -frame_size]!
8596 stp reg3, reg4, [sp, 16] */
8597 frame.callee_adjust = const_size;
8599 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
8600 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8601 && const_outgoing_args_size + const_saved_regs_size < 512
8602 /* We could handle this case even with outgoing args, provided
8603 that the number of args left us with valid offsets for all
8604 predicate and vector save slots. It's such a rare case that
8605 it hardly seems worth the effort though. */
8606 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
8607 && !(cfun->calls_alloca
8608 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8609 && const_fp_offset < max_push_offset))
8611 /* Frame with small outgoing arguments:
8613 sub sp, sp, frame_size
8614 stp reg1, reg2, [sp, outgoing_args_size]
8615 stp reg3, reg4, [sp, outgoing_args_size + 16] */
8616 frame.initial_adjust = frame.frame_size;
8617 frame.callee_offset = const_outgoing_args_size;
8619 else if (saves_below_hard_fp_p
8620 && known_eq (frame.saved_regs_size,
8621 frame.below_hard_fp_saved_regs_size))
8623 /* Frame in which all saves are SVE saves:
8625 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8626 save SVE registers relative to SP
8627 sub sp, sp, outgoing_args_size */
8628 frame.initial_adjust = (frame.hard_fp_offset
8629 + frame.below_hard_fp_saved_regs_size);
8630 frame.final_adjust = crtl->outgoing_args_size;
8632 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
8633 && const_fp_offset < max_push_offset)
8635 /* Frame with large outgoing arguments or SVE saves, but with
8636 a small local area:
8638 stp reg1, reg2, [sp, -hard_fp_offset]!
8639 stp reg3, reg4, [sp, 16]
8640 [sub sp, sp, below_hard_fp_saved_regs_size]
8641 [save SVE registers relative to SP]
8642 sub sp, sp, outgoing_args_size */
8643 frame.callee_adjust = const_fp_offset;
8644 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8645 frame.final_adjust = crtl->outgoing_args_size;
8647 else
8649 /* Frame with large local area and outgoing arguments or SVE saves,
8650 using frame pointer:
8652 sub sp, sp, hard_fp_offset
8653 stp x29, x30, [sp, 0]
8654 add x29, sp, 0
8655 stp reg3, reg4, [sp, 16]
8656 [sub sp, sp, below_hard_fp_saved_regs_size]
8657 [save SVE registers relative to SP]
8658 sub sp, sp, outgoing_args_size */
8659 frame.initial_adjust = frame.hard_fp_offset;
8660 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8661 frame.final_adjust = crtl->outgoing_args_size;
8664 /* Make sure the individual adjustments add up to the full frame size. */
8665 gcc_assert (known_eq (frame.initial_adjust
8666 + frame.callee_adjust
8667 + frame.sve_callee_adjust
8668 + frame.final_adjust, frame.frame_size));
8670 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8672 /* We've decided not to associate any register saves with the initial
8673 stack allocation. */
8674 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8675 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8678 frame.laid_out = true;
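/* A worked example of the first case above, under simplifying assumptions
   (frame chain needed, only x29/x30 saved, no locals, no outgoing args):
   saved_regs_size and frame_size are both 16, so callee_adjust == 16 and
   the allocation part of the prologue collapses to

     stp x29, x30, [sp, -16]!

   followed by establishing x29, with initial_adjust, sve_callee_adjust
   and final_adjust all zero.  */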
8681 /* Return true if the register REGNO is saved on entry to
8682 the current function. */
8684 static bool
8685 aarch64_register_saved_on_entry (int regno)
8687 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8690 /* Return the next register at or above REGNO, up to LIMIT, that the
8691 callee needs to save. */
8693 static unsigned
8694 aarch64_next_callee_save (unsigned regno, unsigned limit)
8696 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8697 regno ++;
8698 return regno;
8701 /* Push the register number REGNO of mode MODE to the stack with write-back
8702 adjusting the stack by ADJUSTMENT. */
8704 static void
8705 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8706 HOST_WIDE_INT adjustment)
8708 rtx base_rtx = stack_pointer_rtx;
8709 rtx insn, reg, mem;
8711 reg = gen_rtx_REG (mode, regno);
8712 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8713 plus_constant (Pmode, base_rtx, -adjustment));
8714 mem = gen_frame_mem (mode, mem);
8716 insn = emit_move_insn (mem, reg);
8717 RTX_FRAME_RELATED_P (insn) = 1;
8720 /* Generate and return an instruction to store the pair of registers
8721 REG and REG2 of mode MODE to location BASE with write-back adjusting
8722 the stack location BASE by ADJUSTMENT. */
8724 static rtx
8725 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8726 HOST_WIDE_INT adjustment)
8728 switch (mode)
8730 case E_DImode:
8731 return gen_storewb_pairdi_di (base, base, reg, reg2,
8732 GEN_INT (-adjustment),
8733 GEN_INT (UNITS_PER_WORD - adjustment));
8734 case E_DFmode:
8735 return gen_storewb_pairdf_di (base, base, reg, reg2,
8736 GEN_INT (-adjustment),
8737 GEN_INT (UNITS_PER_WORD - adjustment));
8738 case E_TFmode:
8739 return gen_storewb_pairtf_di (base, base, reg, reg2,
8740 GEN_INT (-adjustment),
8741 GEN_INT (UNITS_PER_VREG - adjustment));
8742 default:
8743 gcc_unreachable ();
8747 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8748 stack pointer by ADJUSTMENT. */
8750 static void
8751 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8753 rtx_insn *insn;
8754 machine_mode mode = aarch64_reg_save_mode (regno1);
8756 if (regno2 == INVALID_REGNUM)
8757 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8759 rtx reg1 = gen_rtx_REG (mode, regno1);
8760 rtx reg2 = gen_rtx_REG (mode, regno2);
8762 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8763 reg2, adjustment));
8764 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8765 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8766 RTX_FRAME_RELATED_P (insn) = 1;
8769 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8770 adjusting it by ADJUSTMENT afterwards. */
8772 static rtx
8773 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8774 HOST_WIDE_INT adjustment)
8776 switch (mode)
8778 case E_DImode:
8779 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8780 GEN_INT (UNITS_PER_WORD));
8781 case E_DFmode:
8782 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8783 GEN_INT (UNITS_PER_WORD));
8784 case E_TFmode:
8785 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8786 GEN_INT (UNITS_PER_VREG));
8787 default:
8788 gcc_unreachable ();
8792 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8793 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8794 into CFI_OPS. */
8796 static void
8797 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8798 rtx *cfi_ops)
8800 machine_mode mode = aarch64_reg_save_mode (regno1);
8801 rtx reg1 = gen_rtx_REG (mode, regno1);
8803 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8805 if (regno2 == INVALID_REGNUM)
8807 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8808 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8809 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8811 else
8813 rtx reg2 = gen_rtx_REG (mode, regno2);
8814 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8815 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8816 reg2, adjustment));
8820 /* Generate and return a store pair instruction of mode MODE to store
8821 register REG1 to MEM1 and register REG2 to MEM2. */
8823 static rtx
8824 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8825 rtx reg2)
8827 switch (mode)
8829 case E_DImode:
8830 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8832 case E_DFmode:
8833 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8835 case E_TFmode:
8836 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8838 case E_V4SImode:
8839 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8841 case E_V16QImode:
8842 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8844 default:
8845 gcc_unreachable ();
8849 /* Generate and return a load pair instruction of mode MODE to load register
8850 REG1 from MEM1 and register REG2 from MEM2. */
8852 static rtx
8853 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8854 rtx mem2)
8856 switch (mode)
8858 case E_DImode:
8859 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8861 case E_DFmode:
8862 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8864 case E_TFmode:
8865 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8867 case E_V4SImode:
8868 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8870 default:
8871 gcc_unreachable ();
8875 /* Return TRUE if return address signing should be enabled for the current
8876 function, otherwise return FALSE. */
8878 bool
8879 aarch64_return_address_signing_enabled (void)
8881 /* This function should only be called after the frame is laid out. */
8882 gcc_assert (cfun->machine->frame.laid_out);
8884 /* Turn return address signing off in any function that uses
8885 __builtin_eh_return. The address passed to __builtin_eh_return
8886 is not signed so either it has to be signed (with original sp)
8887 or the code path that uses it has to avoid authenticating it.
8888 Currently eh return introduces a return-to-anywhere gadget, no
8889 matter what we do here, since it uses ret with a user-provided
8890 address. An ideal fix for that is to use an indirect branch, which
8891 can be protected with BTI j (to some extent). */
8892 if (crtl->calls_eh_return)
8893 return false;
8895 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8896 if its LR is pushed onto stack. */
8897 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
8898 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
8899 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8902 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8903 bool
8904 aarch64_bti_enabled (void)
8906 return (aarch64_enable_bti == 1);
8909 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8910 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8911 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8913 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8914 or LD1D address
8916 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
8917 if the variable isn't already nonnull
8919 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8920 Handle this case using a temporary base register that is suitable for
8921 all offsets in that range. Use ANCHOR_REG as this base register if it
8922 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8924 static inline void
8925 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8926 rtx &anchor_reg, poly_int64 &offset,
8927 rtx &ptrue)
8929 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8931 /* This is the maximum valid offset of the anchor from the base.
8932 Lower values would be valid too. */
8933 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8934 if (!anchor_reg)
8936 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8937 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8938 gen_int_mode (anchor_offset, Pmode)));
8940 base_rtx = anchor_reg;
8941 offset -= anchor_offset;
8943 if (!ptrue)
8945 int pred_reg = cfun->machine->frame.spare_pred_reg;
8946 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8947 CONSTM1_RTX (VNx16BImode));
8948 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
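/* An illustrative instance of the rebasing above: an offset of
   12 * GET_MODE_SIZE (MODE) is outside the [-8, 7] * VL immediate range
   of ST1D/LD1D, so the code creates an anchor at
   BASE_RTX + 16 * GET_MODE_SIZE (MODE) and addresses the slot as
   anchor - 4 * VL, which is in range.  */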
8952 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8953 is saved at BASE + OFFSET. */
8955 static void
8956 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8957 rtx base, poly_int64 offset)
8959 rtx mem = gen_frame_mem (GET_MODE (reg),
8960 plus_constant (Pmode, base, offset));
8961 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8964 /* Emit code to save the callee-saved registers from register number START
8965 to LIMIT to the stack at the location starting at offset START_OFFSET,
8966 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
8967 is true if the hard frame pointer has been set up. */
8969 static void
8970 aarch64_save_callee_saves (poly_int64 start_offset,
8971 unsigned start, unsigned limit, bool skip_wb,
8972 bool hard_fp_valid_p)
8974 rtx_insn *insn;
8975 unsigned regno;
8976 unsigned regno2;
8977 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8979 for (regno = aarch64_next_callee_save (start, limit);
8980 regno <= limit;
8981 regno = aarch64_next_callee_save (regno + 1, limit))
8983 rtx reg, mem;
8984 poly_int64 offset;
8985 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8987 if (skip_wb
8988 && (regno == cfun->machine->frame.wb_push_candidate1
8989 || regno == cfun->machine->frame.wb_push_candidate2))
8990 continue;
8992 if (cfun->machine->reg_is_wrapped_separately[regno])
8993 continue;
8995 machine_mode mode = aarch64_reg_save_mode (regno);
8996 reg = gen_rtx_REG (mode, regno);
8997 offset = start_offset + cfun->machine->frame.reg_offset[regno];
8998 rtx base_rtx = stack_pointer_rtx;
8999 poly_int64 sp_offset = offset;
9001 HOST_WIDE_INT const_offset;
9002 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9003 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9004 offset, ptrue);
9005 else if (GP_REGNUM_P (regno)
9006 && (!offset.is_constant (&const_offset) || const_offset >= 512))
9008 gcc_assert (known_eq (start_offset, 0));
9009 poly_int64 fp_offset
9010 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9011 if (hard_fp_valid_p)
9012 base_rtx = hard_frame_pointer_rtx;
9013 else
9015 if (!anchor_reg)
9017 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9018 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9019 gen_int_mode (fp_offset, Pmode)));
9021 base_rtx = anchor_reg;
9023 offset -= fp_offset;
9025 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9026 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
9028 if (!aarch64_sve_mode_p (mode)
9029 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9030 && !cfun->machine->reg_is_wrapped_separately[regno2]
9031 && known_eq (GET_MODE_SIZE (mode),
9032 cfun->machine->frame.reg_offset[regno2]
9033 - cfun->machine->frame.reg_offset[regno]))
9035 rtx reg2 = gen_rtx_REG (mode, regno2);
9036 rtx mem2;
9038 offset += GET_MODE_SIZE (mode);
9039 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9040 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
9041 reg2));
9043 /* The first part of a frame-related parallel insn is
9044 always assumed to be relevant to the frame
9045 calculations; subsequent parts are only
9046 frame-related if explicitly marked. */
9047 if (aarch64_emit_cfi_for_reg_p (regno2))
9049 if (need_cfa_note_p)
9050 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
9051 sp_offset + GET_MODE_SIZE (mode));
9052 else
9053 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
9056 regno = regno2;
9058 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9060 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
9061 need_cfa_note_p = true;
9063 else if (aarch64_sve_mode_p (mode))
9064 insn = emit_insn (gen_rtx_SET (mem, reg));
9065 else
9066 insn = emit_move_insn (mem, reg);
9068 RTX_FRAME_RELATED_P (insn) = frame_related_p;
9069 if (frame_related_p && need_cfa_note_p)
9070 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
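
/* Illustrative sketch (not part of the original source): the condition used
   above for folding two saves into a single store pair, written out for
   plain integer offsets.  Registers REGNO and REGNO2 can share an STP when
   the second slot sits exactly one register size above the first.  The
   helper name and parameters are hypothetical.  */
static inline int
example_can_use_store_pair (long long offset1, long long offset2,
			    long long reg_size)
{
  /* STP stores the second register at the address immediately following
     the first, so the two slots must be adjacent and in order.  */
  return offset2 - offset1 == reg_size;
}
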
9074 /* Emit code to restore the callee registers from register number START
9075 up to and including LIMIT. Restore from the stack offset START_OFFSET,
9076 skipping any write-back candidates if SKIP_WB is true. Write the
9077 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
9079 static void
9080 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
9081 unsigned limit, bool skip_wb, rtx *cfi_ops)
9083 unsigned regno;
9084 unsigned regno2;
9085 poly_int64 offset;
9086 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9088 for (regno = aarch64_next_callee_save (start, limit);
9089 regno <= limit;
9090 regno = aarch64_next_callee_save (regno + 1, limit))
9092 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9093 if (cfun->machine->reg_is_wrapped_separately[regno])
9094 continue;
9096 rtx reg, mem;
9098 if (skip_wb
9099 && (regno == cfun->machine->frame.wb_pop_candidate1
9100 || regno == cfun->machine->frame.wb_pop_candidate2))
9101 continue;
9103 machine_mode mode = aarch64_reg_save_mode (regno);
9104 reg = gen_rtx_REG (mode, regno);
9105 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9106 rtx base_rtx = stack_pointer_rtx;
9107 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9108 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9109 offset, ptrue);
9110 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9112 if (!aarch64_sve_mode_p (mode)
9113 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9114 && !cfun->machine->reg_is_wrapped_separately[regno2]
9115 && known_eq (GET_MODE_SIZE (mode),
9116 cfun->machine->frame.reg_offset[regno2]
9117 - cfun->machine->frame.reg_offset[regno]))
9119 rtx reg2 = gen_rtx_REG (mode, regno2);
9120 rtx mem2;
9122 offset += GET_MODE_SIZE (mode);
9123 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9124 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9126 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9127 regno = regno2;
9129 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9130 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9131 else if (aarch64_sve_mode_p (mode))
9132 emit_insn (gen_rtx_SET (reg, mem));
9133 else
9134 emit_move_insn (reg, mem);
9135 if (frame_related_p)
9136 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
9140 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9141 of MODE. */
9143 static inline bool
9144 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9146 HOST_WIDE_INT multiple;
9147 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9148 && IN_RANGE (multiple, -8, 7));
9151 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9152 of MODE. */
9154 static inline bool
9155 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9157 HOST_WIDE_INT multiple;
9158 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9159 && IN_RANGE (multiple, -32, 31));
9162 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9163 of MODE. */
9165 static inline bool
9166 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9168 HOST_WIDE_INT multiple;
9169 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9170 && IN_RANGE (multiple, 0, 63));
9173 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9174 of MODE. */
9176 bool
9177 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9179 HOST_WIDE_INT multiple;
9180 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9181 && IN_RANGE (multiple, -64, 63));
9184 /* Return true if OFFSET is a signed 9-bit value. */
9186 bool
9187 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9188 poly_int64 offset)
9190 HOST_WIDE_INT const_offset;
9191 return (offset.is_constant (&const_offset)
9192 && IN_RANGE (const_offset, -256, 255));
9195 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9196 of MODE. */
9198 static inline bool
9199 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9201 HOST_WIDE_INT multiple;
9202 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9203 && IN_RANGE (multiple, -256, 255));
9206 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9207 of MODE. */
9209 static inline bool
9210 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9212 HOST_WIDE_INT multiple;
9213 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9214 && IN_RANGE (multiple, 0, 4095));
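
/* Illustrative sketch (not part of the original source): the offset
   predicates above all follow the same shape, differing only in the
   immediate width and signedness.  A generic version for constant offsets
   might look like this; the name and parameters are hypothetical.  */
static inline int
example_offset_scaled_in_range (long long offset, long long mode_size,
				long long min_multiple, long long max_multiple)
{
  /* The offset must be an exact multiple of the access size ...  */
  if (mode_size == 0 || offset % mode_size != 0)
    return 0;
  long long multiple = offset / mode_size;
  /* ... and the scaled value must fit the immediate field, e.g.
     [-64, 63] for the 7-bit signed LDP/STP form or [0, 4095] for the
     12-bit unsigned LDR/STR form.  */
  return multiple >= min_multiple && multiple <= max_multiple;
}
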
9217 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9219 static sbitmap
9220 aarch64_get_separate_components (void)
9222 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9223 bitmap_clear (components);
9225 /* The registers we need saved to the frame. */
9226 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9227 if (aarch64_register_saved_on_entry (regno))
9229 /* Punt on saves and restores that use ST1D and LD1D. We could
9230 try to be smarter, but it would involve making sure that the
9231 spare predicate register itself is safe to use at the save
9232 and restore points. Also, when a frame pointer is being used,
9233 the slots are often out of reach of ST1D and LD1D anyway. */
9234 machine_mode mode = aarch64_reg_save_mode (regno);
9235 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9236 continue;
9238 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9240 /* If the register is saved in the first SVE save slot, we use
9241 it as a stack probe for -fstack-clash-protection. */
9242 if (flag_stack_clash_protection
9243 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
9244 && known_eq (offset, 0))
9245 continue;
9247 /* Get the offset relative to the register we'll use. */
9248 if (frame_pointer_needed)
9249 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9250 else
9251 offset += crtl->outgoing_args_size;
9253 /* Check that we can access the stack slot of the register with one
9254 direct load with no adjustments needed. */
9255 if (aarch64_sve_mode_p (mode)
9256 ? offset_9bit_signed_scaled_p (mode, offset)
9257 : offset_12bit_unsigned_scaled_p (mode, offset))
9258 bitmap_set_bit (components, regno);
9261 /* Don't mess with the hard frame pointer. */
9262 if (frame_pointer_needed)
9263 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9265 /* If the spare predicate register used by big-endian SVE code
9266 is call-preserved, it must be saved in the main prologue
9267 before any saves that use it. */
9268 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
9269 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
9271 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9272 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9273 /* If registers have been chosen to be stored/restored with
9274 writeback, don't interfere with them to avoid having to output explicit
9275 stack adjustment instructions. */
9276 if (reg2 != INVALID_REGNUM)
9277 bitmap_clear_bit (components, reg2);
9278 if (reg1 != INVALID_REGNUM)
9279 bitmap_clear_bit (components, reg1);
9281 bitmap_clear_bit (components, LR_REGNUM);
9282 bitmap_clear_bit (components, SP_REGNUM);
9284 return components;
9287 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9289 static sbitmap
9290 aarch64_components_for_bb (basic_block bb)
9292 bitmap in = DF_LIVE_IN (bb);
9293 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9294 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9296 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9297 bitmap_clear (components);
9299 /* Clobbered registers don't generate values in any meaningful sense,
9300 since nothing after the clobber can rely on their value. And we can't
9301 say that partially-clobbered registers are unconditionally killed,
9302 because whether they're killed or not depends on the mode of the
9303 value they're holding. Thus partially call-clobbered registers
9304 appear in neither the kill set nor the gen set.
9306 Check manually for any calls that clobber more of a register than the
9307 current function can. */
9308 function_abi_aggregator callee_abis;
9309 rtx_insn *insn;
9310 FOR_BB_INSNS (bb, insn)
9311 if (CALL_P (insn))
9312 callee_abis.note_callee_abi (insn_callee_abi (insn));
9313 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9315 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9316 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9317 if (!fixed_regs[regno]
9318 && !crtl->abi->clobbers_full_reg_p (regno)
9319 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9320 || bitmap_bit_p (in, regno)
9321 || bitmap_bit_p (gen, regno)
9322 || bitmap_bit_p (kill, regno)))
9324 bitmap_set_bit (components, regno);
9326 /* If there is a callee-save at an adjacent offset, add it too
9327 to increase the use of LDP/STP. */
9328 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9329 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9331 if (regno2 <= LAST_SAVED_REGNUM)
9333 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9334 if (regno < regno2
9335 ? known_eq (offset + 8, offset2)
9336 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9337 bitmap_set_bit (components, regno2);
9341 return components;
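
/* Illustrative sketch (not part of the original source): the pairing
   heuristic above, for constant offsets.  A register whose slot is 16-byte
   aligned looks upwards for a partner at offset + 8; otherwise it looks
   downwards for a 16-byte aligned partner at offset - 8.  Either way the
   two slots form one naturally aligned 16-byte LDP/STP region.  The helper
   name is hypothetical.  */
static inline int
example_ldp_partner_slot_p (long long offset, long long offset2)
{
  if (offset % 16 == 0)
    return offset2 == offset + 8;
  else
    return offset2 % 16 == 0 && offset2 + 8 == offset;
}
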
9344 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9345 Nothing to do for aarch64. */
9347 static void
9348 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9352 /* Return the next set bit in BMP from START onwards. Return the total number
9353 of bits in BMP if no set bit is found at or after START. */
9355 static unsigned int
9356 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9358 unsigned int nbits = SBITMAP_SIZE (bmp);
9359 if (start == nbits)
9360 return start;
9362 gcc_assert (start < nbits);
9363 for (unsigned int i = start; i < nbits; i++)
9364 if (bitmap_bit_p (bmp, i))
9365 return i;
9367 return nbits;
9370 /* Do the work for aarch64_emit_prologue_components and
9371 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9372 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9373 for these components or the epilogue sequence. That is, it determines
9374 whether we should emit stores or loads and what kind of CFA notes to attach
9375 to the insns. Otherwise the logic for the two sequences is very
9376 similar. */
9378 static void
9379 aarch64_process_components (sbitmap components, bool prologue_p)
9381 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9382 ? HARD_FRAME_POINTER_REGNUM
9383 : STACK_POINTER_REGNUM);
9385 unsigned last_regno = SBITMAP_SIZE (components);
9386 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9387 rtx_insn *insn = NULL;
9389 while (regno != last_regno)
9391 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9392 machine_mode mode = aarch64_reg_save_mode (regno);
9394 rtx reg = gen_rtx_REG (mode, regno);
9395 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9396 if (frame_pointer_needed)
9397 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9398 else
9399 offset += crtl->outgoing_args_size;
9401 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9402 rtx mem = gen_frame_mem (mode, addr);
9404 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9405 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9406 /* No more registers to handle after REGNO.
9407 Emit a single save/restore and exit. */
9408 if (regno2 == last_regno)
9410 insn = emit_insn (set);
9411 if (frame_related_p)
9413 RTX_FRAME_RELATED_P (insn) = 1;
9414 if (prologue_p)
9415 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9416 else
9417 add_reg_note (insn, REG_CFA_RESTORE, reg);
9419 break;
9422 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9423 /* The next register is not of the same class or its offset is not
9424 mergeable with the current one into a pair. */
9425 if (aarch64_sve_mode_p (mode)
9426 || !satisfies_constraint_Ump (mem)
9427 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9428 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9429 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9430 GET_MODE_SIZE (mode)))
9432 insn = emit_insn (set);
9433 if (frame_related_p)
9435 RTX_FRAME_RELATED_P (insn) = 1;
9436 if (prologue_p)
9437 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9438 else
9439 add_reg_note (insn, REG_CFA_RESTORE, reg);
9442 regno = regno2;
9443 continue;
9446 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9448 /* REGNO2 can be saved/restored in a pair with REGNO. */
9449 rtx reg2 = gen_rtx_REG (mode, regno2);
9450 if (frame_pointer_needed)
9451 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9452 else
9453 offset2 += crtl->outgoing_args_size;
9454 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9455 rtx mem2 = gen_frame_mem (mode, addr2);
9456 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9457 : gen_rtx_SET (reg2, mem2);
9459 if (prologue_p)
9460 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9461 else
9462 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9464 if (frame_related_p || frame_related2_p)
9466 RTX_FRAME_RELATED_P (insn) = 1;
9467 if (prologue_p)
9469 if (frame_related_p)
9470 add_reg_note (insn, REG_CFA_OFFSET, set);
9471 if (frame_related2_p)
9472 add_reg_note (insn, REG_CFA_OFFSET, set2);
9474 else
9476 if (frame_related_p)
9477 add_reg_note (insn, REG_CFA_RESTORE, reg);
9478 if (frame_related2_p)
9479 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9483 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9487 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9489 static void
9490 aarch64_emit_prologue_components (sbitmap components)
9492 aarch64_process_components (components, true);
9495 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9497 static void
9498 aarch64_emit_epilogue_components (sbitmap components)
9500 aarch64_process_components (components, false);
9503 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9505 static void
9506 aarch64_set_handled_components (sbitmap components)
9508 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9509 if (bitmap_bit_p (components, regno))
9510 cfun->machine->reg_is_wrapped_separately[regno] = true;
9513 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9514 determine the probe offset for alloca. */
9516 static HOST_WIDE_INT
9517 aarch64_stack_clash_protection_alloca_probe_range (void)
9519 return STACK_CLASH_CALLER_GUARD;
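
/* Illustrative sketch (not part of the original source): how the ABI buffer
   interacts with the probing thresholds used below, assuming the default
   64KB guard and the 1KB STACK_CLASH_CALLER_GUARD.  The outgoing-argument
   (final) adjustment only needs probing once it exceeds the 1KB buffer;
   any earlier adjustment only needs probing once it exceeds the guard minus
   that buffer.  The names are hypothetical.  */
static inline long long
example_min_probe_threshold (long long guard_size, long long caller_guard,
			     int final_adjustment_p)
{
  return final_adjustment_p ? caller_guard : guard_size - caller_guard;
}
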
9523 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9524 registers. If POLY_SIZE is not large enough to require a probe this function
9525 will only adjust the stack. When allocating the stack space
9526 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9527 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9528 arguments. If we are, then we ensure that any allocation larger than the ABI
9529 defined buffer needs a probe so that the invariant of having a 1KB buffer is
9530 maintained.
9532 We emit barriers after each stack adjustment to prevent optimizations from
9533 breaking the invariant that we never drop the stack more than a page. This
9534 invariant is needed to make it easier to correctly handle asynchronous
9535 events, e.g. if we were to allow the stack to be dropped by more than a page
9536 and then have multiple probes queued up, and we take a signal somewhere in
9537 between, then the signal handler doesn't know the state of the stack and can
9538 make no assumptions about which pages have been probed. */
9540 static void
9541 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9542 poly_int64 poly_size,
9543 bool frame_related_p,
9544 bool final_adjustment_p)
9546 HOST_WIDE_INT guard_size
9547 = 1 << param_stack_clash_protection_guard_size;
9548 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9549 HOST_WIDE_INT min_probe_threshold
9550 = (final_adjustment_p
9551 ? guard_used_by_caller
9552 : guard_size - guard_used_by_caller);
9553 /* When doing the final adjustment for the outgoing arguments, take into
9554 account any unprobed space there is above the current SP. There are
9555 two cases:
9557 - When saving SVE registers below the hard frame pointer, we force
9558 the lowest save to take place in the prologue before doing the final
9559 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9560 This acts as a probe at SP, so there is no unprobed space.
9562 - When there are no SVE register saves, we use the store of the link
9563 register as a probe. We can't assume that LR was saved at position 0
9564 though, so treat any space below it as unprobed. */
9565 if (final_adjustment_p
9566 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9568 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9569 if (known_ge (lr_offset, 0))
9570 min_probe_threshold -= lr_offset.to_constant ();
9571 else
9572 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9575 poly_int64 frame_size = cfun->machine->frame.frame_size;
9577 /* We should always have a positive probe threshold. */
9578 gcc_assert (min_probe_threshold > 0);
9580 if (flag_stack_clash_protection && !final_adjustment_p)
9582 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9583 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9584 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9586 if (known_eq (frame_size, 0))
9588 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9590 else if (known_lt (initial_adjust + sve_callee_adjust,
9591 guard_size - guard_used_by_caller)
9592 && known_lt (final_adjust, guard_used_by_caller))
9594 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9598 /* If SIZE is not large enough to require probing, just adjust the stack and
9599 exit. */
9600 if (known_lt (poly_size, min_probe_threshold)
9601 || !flag_stack_clash_protection)
9603 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9604 return;
9607 HOST_WIDE_INT size;
9608 /* Handle the SVE non-constant case first. */
9609 if (!poly_size.is_constant (&size))
9611 if (dump_file)
9613 fprintf (dump_file, "Stack clash SVE prologue: ");
9614 print_dec (poly_size, dump_file);
9615 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9618 /* First calculate the number of bytes we're actually spilling. */
9619 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9620 poly_size, temp1, temp2, false, true);
9622 rtx_insn *insn = get_last_insn ();
9624 if (frame_related_p)
9626 /* This is done to provide unwinding information for the stack
9627 adjustments we're about to do, however to prevent the optimizers
9628 from removing the R11 move and leaving the CFA note (which would be
9629 very wrong) we tie the old and new stack pointer together.
9630 The tie will expand to nothing but the optimizers will not touch
9631 the instruction. */
9632 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9633 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9634 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9636 /* We want the CFA independent of the stack pointer for the
9637 duration of the loop. */
9638 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9639 RTX_FRAME_RELATED_P (insn) = 1;
9642 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9643 rtx guard_const = gen_int_mode (guard_size, Pmode);
9645 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9646 stack_pointer_rtx, temp1,
9647 probe_const, guard_const));
9649 /* Now reset the CFA register if needed. */
9650 if (frame_related_p)
9652 add_reg_note (insn, REG_CFA_DEF_CFA,
9653 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9654 gen_int_mode (poly_size, Pmode)));
9655 RTX_FRAME_RELATED_P (insn) = 1;
9658 return;
9661 if (dump_file)
9662 fprintf (dump_file,
9663 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9664 " bytes, probing will be required.\n", size);
9666 /* Round size to the nearest multiple of guard_size, and calculate the
9667 residual as the difference between the original size and the rounded
9668 size. */
9669 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9670 HOST_WIDE_INT residual = size - rounded_size;
9672 /* We can handle a small number of allocations/probes inline. Otherwise
9673 punt to a loop. */
9674 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9676 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9678 aarch64_sub_sp (NULL, temp2, guard_size, true);
9679 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9680 guard_used_by_caller));
9681 emit_insn (gen_blockage ());
9683 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9685 else
9687 /* Compute the ending address. */
9688 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9689 temp1, NULL, false, true);
9690 rtx_insn *insn = get_last_insn ();
9692 /* For the initial allocation, we don't have a frame pointer
9693 set up, so we always need CFI notes. If we're doing the
9694 final allocation, then we may have a frame pointer, in which
9695 case it is the CFA, otherwise we need CFI notes.
9697 We can determine which allocation we are doing by looking at
9698 the value of FRAME_RELATED_P since the final allocations are not
9699 frame related. */
9700 if (frame_related_p)
9702 /* We want the CFA independent of the stack pointer for the
9703 duration of the loop. */
9704 add_reg_note (insn, REG_CFA_DEF_CFA,
9705 plus_constant (Pmode, temp1, rounded_size));
9706 RTX_FRAME_RELATED_P (insn) = 1;
9709 /* This allocates and probes the stack. Note that this re-uses some of
9710 the existing Ada stack protection code. However, we are guaranteed not
9711 to enter the non-loop or residual branches of that code.
9713 The non-loop part won't be entered because if our allocation amount
9714 doesn't require a loop, the case above would handle it.
9716 The residual amount won't be entered because TEMP1 is a multiple of
9717 the allocation size. The residual will always be 0. As such, the only
9718 part we are actually using from that code is the loop setup. The
9719 actual probing is done in aarch64_output_probe_stack_range. */
9720 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9721 stack_pointer_rtx, temp1));
9723 /* Now reset the CFA register if needed. */
9724 if (frame_related_p)
9726 add_reg_note (insn, REG_CFA_DEF_CFA,
9727 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9728 RTX_FRAME_RELATED_P (insn) = 1;
9731 emit_insn (gen_blockage ());
9732 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9735 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9736 be probed. This maintains the requirement that each page is probed at
9737 least once. For initial probing we probe only if the allocation is
9738 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9739 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9740 GUARD_SIZE. This ensures that for any allocation large enough to
9741 trigger a probe here we'll emit at least one, and that any allocation
9742 too small for this code to emit anything will already have had its page
9743 probed by the saving of FP/LR, either by this function or by any callees. If
9744 we don't have any callees then we won't have more stack adjustments and so
9745 are still safe. */
9746 if (residual)
9748 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9749 /* If we're doing final adjustments, and we've done any full page
9750 allocations then any residual needs to be probed. */
9751 if (final_adjustment_p && rounded_size != 0)
9752 min_probe_threshold = 0;
9753 /* If doing a small final adjustment, we always probe at offset 0.
9754 This is done to avoid issues when LR is not at position 0 or when
9755 the final adjustment is smaller than the probing offset. */
9756 else if (final_adjustment_p && rounded_size == 0)
9757 residual_probe_offset = 0;
9759 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9760 if (residual >= min_probe_threshold)
9762 if (dump_file)
9763 fprintf (dump_file,
9764 "Stack clash AArch64 prologue residuals: "
9765 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9766 "\n", residual);
9768 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9769 residual_probe_offset));
9770 emit_insn (gen_blockage ());
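
/* Illustrative sketch (not part of the original source): the shape of the
   constant-size probing plan above.  Given a requested SIZE and the guard
   size, the allocation is split into ROUNDED_SIZE worth of full guard-sized
   pages, each of which is allocated and probed, plus a residual that is
   probed only when it reaches the relevant threshold.  All names are
   hypothetical and the emission details are elided.  */
static inline void
example_probe_plan (long long size, long long guard_size,
		    long long min_probe_threshold,
		    long long *rounded_size_out, long long *residual_out,
		    int *residual_needs_probe_out)
{
  long long rounded_size = size - size % guard_size;
  long long residual = size - rounded_size;
  *rounded_size_out = rounded_size;
  *residual_out = residual;
  /* Each full page is probed as it is allocated; the residual is probed
     only when it is at least the minimum probing threshold.  */
  *residual_needs_probe_out = residual >= min_probe_threshold;
}
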
9775 /* Return 1 if the register is used by the epilogue. We need to say the
9776 return register is used, but only after epilogue generation is complete.
9777 Note that in the case of sibcalls, the values "used by the epilogue" are
9778 considered live at the start of the called function.
9780 For SIMD functions we need to return 1 for FP registers that are saved and
9781 restored by a function but are not zero in call_used_regs. If we do not do
9782 this, optimizations may remove the restore of the register. */
9784 int
9785 aarch64_epilogue_uses (int regno)
9787 if (epilogue_completed)
9789 if (regno == LR_REGNUM)
9790 return 1;
9792 return 0;
9795 /* AArch64 stack frames generated by this compiler look like:
9797 +-------------------------------+
9799 | incoming stack arguments |
9801 +-------------------------------+
9802 | | <-- incoming stack pointer (aligned)
9803 | callee-allocated save area |
9804 | for register varargs |
9806 +-------------------------------+
9807 | local variables | <-- frame_pointer_rtx
9809 +-------------------------------+
9810 | padding | \
9811 +-------------------------------+ |
9812 | callee-saved registers | | frame.saved_regs_size
9813 +-------------------------------+ |
9814 | LR' | |
9815 +-------------------------------+ |
9816 | FP' | |
9817 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9818 | SVE vector registers | | \
9819 +-------------------------------+ | | below_hard_fp_saved_regs_size
9820 | SVE predicate registers | / /
9821 +-------------------------------+
9822 | dynamic allocation |
9823 +-------------------------------+
9824 | padding |
9825 +-------------------------------+
9826 | outgoing stack arguments | <-- arg_pointer
9828 +-------------------------------+
9829 | | <-- stack_pointer_rtx (aligned)
9831 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9832 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9833 unchanged.
9835 By default for stack-clash we assume the guard is at least 64KB, but this
9836 value is configurable to either 4KB or 64KB. We also force the guard size to
9837 be the same as the probing interval and both values are kept in sync.
9839 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9840 on the guard size) of stack space without probing.
9842 When probing is needed, we emit a probe at the start of the prologue
9843 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9845 We have to track how much space has been allocated and the only stores
9846 to the stack we track as implicit probes are the FP/LR stores.
9848 For outgoing arguments we probe if the size is larger than 1KB, such that
9849 the ABI specified buffer is maintained for the next callee.
9851 The following registers are reserved during frame layout and should not be
9852 used for any other purpose:
9854 - r11: Used by stack clash protection when SVE is enabled, and also
9855 as an anchor register when saving and restoring registers
9856 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9857 - r14 and r15: Used for speculation tracking.
9858 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9859 - r30(LR), r29(FP): Used by standard frame layout.
9861 These registers must be avoided in frame layout related code unless the
9862 explicit intention is to interact with one of the features listed above. */
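
/* Illustrative sketch (not part of the original source): how a debugger or
   unwinder walks the frame chain implied by the record in the diagram
   above.  The saved FP' sits at the address the hard frame pointer holds,
   with the saved LR' in the following 8-byte slot.  The types and names
   here are hypothetical.  */
struct example_frame_record
{
  unsigned long long saved_fp;	/* Caller's frame record (FP').  */
  unsigned long long saved_lr;	/* Return address (LR').  */
};

static inline unsigned long long
example_walk_one_frame (unsigned long long fp, unsigned long long *lr_out)
{
  const struct example_frame_record *rec
    = (const struct example_frame_record *) fp;
  *lr_out = rec->saved_lr;	/* Return address of this frame.  */
  return rec->saved_fp;		/* Frame record of the caller.  */
}
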
9864 /* Generate the prologue instructions for entry into a function.
9865 Establish the stack frame by decreasing the stack pointer with a
9866 properly calculated size and, if necessary, create a frame record
9867 filled with the values of LR and previous frame pointer. The
9868 current FP is also set up if it is in use. */
9870 void
9871 aarch64_expand_prologue (void)
9873 poly_int64 frame_size = cfun->machine->frame.frame_size;
9874 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9875 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9876 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9877 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9878 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9879 poly_int64 below_hard_fp_saved_regs_size
9880 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9881 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9882 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9883 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
9884 rtx_insn *insn;
9886 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
9888 /* Fold the SVE allocation into the initial allocation.
9889 We don't do this in aarch64_layout_frame to avoid pessimizing
9890 the epilogue code. */
9891 initial_adjust += sve_callee_adjust;
9892 sve_callee_adjust = 0;
9895 /* Sign return address for functions. */
9896 if (aarch64_return_address_signing_enabled ())
9898 switch (aarch64_ra_sign_key)
9900 case AARCH64_KEY_A:
9901 insn = emit_insn (gen_paciasp ());
9902 break;
9903 case AARCH64_KEY_B:
9904 insn = emit_insn (gen_pacibsp ());
9905 break;
9906 default:
9907 gcc_unreachable ();
9909 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9910 RTX_FRAME_RELATED_P (insn) = 1;
9913 /* Push return address to shadow call stack. */
9914 if (cfun->machine->frame.is_scs_enabled)
9915 emit_insn (gen_scs_push ());
9917 if (flag_stack_usage_info)
9918 current_function_static_stack_size = constant_lower_bound (frame_size);
9920 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9922 if (crtl->is_leaf && !cfun->calls_alloca)
9924 if (maybe_gt (frame_size, PROBE_INTERVAL)
9925 && maybe_gt (frame_size, get_stack_check_protect ()))
9926 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9927 (frame_size
9928 - get_stack_check_protect ()));
9930 else if (maybe_gt (frame_size, 0))
9931 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9934 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9935 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9937 /* In theory we should never have both an initial adjustment
9938 and a callee save adjustment. Verify that is the case since the
9939 code below does not handle it for -fstack-clash-protection. */
9940 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9942 /* Will only probe if the initial adjustment is larger than the guard
9943 less the amount of the guard reserved for use by the caller's
9944 outgoing args. */
9945 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9946 true, false);
9948 if (callee_adjust != 0)
9949 aarch64_push_regs (reg1, reg2, callee_adjust);
9951 /* The offset of the frame chain record (if any) from the current SP. */
9952 poly_int64 chain_offset = (initial_adjust + callee_adjust
9953 - cfun->machine->frame.hard_fp_offset);
9954 gcc_assert (known_ge (chain_offset, 0));
9956 /* The offset of the bottom of the save area from the current SP. */
9957 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
9959 if (emit_frame_chain)
9961 if (callee_adjust == 0)
9963 reg1 = R29_REGNUM;
9964 reg2 = R30_REGNUM;
9965 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
9966 false, false);
9968 else
9969 gcc_assert (known_eq (chain_offset, 0));
9970 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9971 stack_pointer_rtx, chain_offset,
9972 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
9973 if (frame_pointer_needed && !frame_size.is_constant ())
9975 /* Variable-sized frames need to describe the save slot
9976 address using DW_CFA_expression rather than DW_CFA_offset.
9977 This means that, without taking further action, the
9978 locations of the registers that we've already saved would
9979 remain based on the stack pointer even after we redefine
9980 the CFA based on the frame pointer. We therefore need new
9981 DW_CFA_expressions to re-express the save slots with addresses
9982 based on the frame pointer. */
9983 rtx_insn *insn = get_last_insn ();
9984 gcc_assert (RTX_FRAME_RELATED_P (insn));
9986 /* Add an explicit CFA definition if this was previously
9987 implicit. */
9988 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9990 rtx src = plus_constant (Pmode, stack_pointer_rtx,
9991 callee_offset);
9992 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9993 gen_rtx_SET (hard_frame_pointer_rtx, src));
9996 /* Change the save slot expressions for the registers that
9997 we've already saved. */
9998 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9999 hard_frame_pointer_rtx, UNITS_PER_WORD);
10000 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
10001 hard_frame_pointer_rtx, 0);
10003 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
10006 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
10007 callee_adjust != 0 || emit_frame_chain,
10008 emit_frame_chain);
10009 if (maybe_ne (sve_callee_adjust, 0))
10011 gcc_assert (!flag_stack_clash_protection
10012 || known_eq (initial_adjust, 0));
10013 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
10014 sve_callee_adjust,
10015 !frame_pointer_needed, false);
10016 saved_regs_offset += sve_callee_adjust;
10018 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
10019 false, emit_frame_chain);
10020 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
10021 callee_adjust != 0 || emit_frame_chain,
10022 emit_frame_chain);
10024 /* We may need to probe the final adjustment if it is larger than the guard
10025 that is assumed by the callee. */
10026 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
10027 !frame_pointer_needed, true);
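
/* Illustrative sketch (not part of the original source): the prologue above
   lowers the stack in up to four steps; taken together they are expected to
   account for the whole frame.  This standalone check mirrors that
   expectation for constant-sized frames; the names are hypothetical.  */
static inline int
example_adjustments_cover_frame (long long initial_adjust,
				 long long callee_adjust,
				 long long sve_callee_adjust,
				 long long final_adjust,
				 long long frame_size)
{
  return initial_adjust + callee_adjust + sve_callee_adjust + final_adjust
	 == frame_size;
}
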
10030 /* Return TRUE if we can use a simple_return insn.
10032 This function checks whether the callee saved stack is empty, which
10033 means no restore actions are needed. The pro_and_epilogue pass will use
10034 this to check whether the shrink-wrapping optimization is feasible. */
10036 bool
10037 aarch64_use_return_insn_p (void)
10039 if (!reload_completed)
10040 return false;
10042 if (crtl->profile)
10043 return false;
10045 return known_eq (cfun->machine->frame.frame_size, 0);
10048 /* Generate the epilogue instructions for returning from a function.
10049 This is almost exactly the reverse of the prolog sequence, except
10050 that we need to insert barriers to avoid scheduling loads that read
10051 from a deallocated stack, and we optimize the unwind records by
10052 emitting them all together if possible. */
10053 void
10054 aarch64_expand_epilogue (bool for_sibcall)
10056 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
10057 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
10058 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
10059 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
10060 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
10061 poly_int64 below_hard_fp_saved_regs_size
10062 = cfun->machine->frame.below_hard_fp_saved_regs_size;
10063 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
10064 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
10065 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
10066 ? R29_REGNUM : R30_REGNUM);
10067 rtx cfi_ops = NULL;
10068 rtx_insn *insn;
10069 /* A stack clash protection prologue may not have left EP0_REGNUM or
10070 EP1_REGNUM in a usable state. The same is true for allocations
10071 with an SVE component, since we then need both temporary registers
10072 for each allocation. For stack clash we are in a usable state if
10073 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10074 HOST_WIDE_INT guard_size
10075 = 1 << param_stack_clash_protection_guard_size;
10076 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10078 /* We can re-use the registers when:
10080 (a) the deallocation amount is the same as the corresponding
10081 allocation amount (which is false if we combine the initial
10082 and SVE callee save allocations in the prologue); and
10084 (b) the allocation amount doesn't need a probe (which is false
10085 if the amount is guard_size - guard_used_by_caller or greater).
10087 In such situations the register should remain live with the correct
10088 value. */
10089 bool can_inherit_p = (initial_adjust.is_constant ()
10090 && final_adjust.is_constant ()
10091 && (!flag_stack_clash_protection
10092 || (known_lt (initial_adjust,
10093 guard_size - guard_used_by_caller)
10094 && known_eq (sve_callee_adjust, 0))));
10096 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
10097 bool need_barrier_p
10098 = maybe_ne (get_frame_size ()
10099 + cfun->machine->frame.saved_varargs_size, 0);
10101 /* Emit a barrier to prevent loads from a deallocated stack. */
10102 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10103 || cfun->calls_alloca
10104 || crtl->calls_eh_return)
10106 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10107 need_barrier_p = false;
10110 /* Restore the stack pointer from the frame pointer if it may not
10111 be the same as the stack pointer. */
10112 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10113 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10114 if (frame_pointer_needed
10115 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
10116 /* If writeback is used when restoring callee-saves, the CFA
10117 is restored on the instruction doing the writeback. */
10118 aarch64_add_offset (Pmode, stack_pointer_rtx,
10119 hard_frame_pointer_rtx,
10120 -callee_offset - below_hard_fp_saved_regs_size,
10121 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
10122 else
10123 /* The case where we need to re-use the register here is very rare, so
10124 avoid the complicated condition and just always emit a move if the
10125 immediate doesn't fit. */
10126 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
10128 /* Restore the vector registers before the predicate registers,
10129 so that we can use P4 as a temporary for big-endian SVE frames. */
10130 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
10131 callee_adjust != 0, &cfi_ops);
10132 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
10133 false, &cfi_ops);
10134 if (maybe_ne (sve_callee_adjust, 0))
10135 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
10137 /* When shadow call stack is enabled, the scs_pop in the epilogue will
10138 restore x30, so we don't need to restore x30 again in the traditional
10139 way. */
10140 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
10141 R0_REGNUM, last_gpr,
10142 callee_adjust != 0, &cfi_ops);
10144 if (need_barrier_p)
10145 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10147 if (callee_adjust != 0)
10148 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10150 /* If we have no register restore information, the CFA must have been
10151 defined in terms of the stack pointer since the end of the prologue. */
10152 gcc_assert (cfi_ops || !frame_pointer_needed);
10154 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10156 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10157 insn = get_last_insn ();
10158 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10159 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10160 RTX_FRAME_RELATED_P (insn) = 1;
10161 cfi_ops = NULL;
10164 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
10165 restrict the emit_move optimization to leaf functions. */
10166 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
10167 (!can_inherit_p || !crtl->is_leaf
10168 || df_regs_ever_live_p (EP0_REGNUM)));
10170 if (cfi_ops)
10172 /* Emit delayed restores and reset the CFA to be SP. */
10173 insn = get_last_insn ();
10174 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10175 REG_NOTES (insn) = cfi_ops;
10176 RTX_FRAME_RELATED_P (insn) = 1;
10179 /* Pop return address from shadow call stack. */
10180 if (cfun->machine->frame.is_scs_enabled)
10182 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10183 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10185 insn = emit_insn (gen_scs_pop ());
10186 add_reg_note (insn, REG_CFA_RESTORE, reg);
10187 RTX_FRAME_RELATED_P (insn) = 1;
10190 /* We prefer to emit the combined return/authenticate instruction RETAA,
10191 however there are two cases in which we must instead emit an explicit
10192 authentication instruction.
10194 1) Sibcalls don't return in a normal way, so if we're about to call one
10195 we must authenticate.
10197 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10198 generating code for !TARGET_ARMV8_3 we can't use it and must
10199 explicitly authenticate.
10201 if (aarch64_return_address_signing_enabled ()
10202 && (for_sibcall || !TARGET_ARMV8_3))
10204 switch (aarch64_ra_sign_key)
10206 case AARCH64_KEY_A:
10207 insn = emit_insn (gen_autiasp ());
10208 break;
10209 case AARCH64_KEY_B:
10210 insn = emit_insn (gen_autibsp ());
10211 break;
10212 default:
10213 gcc_unreachable ();
10215 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10216 RTX_FRAME_RELATED_P (insn) = 1;
10219 /* Stack adjustment for exception handler. */
10220 if (crtl->calls_eh_return && !for_sibcall)
10222 /* We need to unwind the stack by the offset computed by
10223 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
10224 to be SP; letting the CFA move during this adjustment
10225 is just as correct as retaining the CFA from the body
10226 of the function. Therefore, do nothing special. */
10227 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
10230 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10231 if (!for_sibcall)
10232 emit_jump_insn (ret_rtx);
10235 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
10236 normally or return to a previous frame after unwinding.
10238 An EH return uses a single shared return sequence. The epilogue is
10239 exactly like a normal epilogue except that it has an extra input
10240 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10241 that must be applied after the frame has been destroyed. An extra label
10242 is inserted before the epilogue which initializes this register to zero,
10243 and this is the entry point for a normal return.
10245 An actual EH return updates the return address, initializes the stack
10246 adjustment and jumps directly into the epilogue (bypassing the zeroing
10247 of the adjustment). Since the return address is typically saved on the
10248 stack when a function makes a call, the saved LR must be updated outside
10249 the epilogue.
10251 This poses problems as the store is generated well before the epilogue,
10252 so the offset of LR is not known yet. Also optimizations will remove the
10253 store as it appears dead, even after the epilogue is generated (as the
10254 base or offset for loading LR is different in many cases).
10256 To avoid these problems this implementation forces the frame pointer
10257 in eh_return functions so that the location of LR is fixed and known early.
10258 It also marks the store volatile, so no optimization is permitted to
10259 remove the store. */
10260 rtx
10261 aarch64_eh_return_handler_rtx (void)
10263 rtx tmp = gen_frame_mem (Pmode,
10264 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
10266 /* Mark the store volatile, so no optimization is permitted to remove it. */
10267 MEM_VOLATILE_P (tmp) = true;
10268 return tmp;
10271 /* Output code to add DELTA to the first argument, and then jump
10272 to FUNCTION. Used for C++ multiple inheritance. */
10273 static void
10274 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10275 HOST_WIDE_INT delta,
10276 HOST_WIDE_INT vcall_offset,
10277 tree function)
10279 /* The this pointer is always in x0. Note that this differs from
10280 Arm, where the this pointer may be bumped to r1 if r0 is required
10281 to return a pointer to an aggregate. On AArch64 a result value
10282 pointer will be in x8. */
10283 int this_regno = R0_REGNUM;
10284 rtx this_rtx, temp0, temp1, addr, funexp;
10285 rtx_insn *insn;
10286 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10288 if (aarch64_bti_enabled ())
10289 emit_insn (gen_bti_c());
10291 reload_completed = 1;
10292 emit_note (NOTE_INSN_PROLOGUE_END);
10294 this_rtx = gen_rtx_REG (Pmode, this_regno);
10295 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10296 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10298 if (vcall_offset == 0)
10299 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10300 else
10302 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10304 addr = this_rtx;
10305 if (delta != 0)
10307 if (delta >= -256 && delta < 256)
10308 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10309 plus_constant (Pmode, this_rtx, delta));
10310 else
10311 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10312 temp1, temp0, false);
10315 if (Pmode == ptr_mode)
10316 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10317 else
10318 aarch64_emit_move (temp0,
10319 gen_rtx_ZERO_EXTEND (Pmode,
10320 gen_rtx_MEM (ptr_mode, addr)));
10322 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10323 addr = plus_constant (Pmode, temp0, vcall_offset);
10324 else
10326 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10327 Pmode);
10328 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10331 if (Pmode == ptr_mode)
10332 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10333 else
10334 aarch64_emit_move (temp1,
10335 gen_rtx_SIGN_EXTEND (Pmode,
10336 gen_rtx_MEM (ptr_mode, addr)));
10338 emit_insn (gen_add2_insn (this_rtx, temp1));
10341 /* Generate a tail call to the target function. */
10342 if (!TREE_USED (function))
10344 assemble_external (function);
10345 TREE_USED (function) = 1;
10347 funexp = XEXP (DECL_RTL (function), 0);
10348 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10349 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10350 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10351 SIBLING_CALL_P (insn) = 1;
10353 insn = get_insns ();
10354 shorten_branches (insn);
10356 assemble_start_function (thunk, fnname);
10357 final_start_function (insn, file, 1);
10358 final (insn, file, 1);
10359 final_end_function ();
10360 assemble_end_function (thunk, fnname);
10362 /* Stop pretending to be a post-reload pass. */
10363 reload_completed = 0;
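
/* Illustrative sketch (not part of the original source): the pointer
   arithmetic the emitted thunk performs before tail-calling FUNCTION,
   written as plain C.  DELTA is added to the incoming this pointer; if
   VCALL_OFFSET is nonzero, a further adjustment is loaded from the
   object's vtable.  The names are hypothetical.  */
static inline void *
example_thunk_adjust_this (void *this_ptr, long long delta,
			   long long vcall_offset)
{
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      /* The vtable pointer is stored at offset 0 of the adjusted object;
	 the extra adjustment lives at VCALL_OFFSET within the vtable.  */
      char *vtable = *(char **) p;
      p += *(long long *) (vtable + vcall_offset);
    }
  return p;
}
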
10366 static bool
10367 aarch64_tls_referenced_p (rtx x)
10369 if (!TARGET_HAVE_TLS)
10370 return false;
10371 subrtx_iterator::array_type array;
10372 FOR_EACH_SUBRTX (iter, array, x, ALL)
10374 const_rtx x = *iter;
10375 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10376 return true;
10377 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10378 TLS offsets, not real symbol references. */
10379 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10380 iter.skip_subrtxes ();
10382 return false;
10386 static bool
10387 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10389 if (GET_CODE (x) == HIGH)
10390 return true;
10392 /* There's no way to calculate VL-based values using relocations. */
10393 subrtx_iterator::array_type array;
10394 FOR_EACH_SUBRTX (iter, array, x, ALL)
10395 if (GET_CODE (*iter) == CONST_POLY_INT)
10396 return true;
10398 poly_int64 offset;
10399 rtx base = strip_offset_and_salt (x, &offset);
10400 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10402 /* We checked for POLY_INT_CST offsets above. */
10403 if (aarch64_classify_symbol (base, offset.to_constant ())
10404 != SYMBOL_FORCE_TO_MEM)
10405 return true;
10406 else
10407 /* Avoid generating a 64-bit relocation in ILP32; leave
10408 to aarch64_expand_mov_immediate to handle it properly. */
10409 return mode != ptr_mode;
10412 return aarch64_tls_referenced_p (x);
10415 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10416 The expansion for a table switch is quite expensive due to the number
10417 of instructions, the table lookup and the hard-to-predict indirect jump.
10418 When optimizing for speed with -O3 enabled, use the per-core tuning if
10419 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10420 performance. When optimizing for size, use 8 for smallest codesize. */
10422 static unsigned int
10423 aarch64_case_values_threshold (void)
10425 /* Use the specified limit for the number of cases before using jump
10426 tables at higher optimization levels. */
10427 if (optimize > 2
10428 && aarch64_tune_params.max_case_values != 0)
10429 return aarch64_tune_params.max_case_values;
10430 else
10431 return optimize_size ? 8 : 11;
10434 /* Return true if register REGNO is a valid index register.
10435 STRICT_P is true if REG_OK_STRICT is in effect. */
10437 bool
10438 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10440 if (!HARD_REGISTER_NUM_P (regno))
10442 if (!strict_p)
10443 return true;
10445 if (!reg_renumber)
10446 return false;
10448 regno = reg_renumber[regno];
10450 return GP_REGNUM_P (regno);
10453 /* Return true if register REGNO is a valid base register.
10454 STRICT_P is true if REG_OK_STRICT is in effect. */
10456 bool
10457 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10459 if (!HARD_REGISTER_NUM_P (regno))
10461 if (!strict_p)
10462 return true;
10464 if (!reg_renumber)
10465 return false;
10467 regno = reg_renumber[regno];
10470 /* The fake registers will be eliminated to either the stack or
10471 hard frame pointer, both of which are usually valid base registers.
10472 Reload deals with the cases where the eliminated form isn't valid. */
10473 return (GP_REGNUM_P (regno)
10474 || regno == SP_REGNUM
10475 || regno == FRAME_POINTER_REGNUM
10476 || regno == ARG_POINTER_REGNUM);
10479 /* Return true if X is a valid base register.
10480 STRICT_P is true if REG_OK_STRICT is in effect. */
10482 static bool
10483 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10485 if (!strict_p
10486 && SUBREG_P (x)
10487 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10488 x = SUBREG_REG (x);
10490 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10493 /* Return true if address offset is a valid index. If it is, fill in INFO
10494 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10496 static bool
10497 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10498 machine_mode mode, bool strict_p)
10500 enum aarch64_address_type type;
10501 rtx index;
10502 int shift;
10504 /* (reg:P) */
10505 if ((REG_P (x) || SUBREG_P (x))
10506 && GET_MODE (x) == Pmode)
10508 type = ADDRESS_REG_REG;
10509 index = x;
10510 shift = 0;
10512 /* (sign_extend:DI (reg:SI)) */
10513 else if ((GET_CODE (x) == SIGN_EXTEND
10514 || GET_CODE (x) == ZERO_EXTEND)
10515 && GET_MODE (x) == DImode
10516 && GET_MODE (XEXP (x, 0)) == SImode)
10518 type = (GET_CODE (x) == SIGN_EXTEND)
10519 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10520 index = XEXP (x, 0);
10521 shift = 0;
10523 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10524 else if (GET_CODE (x) == MULT
10525 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10526 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10527 && GET_MODE (XEXP (x, 0)) == DImode
10528 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10529 && CONST_INT_P (XEXP (x, 1)))
10531 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10532 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10533 index = XEXP (XEXP (x, 0), 0);
10534 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10536 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10537 else if (GET_CODE (x) == ASHIFT
10538 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10539 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10540 && GET_MODE (XEXP (x, 0)) == DImode
10541 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10542 && CONST_INT_P (XEXP (x, 1)))
10544 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10545 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10546 index = XEXP (XEXP (x, 0), 0);
10547 shift = INTVAL (XEXP (x, 1));
10549 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10550 (const_int 0xffffffff<<shift)) */
10551 else if (GET_CODE (x) == AND
10552 && GET_MODE (x) == DImode
10553 && GET_CODE (XEXP (x, 0)) == MULT
10554 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10555 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10556 && CONST_INT_P (XEXP (x, 1)))
10558 type = ADDRESS_REG_UXTW;
10559 index = XEXP (XEXP (x, 0), 0);
10560 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10561 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10562 shift = -1;
10564 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10565 (const_int 0xffffffff<<shift)) */
10566 else if (GET_CODE (x) == AND
10567 && GET_MODE (x) == DImode
10568 && GET_CODE (XEXP (x, 0)) == ASHIFT
10569 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10570 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10571 && CONST_INT_P (XEXP (x, 1)))
10573 type = ADDRESS_REG_UXTW;
10574 index = XEXP (XEXP (x, 0), 0);
10575 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10576 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10577 shift = -1;
10579 /* (mult:P (reg:P) (const_int scale)) */
10580 else if (GET_CODE (x) == MULT
10581 && GET_MODE (x) == Pmode
10582 && GET_MODE (XEXP (x, 0)) == Pmode
10583 && CONST_INT_P (XEXP (x, 1)))
10585 type = ADDRESS_REG_REG;
10586 index = XEXP (x, 0);
10587 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10589 /* (ashift:P (reg:P) (const_int shift)) */
10590 else if (GET_CODE (x) == ASHIFT
10591 && GET_MODE (x) == Pmode
10592 && GET_MODE (XEXP (x, 0)) == Pmode
10593 && CONST_INT_P (XEXP (x, 1)))
10595 type = ADDRESS_REG_REG;
10596 index = XEXP (x, 0);
10597 shift = INTVAL (XEXP (x, 1));
10599 else
10600 return false;
10602 if (!strict_p
10603 && SUBREG_P (index)
10604 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10605 index = SUBREG_REG (index);
10607 if (aarch64_sve_data_mode_p (mode))
10609 if (type != ADDRESS_REG_REG
10610 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10611 return false;
10613 else
10615 if (shift != 0
10616 && !(IN_RANGE (shift, 1, 3)
10617 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10618 return false;
10621 if (REG_P (index)
10622 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10624 info->type = type;
10625 info->offset = index;
10626 info->shift = shift;
10627 return true;
10630 return false;
10633 /* Return true if MODE is one of the modes for which we
10634 support LDP/STP operations. */
10636 static bool
10637 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10639 return mode == SImode || mode == DImode
10640 || mode == SFmode || mode == DFmode
10641 || mode == SDmode || mode == DDmode
10642 || (aarch64_vector_mode_supported_p (mode)
10643 && (known_eq (GET_MODE_SIZE (mode), 8)
10644 || (known_eq (GET_MODE_SIZE (mode), 16)
10645 && (aarch64_tune_params.extra_tuning_flags
10646 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10649 /* Return true if REGNO is a virtual pointer register, or an eliminable
10650 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10651 include stack_pointer or hard_frame_pointer. */
10652 static bool
10653 virt_or_elim_regno_p (unsigned regno)
10655 return ((regno >= FIRST_VIRTUAL_REGISTER
10656 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10657 || regno == FRAME_POINTER_REGNUM
10658 || regno == ARG_POINTER_REGNUM);
10661 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10662 If it is, fill in INFO appropriately. STRICT_P is true if
10663 REG_OK_STRICT is in effect. */
10665 bool
10666 aarch64_classify_address (struct aarch64_address_info *info,
10667 rtx x, machine_mode mode, bool strict_p,
10668 aarch64_addr_query_type type)
10670 enum rtx_code code = GET_CODE (x);
10671 rtx op0, op1;
10672 poly_int64 offset;
10674 HOST_WIDE_INT const_size;
10676 /* Whether a vector mode is partial doesn't affect address legitimacy.
10677 Partial vectors like VNx8QImode allow the same indexed addressing
10678 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10679 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10680 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10681 vec_flags &= ~VEC_PARTIAL;
10683 /* On BE, we use load/store pair for all large int mode load/stores.
10684 TI/TF/TDmode may also use a load/store pair. */
10685 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10686 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10687 || type == ADDR_QUERY_LDP_STP_N
10688 || mode == TImode
10689 || mode == TFmode
10690 || mode == TDmode
10691 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10692 && advsimd_struct_p));
10693 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10694 corresponds to the actual size of the memory being loaded/stored, and
10695 the addressing mode is validated using a mode of half that size. */
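/* For example (illustrative): an LDP/STP of two X registers is queried with
   a 16-byte mode, which is checked below as a DFmode access so that the
   offset is validated against the range of one 8-byte element of the pair.  */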
10696 if (type == ADDR_QUERY_LDP_STP_N)
10698 if (known_eq (GET_MODE_SIZE (mode), 16))
10699 mode = DFmode;
10700 else if (known_eq (GET_MODE_SIZE (mode), 8))
10701 mode = SFmode;
10702 else
10703 return false;
10706 bool allow_reg_index_p = (!load_store_pair_p
10707 && ((vec_flags == 0
10708 && known_lt (GET_MODE_SIZE (mode), 16))
10709 || vec_flags == VEC_ADVSIMD
10710 || vec_flags & VEC_SVE_DATA));
10712 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10713 The latter is not valid for SVE predicates, and that's rejected through
10714 allow_reg_index_p above. */
10715 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10716 && (code != REG && code != PLUS))
10717 return false;
10719 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10720 REG addressing. */
10721 if (advsimd_struct_p
10722 && TARGET_SIMD
10723 && !BYTES_BIG_ENDIAN
10724 && (code != POST_INC && code != REG))
10725 return false;
10727 gcc_checking_assert (GET_MODE (x) == VOIDmode
10728 || SCALAR_INT_MODE_P (GET_MODE (x)));
10730 switch (code)
10732 case REG:
10733 case SUBREG:
10734 info->type = ADDRESS_REG_IMM;
10735 info->base = x;
10736 info->offset = const0_rtx;
10737 info->const_offset = 0;
10738 return aarch64_base_register_rtx_p (x, strict_p);
10740 case PLUS:
10741 op0 = XEXP (x, 0);
10742 op1 = XEXP (x, 1);
10744 if (! strict_p
10745 && REG_P (op0)
10746 && virt_or_elim_regno_p (REGNO (op0))
10747 && poly_int_rtx_p (op1, &offset))
10749 info->type = ADDRESS_REG_IMM;
10750 info->base = op0;
10751 info->offset = op1;
10752 info->const_offset = offset;
10754 return true;
10757 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10758 && aarch64_base_register_rtx_p (op0, strict_p)
10759 && poly_int_rtx_p (op1, &offset))
10761 info->type = ADDRESS_REG_IMM;
10762 info->base = op0;
10763 info->offset = op1;
10764 info->const_offset = offset;
10766 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10767 registers and individual Q registers. The available
10768 address modes are:
10769 X,X: 7-bit signed scaled offset
10770 Q: 9-bit signed offset
10771 We conservatively require an offset representable in either mode.
10772 When performing the check for pairs of X registers, i.e. LDP/STP,
10773 pass down DImode since that is the natural size of the LDP/STP
10774 instruction memory accesses. */
10775 if (mode == TImode || mode == TFmode || mode == TDmode)
10776 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10777 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10778 || offset_12bit_unsigned_scaled_p (mode, offset)));
10780 if (mode == V8DImode)
10781 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10782 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10784 /* A 7-bit offset check because OImode will emit an ldp/stp
10785 instruction (only !TARGET_SIMD or big endian will get here).
10786 For ldp/stp instructions, the offset is scaled by the size of a
10787 single element of the pair. */
10788 if (aarch64_advsimd_partial_struct_mode_p (mode)
10789 && known_eq (GET_MODE_SIZE (mode), 16))
10790 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10791 if (aarch64_advsimd_full_struct_mode_p (mode)
10792 && known_eq (GET_MODE_SIZE (mode), 32))
10793 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10795 /* Three 9/12-bit offset checks because CImode will emit three
10796 ldr/str instructions (only !TARGET_SIMD or big endian will
10797 get here). */
10798 if (aarch64_advsimd_partial_struct_mode_p (mode)
10799 && known_eq (GET_MODE_SIZE (mode), 24))
10800 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10801 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10802 offset + 16)
10803 || offset_12bit_unsigned_scaled_p (DImode,
10804 offset + 16)));
10805 if (aarch64_advsimd_full_struct_mode_p (mode)
10806 && known_eq (GET_MODE_SIZE (mode), 48))
10807 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10808 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10809 offset + 32)
10810 || offset_12bit_unsigned_scaled_p (TImode,
10811 offset + 32)));
10813 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10814 instructions (only big endian will get here). */
10815 if (aarch64_advsimd_partial_struct_mode_p (mode)
10816 && known_eq (GET_MODE_SIZE (mode), 32))
10817 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10818 && aarch64_offset_7bit_signed_scaled_p (DImode,
10819 offset + 16));
10820 if (aarch64_advsimd_full_struct_mode_p (mode)
10821 && known_eq (GET_MODE_SIZE (mode), 64))
10822 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10823 && aarch64_offset_7bit_signed_scaled_p (TImode,
10824 offset + 32));
10826 /* Make "m" use the LD1 offset range for SVE data modes, so
10827 that pre-RTL optimizers like ivopts work to that range
10828 instead of the wider LDR/STR range. */
10829 if (vec_flags == VEC_SVE_DATA)
10830 return (type == ADDR_QUERY_M
10831 ? offset_4bit_signed_scaled_p (mode, offset)
10832 : offset_9bit_signed_scaled_p (mode, offset));
10834 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10836 poly_int64 end_offset = (offset
10837 + GET_MODE_SIZE (mode)
10838 - BYTES_PER_SVE_VECTOR);
10839 return (type == ADDR_QUERY_M
10840 ? offset_4bit_signed_scaled_p (mode, offset)
10841 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10842 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10843 end_offset)));
10846 if (vec_flags == VEC_SVE_PRED)
10847 return offset_9bit_signed_scaled_p (mode, offset);
10849 if (load_store_pair_p)
10850 return ((known_eq (GET_MODE_SIZE (mode), 4)
10851 || known_eq (GET_MODE_SIZE (mode), 8)
10852 || known_eq (GET_MODE_SIZE (mode), 16))
10853 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10854 else
10855 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10856 || offset_12bit_unsigned_scaled_p (mode, offset));
10859 if (allow_reg_index_p)
10861 /* Look for base + (scaled/extended) index register. */
10862 if (aarch64_base_register_rtx_p (op0, strict_p)
10863 && aarch64_classify_index (info, op1, mode, strict_p))
10865 info->base = op0;
10866 return true;
10868 if (aarch64_base_register_rtx_p (op1, strict_p)
10869 && aarch64_classify_index (info, op0, mode, strict_p))
10871 info->base = op1;
10872 return true;
10876 return false;
10878 case POST_INC:
10879 case POST_DEC:
10880 case PRE_INC:
10881 case PRE_DEC:
10882 info->type = ADDRESS_REG_WB;
10883 info->base = XEXP (x, 0);
10884 info->offset = NULL_RTX;
10885 return aarch64_base_register_rtx_p (info->base, strict_p);
10887 case POST_MODIFY:
10888 case PRE_MODIFY:
10889 info->type = ADDRESS_REG_WB;
10890 info->base = XEXP (x, 0);
10891 if (GET_CODE (XEXP (x, 1)) == PLUS
10892 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10893 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10894 && aarch64_base_register_rtx_p (info->base, strict_p))
10896 info->offset = XEXP (XEXP (x, 1), 1);
10897 info->const_offset = offset;
10899 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10900 registers and individual Q registers. The available
10901 address modes are:
10902 X,X: 7-bit signed scaled offset
10903 Q: 9-bit signed offset
10904 We conservatively require an offset representable in either mode. */
10906 if (mode == TImode || mode == TFmode || mode == TDmode)
10907 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10908 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10910 if (load_store_pair_p)
10911 return ((known_eq (GET_MODE_SIZE (mode), 4)
10912 || known_eq (GET_MODE_SIZE (mode), 8)
10913 || known_eq (GET_MODE_SIZE (mode), 16))
10914 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10915 else
10916 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10918 return false;
10920 case CONST:
10921 case SYMBOL_REF:
10922 case LABEL_REF:
10923 /* load literal: pc-relative constant pool entry. Only supported
10924 for SI mode or larger. */
10925 info->type = ADDRESS_SYMBOLIC;
10927 if (!load_store_pair_p
10928 && GET_MODE_SIZE (mode).is_constant (&const_size)
10929 && const_size >= 4)
10931 poly_int64 offset;
10932 rtx sym = strip_offset_and_salt (x, &offset);
10933 return ((LABEL_REF_P (sym)
10934 || (SYMBOL_REF_P (sym)
10935 && CONSTANT_POOL_ADDRESS_P (sym)
10936 && aarch64_pcrelative_literal_loads)));
10938 return false;
10940 case LO_SUM:
10941 info->type = ADDRESS_LO_SUM;
10942 info->base = XEXP (x, 0);
10943 info->offset = XEXP (x, 1);
10944 if (allow_reg_index_p
10945 && aarch64_base_register_rtx_p (info->base, strict_p))
10947 poly_int64 offset;
10948 HOST_WIDE_INT const_offset;
10949 rtx sym = strip_offset_and_salt (info->offset, &offset);
10950 if (SYMBOL_REF_P (sym)
10951 && offset.is_constant (&const_offset)
10952 && (aarch64_classify_symbol (sym, const_offset)
10953 == SYMBOL_SMALL_ABSOLUTE))
10955 /* The symbol and offset must be aligned to the access size. */
10956 unsigned int align;
10958 if (CONSTANT_POOL_ADDRESS_P (sym))
10959 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10960 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10962 tree exp = SYMBOL_REF_DECL (sym);
10963 align = TYPE_ALIGN (TREE_TYPE (exp));
10964 align = aarch64_constant_alignment (exp, align);
10966 else if (SYMBOL_REF_DECL (sym))
10967 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10968 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10969 && SYMBOL_REF_BLOCK (sym) != NULL)
10970 align = SYMBOL_REF_BLOCK (sym)->alignment;
10971 else
10972 align = BITS_PER_UNIT;
10974 poly_int64 ref_size = GET_MODE_SIZE (mode);
10975 if (known_eq (ref_size, 0))
10976 ref_size = GET_MODE_SIZE (DImode);
10978 return (multiple_p (const_offset, ref_size)
10979 && multiple_p (align / BITS_PER_UNIT, ref_size));
10982 return false;
10984 default:
10985 return false;
10989 /* Return true if the address X is valid for a PRFM instruction.
10990 STRICT_P is true if we should do strict checking with
10991 aarch64_classify_address. */
10993 bool
10994 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10996 struct aarch64_address_info addr;
10998 /* PRFM accepts the same addresses as DImode... */
10999 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11000 if (!res)
11001 return false;
11003 /* ... except writeback forms. */
11004 return addr.type != ADDRESS_REG_WB;
11007 bool
11008 aarch64_symbolic_address_p (rtx x)
11010 poly_int64 offset;
11011 x = strip_offset_and_salt (x, &offset);
11012 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11015 /* Classify the base of symbolic expression X. */
11017 enum aarch64_symbol_type
11018 aarch64_classify_symbolic_expression (rtx x)
11020 rtx offset;
11022 split_const (x, &x, &offset);
11023 return aarch64_classify_symbol (x, INTVAL (offset));
11027 /* Return TRUE if X is a legitimate address for accessing memory in
11028 mode MODE. */
11029 static bool
11030 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
11032 struct aarch64_address_info addr;
11034 return aarch64_classify_address (&addr, x, mode, strict_p);
11037 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11038 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11039 bool
11040 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11041 aarch64_addr_query_type type)
11043 struct aarch64_address_info addr;
11045 return aarch64_classify_address (&addr, x, mode, strict_p, type);
11048 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11050 static bool
11051 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11052 poly_int64 orig_offset,
11053 machine_mode mode)
11055 HOST_WIDE_INT size;
11056 if (GET_MODE_SIZE (mode).is_constant (&size))
11058 HOST_WIDE_INT const_offset, second_offset;
11060 /* A general SVE offset is A * VQ + B. Remove the A component from
11061 coefficient 0 in order to get the constant B. */
11062 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11064 /* Split an out-of-range address displacement into a base and
11065 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
11066 range otherwise to increase opportunities for sharing the base
11067 address of different sizes. Unaligned accesses use the signed
11068 9-bit range; TImode/TFmode/TDmode use the intersection of signed
11069 scaled 7-bit and signed 9-bit offset. */
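/* Worked example (illustrative): for an aligned 4-byte access at byte
   offset 0x10010, second_offset is 0x10010 & 0x3ffc == 0x10, so the
   address is split as 0x10000 + 0x10 and the 0x10000 part can be shared
   with neighbouring accesses off the same base.  */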
11070 if (mode == TImode || mode == TFmode || mode == TDmode)
11071 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11072 else if ((const_offset & (size - 1)) != 0)
11073 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11074 else
11075 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11077 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11078 return false;
11080 /* Split the offset into second_offset and the rest. */
11081 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11082 *offset2 = gen_int_mode (second_offset, Pmode);
11083 return true;
11085 else
11087 /* Get the mode we should use as the basis of the range. For structure
11088 modes this is the mode of one vector. */
11089 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11090 machine_mode step_mode
11091 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11093 /* Get the "mul vl" multiplier we'd like to use. */
11094 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11095 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11096 if (vec_flags & VEC_SVE_DATA)
11097 /* LDR supports a 9-bit range, but the move patterns for
11098 structure modes require all vectors to be in range of the
11099 same base. The simplest way of accommodating that while still
11100 promoting reuse of anchor points between different modes is
11101 to use an 8-bit range unconditionally. */
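/* Worked example (illustrative): vnum == 200 is rewritten as
   ((200 + 128) & 255) - 128 == -56, i.e. the offset is split into an
   anchor of 256 vectors plus an in-range offset of -56 vectors.  */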
11102 vnum = ((vnum + 128) & 255) - 128;
11103 else
11104 /* Predicates are only handled singly, so we might as well use
11105 the full range. */
11106 vnum = ((vnum + 256) & 511) - 256;
11107 if (vnum == 0)
11108 return false;
11110 /* Convert the "mul vl" multiplier into a byte offset. */
11111 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11112 if (known_eq (second_offset, orig_offset))
11113 return false;
11115 /* Split the offset into second_offset and the rest. */
11116 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11117 *offset2 = gen_int_mode (second_offset, Pmode);
11118 return true;
11122 /* Return the binary representation of floating point constant VALUE in INTVAL.
11123 If the value cannot be converted, return false without setting INTVAL.
11124 The conversion is done in the given MODE. */
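/* For example (illustrative): DFmode 1.0 is returned as 0x3ff0000000000000
   and SFmode 1.0 as 0x3f800000.  */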
11125 bool
11126 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11129 /* We make a general exception for 0. */
11130 if (aarch64_float_const_zero_rtx_p (value))
11132 *intval = 0;
11133 return true;
11136 scalar_float_mode mode;
11137 if (!CONST_DOUBLE_P (value)
11138 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11139 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11140 /* Only support up to DF mode. */
11141 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11142 return false;
11144 unsigned HOST_WIDE_INT ival = 0;
11146 long res[2];
11147 real_to_target (res,
11148 CONST_DOUBLE_REAL_VALUE (value),
11149 REAL_MODE_FORMAT (mode));
11151 if (mode == DFmode || mode == DDmode)
11153 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11154 ival = zext_hwi (res[order], 32);
11155 ival |= (zext_hwi (res[1 - order], 32) << 32);
11157 else
11158 ival = zext_hwi (res[0], 32);
11160 *intval = ival;
11161 return true;
11164 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11165 single MOV(+MOVK) followed by an FMOV. */
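/* For example (illustrative): DFmode 1.0e6 has the bit pattern
   0x412e848000000000, which the integer move code can build with a single
   MOV+MOVK pair, so it is accepted.  */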
11166 bool
11167 aarch64_float_const_rtx_p (rtx x)
11169 machine_mode mode = GET_MODE (x);
11170 if (mode == VOIDmode)
11171 return false;
11173 /* Determine whether it's cheaper to write float constants as
11174 mov/movk pairs rather than as ldr/adrp pairs. */
11175 unsigned HOST_WIDE_INT ival;
11177 if (CONST_DOUBLE_P (x)
11178 && SCALAR_FLOAT_MODE_P (mode)
11179 && aarch64_reinterpret_float_as_int (x, &ival))
11181 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
11182 int num_instr = aarch64_internal_mov_immediate
11183 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11184 return num_instr < 3;
11187 return false;
11190 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11191 Floating Point). */
11192 bool
11193 aarch64_float_const_zero_rtx_p (rtx x)
11195 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11196 zr as our callers expect, so no need to check the actual
11197 value if X is of Decimal Floating Point type. */
11198 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11199 return false;
11201 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11202 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11203 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11206 /* Return TRUE if rtx X is immediate constant that fits in a single
11207 MOVI immediate operation. */
11208 bool
11209 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11211 if (!TARGET_SIMD)
11212 return false;
11214 machine_mode vmode;
11215 scalar_int_mode imode;
11216 unsigned HOST_WIDE_INT ival;
11218 if (CONST_DOUBLE_P (x)
11219 && SCALAR_FLOAT_MODE_P (mode))
11221 if (!aarch64_reinterpret_float_as_int (x, &ival))
11222 return false;
11224 /* We make a general exception for 0. */
11225 if (aarch64_float_const_zero_rtx_p (x))
11226 return true;
11228 imode = int_mode_for_mode (mode).require ();
11230 else if (CONST_INT_P (x)
11231 && is_a <scalar_int_mode> (mode, &imode))
11232 ival = INTVAL (x);
11233 else
11234 return false;
11236 /* Use a 64-bit vector mode for everything except DI/DF/DD modes, where we
11237 use a 128-bit vector mode. */
11238 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11240 vmode = aarch64_simd_container_mode (imode, width);
11241 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11243 return aarch64_simd_valid_immediate (v_op, NULL);
11247 /* Return the fixed registers used for condition codes. */
11249 static bool
11250 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11252 *p1 = CC_REGNUM;
11253 *p2 = INVALID_REGNUM;
11254 return true;
11257 /* This function is used by the call expanders of the machine description.
11258 RESULT is the register in which the result is returned. It's NULL for
11259 "call" and "sibcall".
11260 MEM is the location of the function call.
11261 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11262 SIBCALL indicates whether this function call is a normal call or a
11263 sibling call; a different call pattern is generated accordingly. */
11265 void
11266 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11268 rtx call, callee, tmp;
11269 rtvec vec;
11270 machine_mode mode;
11272 gcc_assert (MEM_P (mem));
11273 callee = XEXP (mem, 0);
11274 mode = GET_MODE (callee);
11275 gcc_assert (mode == Pmode);
11277 /* Decide if we should generate indirect calls by loading the
11278 address of the callee into a register before performing
11279 the branch-and-link. */
11280 if (SYMBOL_REF_P (callee)
11281 ? (aarch64_is_long_call_p (callee)
11282 || aarch64_is_noplt_call_p (callee))
11283 : !REG_P (callee))
11284 XEXP (mem, 0) = force_reg (mode, callee);
11286 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11288 if (result != NULL_RTX)
11289 call = gen_rtx_SET (result, call);
11291 if (sibcall)
11292 tmp = ret_rtx;
11293 else
11294 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11296 gcc_assert (CONST_INT_P (callee_abi));
11297 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11298 UNSPEC_CALLEE_ABI);
11300 vec = gen_rtvec (3, call, callee_abi, tmp);
11301 call = gen_rtx_PARALLEL (VOIDmode, vec);
11303 aarch64_emit_call_insn (call);
11306 /* Emit call insn with PAT and do aarch64-specific handling. */
11308 void
11309 aarch64_emit_call_insn (rtx pat)
11311 rtx insn = emit_call_insn (pat);
11313 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11314 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11315 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11318 machine_mode
11319 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11321 machine_mode mode_x = GET_MODE (x);
11322 rtx_code code_x = GET_CODE (x);
11324 /* All floating point compares return CCFP if it is an equality
11325 comparison, and CCFPE otherwise. */
11326 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11328 switch (code)
11330 case EQ:
11331 case NE:
11332 case UNORDERED:
11333 case ORDERED:
11334 case UNLT:
11335 case UNLE:
11336 case UNGT:
11337 case UNGE:
11338 case UNEQ:
11339 return CCFPmode;
11341 case LT:
11342 case LE:
11343 case GT:
11344 case GE:
11345 case LTGT:
11346 return CCFPEmode;
11348 default:
11349 gcc_unreachable ();
11353 /* Equality comparisons of short modes against zero can be performed
11354 using the TST instruction with the appropriate bitmask. */
11355 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11356 && (code == EQ || code == NE)
11357 && (mode_x == HImode || mode_x == QImode))
11358 return CC_Zmode;
11360 /* Similarly, comparisons of zero_extends from shorter modes can
11361 be performed using an ANDS with an immediate mask. */
11362 if (y == const0_rtx && code_x == ZERO_EXTEND
11363 && (mode_x == SImode || mode_x == DImode)
11364 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11365 && (code == EQ || code == NE))
11366 return CC_Zmode;
11368 /* Zero extracts support equality comparisons. */
11369 if ((mode_x == SImode || mode_x == DImode)
11370 && y == const0_rtx
11371 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11372 && CONST_INT_P (XEXP (x, 2)))
11373 && (code == EQ || code == NE))
11374 return CC_Zmode;
11376 /* ANDS/BICS/TST support equality and all signed comparisons. */
11377 if ((mode_x == SImode || mode_x == DImode)
11378 && y == const0_rtx
11379 && (code_x == AND)
11380 && (code == EQ || code == NE || code == LT || code == GE
11381 || code == GT || code == LE))
11382 return CC_NZVmode;
11384 /* ADDS/SUBS correctly set N and Z flags. */
11385 if ((mode_x == SImode || mode_x == DImode)
11386 && y == const0_rtx
11387 && (code == EQ || code == NE || code == LT || code == GE)
11388 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11389 return CC_NZmode;
11391 /* A compare with a shifted operand. Because of canonicalization,
11392 the comparison will have to be swapped when we emit the assembly
11393 code. */
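/* For example (illustrative): (compare (ashift:DI (reg x1) (const_int 3))
   (reg x2)) is emitted roughly as "cmp x2, x1, lsl 3", so a GE test must
   be output as LE; see the CC_SWP case in aarch64_get_condition_code_1.  */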
11394 if ((mode_x == SImode || mode_x == DImode)
11395 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11396 && (code_x == ASHIFT || code_x == ASHIFTRT
11397 || code_x == LSHIFTRT
11398 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11399 return CC_SWPmode;
11401 /* Similarly for a negated operand, but we can only do this for
11402 equalities. */
11403 if ((mode_x == SImode || mode_x == DImode)
11404 && (REG_P (y) || SUBREG_P (y))
11405 && (code == EQ || code == NE)
11406 && code_x == NEG)
11407 return CC_Zmode;
11409 /* A test for unsigned overflow from an addition. */
11410 if ((mode_x == DImode || mode_x == TImode)
11411 && (code == LTU || code == GEU)
11412 && code_x == PLUS
11413 && rtx_equal_p (XEXP (x, 0), y))
11414 return CC_Cmode;
11416 /* A test for unsigned overflow from an add with carry. */
11417 if ((mode_x == DImode || mode_x == TImode)
11418 && (code == LTU || code == GEU)
11419 && code_x == PLUS
11420 && CONST_SCALAR_INT_P (y)
11421 && (rtx_mode_t (y, mode_x)
11422 == (wi::shwi (1, mode_x)
11423 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11424 return CC_ADCmode;
11426 /* A test for signed overflow. */
11427 if ((mode_x == DImode || mode_x == TImode)
11428 && code == NE
11429 && code_x == PLUS
11430 && GET_CODE (y) == SIGN_EXTEND)
11431 return CC_Vmode;
11433 /* For everything else, return CCmode. */
11434 return CCmode;
11437 static int
11438 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11441 aarch64_get_condition_code (rtx x)
11443 machine_mode mode = GET_MODE (XEXP (x, 0));
11444 enum rtx_code comp_code = GET_CODE (x);
11446 if (GET_MODE_CLASS (mode) != MODE_CC)
11447 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11448 return aarch64_get_condition_code_1 (mode, comp_code);
11451 static int
11452 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11454 switch (mode)
11456 case E_CCFPmode:
11457 case E_CCFPEmode:
11458 switch (comp_code)
11460 case GE: return AARCH64_GE;
11461 case GT: return AARCH64_GT;
11462 case LE: return AARCH64_LS;
11463 case LT: return AARCH64_MI;
11464 case NE: return AARCH64_NE;
11465 case EQ: return AARCH64_EQ;
11466 case ORDERED: return AARCH64_VC;
11467 case UNORDERED: return AARCH64_VS;
11468 case UNLT: return AARCH64_LT;
11469 case UNLE: return AARCH64_LE;
11470 case UNGT: return AARCH64_HI;
11471 case UNGE: return AARCH64_PL;
11472 default: return -1;
11474 break;
11476 case E_CCmode:
11477 switch (comp_code)
11479 case NE: return AARCH64_NE;
11480 case EQ: return AARCH64_EQ;
11481 case GE: return AARCH64_GE;
11482 case GT: return AARCH64_GT;
11483 case LE: return AARCH64_LE;
11484 case LT: return AARCH64_LT;
11485 case GEU: return AARCH64_CS;
11486 case GTU: return AARCH64_HI;
11487 case LEU: return AARCH64_LS;
11488 case LTU: return AARCH64_CC;
11489 default: return -1;
11491 break;
11493 case E_CC_SWPmode:
11494 switch (comp_code)
11496 case NE: return AARCH64_NE;
11497 case EQ: return AARCH64_EQ;
11498 case GE: return AARCH64_LE;
11499 case GT: return AARCH64_LT;
11500 case LE: return AARCH64_GE;
11501 case LT: return AARCH64_GT;
11502 case GEU: return AARCH64_LS;
11503 case GTU: return AARCH64_CC;
11504 case LEU: return AARCH64_CS;
11505 case LTU: return AARCH64_HI;
11506 default: return -1;
11508 break;
11510 case E_CC_NZCmode:
11511 switch (comp_code)
11513 case NE: return AARCH64_NE; /* = any */
11514 case EQ: return AARCH64_EQ; /* = none */
11515 case GE: return AARCH64_PL; /* = nfrst */
11516 case LT: return AARCH64_MI; /* = first */
11517 case GEU: return AARCH64_CS; /* = nlast */
11518 case GTU: return AARCH64_HI; /* = pmore */
11519 case LEU: return AARCH64_LS; /* = plast */
11520 case LTU: return AARCH64_CC; /* = last */
11521 default: return -1;
11523 break;
11525 case E_CC_NZVmode:
11526 switch (comp_code)
11528 case NE: return AARCH64_NE;
11529 case EQ: return AARCH64_EQ;
11530 case GE: return AARCH64_PL;
11531 case LT: return AARCH64_MI;
11532 case GT: return AARCH64_GT;
11533 case LE: return AARCH64_LE;
11534 default: return -1;
11536 break;
11538 case E_CC_NZmode:
11539 switch (comp_code)
11541 case NE: return AARCH64_NE;
11542 case EQ: return AARCH64_EQ;
11543 case GE: return AARCH64_PL;
11544 case LT: return AARCH64_MI;
11545 default: return -1;
11547 break;
11549 case E_CC_Zmode:
11550 switch (comp_code)
11552 case NE: return AARCH64_NE;
11553 case EQ: return AARCH64_EQ;
11554 default: return -1;
11556 break;
11558 case E_CC_Cmode:
11559 switch (comp_code)
11561 case LTU: return AARCH64_CS;
11562 case GEU: return AARCH64_CC;
11563 default: return -1;
11565 break;
11567 case E_CC_ADCmode:
11568 switch (comp_code)
11570 case GEU: return AARCH64_CS;
11571 case LTU: return AARCH64_CC;
11572 default: return -1;
11574 break;
11576 case E_CC_Vmode:
11577 switch (comp_code)
11579 case NE: return AARCH64_VS;
11580 case EQ: return AARCH64_VC;
11581 default: return -1;
11583 break;
11585 default:
11586 return -1;
11589 return -1;
11592 bool
11593 aarch64_const_vec_all_same_in_range_p (rtx x,
11594 HOST_WIDE_INT minval,
11595 HOST_WIDE_INT maxval)
11597 rtx elt;
11598 return (const_vec_duplicate_p (x, &elt)
11599 && CONST_INT_P (elt)
11600 && IN_RANGE (INTVAL (elt), minval, maxval));
11603 bool
11604 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11606 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11609 /* Return true if VEC is a constant in which every element is in the range
11610 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11612 static bool
11613 aarch64_const_vec_all_in_range_p (rtx vec,
11614 HOST_WIDE_INT minval,
11615 HOST_WIDE_INT maxval)
11617 if (!CONST_VECTOR_P (vec)
11618 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11619 return false;
11621 int nunits;
11622 if (!CONST_VECTOR_STEPPED_P (vec))
11623 nunits = const_vector_encoded_nelts (vec);
11624 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11625 return false;
11627 for (int i = 0; i < nunits; i++)
11629 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11630 if (!CONST_INT_P (vec_elem)
11631 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11632 return false;
11634 return true;
11637 /* N Z C V. */
11638 #define AARCH64_CC_V 1
11639 #define AARCH64_CC_C (1 << 1)
11640 #define AARCH64_CC_Z (1 << 2)
11641 #define AARCH64_CC_N (1 << 3)
11643 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11644 static const int aarch64_nzcv_codes[] =
11646 0, /* EQ, Z == 1. */
11647 AARCH64_CC_Z, /* NE, Z == 0. */
11648 0, /* CS, C == 1. */
11649 AARCH64_CC_C, /* CC, C == 0. */
11650 0, /* MI, N == 1. */
11651 AARCH64_CC_N, /* PL, N == 0. */
11652 0, /* VS, V == 1. */
11653 AARCH64_CC_V, /* VC, V == 0. */
11654 0, /* HI, C == 1 && Z == 0. */
11655 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11656 AARCH64_CC_V, /* GE, N == V. */
11657 0, /* LT, N != V. */
11658 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11659 0, /* LE, !(Z == 0 && N == V). */
11660 0, /* AL, Any. */
11661 0 /* NV, Any. */
11664 /* Print floating-point vector immediate operand X to F, negating it
11665 first if NEGATE is true. Return true on success, false if it isn't
11666 a constant we can handle. */
11668 static bool
11669 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11671 rtx elt;
11673 if (!const_vec_duplicate_p (x, &elt))
11674 return false;
11676 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11677 if (negate)
11678 r = real_value_negate (&r);
11680 /* Handle the SVE single-bit immediates specially, since they have a
11681 fixed form in the assembly syntax. */
11682 if (real_equal (&r, &dconst0))
11683 asm_fprintf (f, "0.0");
11684 else if (real_equal (&r, &dconst2))
11685 asm_fprintf (f, "2.0");
11686 else if (real_equal (&r, &dconst1))
11687 asm_fprintf (f, "1.0");
11688 else if (real_equal (&r, &dconsthalf))
11689 asm_fprintf (f, "0.5");
11690 else
11692 const int buf_size = 20;
11693 char float_buf[buf_size] = {'\0'};
11694 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11695 1, GET_MODE (elt));
11696 asm_fprintf (f, "%s", float_buf);
11699 return true;
11702 /* Return the equivalent letter for size. */
11703 static char
11704 sizetochar (int size)
11706 switch (size)
11708 case 64: return 'd';
11709 case 32: return 's';
11710 case 16: return 'h';
11711 case 8 : return 'b';
11712 default: gcc_unreachable ();
11716 /* Print operand X to file F in a target specific manner according to CODE.
11717 The acceptable formatting commands given by CODE are:
11718 'c': An integer or symbol address without a preceding #
11719 sign.
11720 'C': Take the duplicated element in a vector constant
11721 and print it in hex.
11722 'D': Take the duplicated element in a vector constant
11723 and print it as an unsigned integer, in decimal.
11724 'e': Print the sign/zero-extend size as a character 8->b,
11725 16->h, 32->w. Can also be used for masks:
11726 0xff->b, 0xffff->h, 0xffffffff->w.
11727 'I': If the operand is a duplicated vector constant,
11728 replace it with the duplicated scalar. If the
11729 operand is then a floating-point constant, replace
11730 it with the integer bit representation. Print the
11731 transformed constant as a signed decimal number.
11732 'p': Prints N such that 2^N == X (X must be a power of 2 and
11733 a const_int).
11734 'P': Print the number of non-zero bits in X (a const_int).
11735 'H': Print the higher numbered register of a pair (TImode)
11736 of regs.
11737 'm': Print a condition (eq, ne, etc).
11738 'M': Same as 'm', but invert condition.
11739 'N': Take the duplicated element in a vector constant
11740 and print the negative of it in decimal.
11741 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11742 'S/T/U/V': Print a FP/SIMD register name for a register list.
11743 The register printed is the FP/SIMD register name
11744 of X + 0/1/2/3 for S/T/U/V.
11745 'R': Print a scalar Integer/FP/SIMD register name + 1.
11746 'X': Print bottom 16 bits of integer constant in hex.
11747 'w/x': Print a general register name or the zero register
11748 (32-bit or 64-bit).
11749 '0': Print a normal operand, if it's a general register,
11750 then we assume DImode.
11751 'k': Print NZCV for conditional compare instructions.
11752 'A': Output address constant representing the first
11753 argument of X, specifying a relocation offset
11754 if appropriate.
11755 'L': Output constant address specified by X
11756 with a relocation offset if appropriate.
11757 'G': Prints address of X, specifying a PC relative
11758 relocation mode if appropriate.
11759 'y': Output address of LDP or STP - this is used for
11760 some LDP/STPs which don't use a PARALLEL in their
11761 pattern (so the mode needs to be adjusted).
11762 'z': Output address of a typical LDP or STP. */
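/* Illustrative examples of the codes above, based on the handling below:
   %p of (const_int 8) prints "3"; %X of (const_int 0x12345678) prints
   "0x5678"; %w of register x1 prints "w1" and of const0_rtx prints "wzr".  */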
11764 static void
11765 aarch64_print_operand (FILE *f, rtx x, int code)
11767 rtx elt;
11768 switch (code)
11770 case 'c':
11771 if (CONST_INT_P (x))
11772 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11773 else
11775 poly_int64 offset;
11776 rtx base = strip_offset_and_salt (x, &offset);
11777 if (SYMBOL_REF_P (base))
11778 output_addr_const (f, x);
11779 else
11780 output_operand_lossage ("unsupported operand for code '%c'", code);
11782 break;
11784 case 'e':
11786 x = unwrap_const_vec_duplicate (x);
11787 if (!CONST_INT_P (x))
11789 output_operand_lossage ("invalid operand for '%%%c'", code);
11790 return;
11793 HOST_WIDE_INT val = INTVAL (x);
11794 if ((val & ~7) == 8 || val == 0xff)
11795 fputc ('b', f);
11796 else if ((val & ~7) == 16 || val == 0xffff)
11797 fputc ('h', f);
11798 else if ((val & ~7) == 32 || val == 0xffffffff)
11799 fputc ('w', f);
11800 else
11802 output_operand_lossage ("invalid operand for '%%%c'", code);
11803 return;
11806 break;
11808 case 'p':
11810 int n;
11812 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11814 output_operand_lossage ("invalid operand for '%%%c'", code);
11815 return;
11818 asm_fprintf (f, "%d", n);
11820 break;
11822 case 'P':
11823 if (!CONST_INT_P (x))
11825 output_operand_lossage ("invalid operand for '%%%c'", code);
11826 return;
11829 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11830 break;
11832 case 'H':
11833 if (x == const0_rtx)
11835 asm_fprintf (f, "xzr");
11836 break;
11839 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11841 output_operand_lossage ("invalid operand for '%%%c'", code);
11842 return;
11845 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11846 break;
11848 case 'I':
11850 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11851 if (CONST_INT_P (x))
11852 asm_fprintf (f, "%wd", INTVAL (x));
11853 else
11855 output_operand_lossage ("invalid operand for '%%%c'", code);
11856 return;
11858 break;
11861 case 'M':
11862 case 'm':
11864 int cond_code;
11865 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11866 if (x == const_true_rtx)
11868 if (code == 'M')
11869 fputs ("nv", f);
11870 return;
11873 if (!COMPARISON_P (x))
11875 output_operand_lossage ("invalid operand for '%%%c'", code);
11876 return;
11879 cond_code = aarch64_get_condition_code (x);
11880 gcc_assert (cond_code >= 0);
11881 if (code == 'M')
11882 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11883 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11884 fputs (aarch64_sve_condition_codes[cond_code], f);
11885 else
11886 fputs (aarch64_condition_codes[cond_code], f);
11888 break;
11890 case 'N':
11891 if (!const_vec_duplicate_p (x, &elt))
11893 output_operand_lossage ("invalid vector constant");
11894 return;
11897 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11898 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
11899 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11900 && aarch64_print_vector_float_operand (f, x, true))
11902 else
11904 output_operand_lossage ("invalid vector constant");
11905 return;
11907 break;
11909 case 'b':
11910 case 'h':
11911 case 's':
11912 case 'd':
11913 case 'q':
11914 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11916 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11917 return;
11919 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
11920 break;
11922 case 'S':
11923 case 'T':
11924 case 'U':
11925 case 'V':
11926 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11928 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11929 return;
11931 asm_fprintf (f, "%c%d",
11932 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
11933 REGNO (x) - V0_REGNUM + (code - 'S'));
11934 break;
11936 case 'R':
11937 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
11938 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
11939 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
11940 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
11941 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
11942 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11943 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
11944 else
11945 output_operand_lossage ("incompatible register operand for '%%%c'",
11946 code);
11947 break;
11949 case 'X':
11950 if (!CONST_INT_P (x))
11952 output_operand_lossage ("invalid operand for '%%%c'", code);
11953 return;
11955 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
11956 break;
11958 case 'C':
11960 /* Print a replicated constant in hex. */
11961 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11963 output_operand_lossage ("invalid operand for '%%%c'", code);
11964 return;
11966 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11967 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11969 break;
11971 case 'D':
11973 /* Print a replicated constant in decimal, treating it as
11974 unsigned. */
11975 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11977 output_operand_lossage ("invalid operand for '%%%c'", code);
11978 return;
11980 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11981 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11983 break;
11985 case 'w':
11986 case 'x':
11987 if (x == const0_rtx
11988 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
11990 asm_fprintf (f, "%czr", code);
11991 break;
11994 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11996 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
11997 break;
12000 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12002 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12003 break;
12006 /* Fall through */
12008 case 0:
12009 if (x == NULL)
12011 output_operand_lossage ("missing operand");
12012 return;
12015 switch (GET_CODE (x))
12017 case REG:
12018 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12020 if (REG_NREGS (x) == 1)
12021 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12022 else
12024 char suffix
12025 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12026 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12027 REGNO (x) - V0_REGNUM, suffix,
12028 END_REGNO (x) - V0_REGNUM - 1, suffix);
12031 else
12032 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12033 break;
12035 case MEM:
12036 output_address (GET_MODE (x), XEXP (x, 0));
12037 break;
12039 case LABEL_REF:
12040 case SYMBOL_REF:
12041 output_addr_const (asm_out_file, x);
12042 break;
12044 case CONST_INT:
12045 asm_fprintf (f, "%wd", INTVAL (x));
12046 break;
12048 case CONST:
12049 if (!VECTOR_MODE_P (GET_MODE (x)))
12051 output_addr_const (asm_out_file, x);
12052 break;
12054 /* fall through */
12056 case CONST_VECTOR:
12057 if (!const_vec_duplicate_p (x, &elt))
12059 output_operand_lossage ("invalid vector constant");
12060 return;
12063 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12064 asm_fprintf (f, "%wd", INTVAL (elt));
12065 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12066 && aarch64_print_vector_float_operand (f, x, false))
12068 else
12070 output_operand_lossage ("invalid vector constant");
12071 return;
12073 break;
12075 case CONST_DOUBLE:
12076 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12077 be getting CONST_DOUBLEs holding integers. */
12078 gcc_assert (GET_MODE (x) != VOIDmode);
12079 if (aarch64_float_const_zero_rtx_p (x))
12081 fputc ('0', f);
12082 break;
12084 else if (aarch64_float_const_representable_p (x))
12086 #define buf_size 20
12087 char float_buf[buf_size] = {'\0'};
12088 real_to_decimal_for_mode (float_buf,
12089 CONST_DOUBLE_REAL_VALUE (x),
12090 buf_size, buf_size,
12091 1, GET_MODE (x));
12092 asm_fprintf (asm_out_file, "%s", float_buf);
12093 break;
12094 #undef buf_size
12096 output_operand_lossage ("invalid constant");
12097 return;
12098 default:
12099 output_operand_lossage ("invalid operand");
12100 return;
12102 break;
12104 case 'A':
12105 if (GET_CODE (x) == HIGH)
12106 x = XEXP (x, 0);
12108 switch (aarch64_classify_symbolic_expression (x))
12110 case SYMBOL_SMALL_GOT_4G:
12111 asm_fprintf (asm_out_file, ":got:");
12112 break;
12114 case SYMBOL_SMALL_TLSGD:
12115 asm_fprintf (asm_out_file, ":tlsgd:");
12116 break;
12118 case SYMBOL_SMALL_TLSDESC:
12119 asm_fprintf (asm_out_file, ":tlsdesc:");
12120 break;
12122 case SYMBOL_SMALL_TLSIE:
12123 asm_fprintf (asm_out_file, ":gottprel:");
12124 break;
12126 case SYMBOL_TLSLE24:
12127 asm_fprintf (asm_out_file, ":tprel:");
12128 break;
12130 case SYMBOL_TINY_GOT:
12131 gcc_unreachable ();
12132 break;
12134 default:
12135 break;
12137 output_addr_const (asm_out_file, x);
12138 break;
12140 case 'L':
12141 switch (aarch64_classify_symbolic_expression (x))
12143 case SYMBOL_SMALL_GOT_4G:
12144 asm_fprintf (asm_out_file, ":got_lo12:");
12145 break;
12147 case SYMBOL_SMALL_TLSGD:
12148 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12149 break;
12151 case SYMBOL_SMALL_TLSDESC:
12152 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12153 break;
12155 case SYMBOL_SMALL_TLSIE:
12156 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12157 break;
12159 case SYMBOL_TLSLE12:
12160 asm_fprintf (asm_out_file, ":tprel_lo12:");
12161 break;
12163 case SYMBOL_TLSLE24:
12164 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12165 break;
12167 case SYMBOL_TINY_GOT:
12168 asm_fprintf (asm_out_file, ":got:");
12169 break;
12171 case SYMBOL_TINY_TLSIE:
12172 asm_fprintf (asm_out_file, ":gottprel:");
12173 break;
12175 default:
12176 break;
12178 output_addr_const (asm_out_file, x);
12179 break;
12181 case 'G':
12182 switch (aarch64_classify_symbolic_expression (x))
12184 case SYMBOL_TLSLE24:
12185 asm_fprintf (asm_out_file, ":tprel_hi12:");
12186 break;
12187 default:
12188 break;
12190 output_addr_const (asm_out_file, x);
12191 break;
12193 case 'k':
12195 HOST_WIDE_INT cond_code;
12197 if (!CONST_INT_P (x))
12199 output_operand_lossage ("invalid operand for '%%%c'", code);
12200 return;
12203 cond_code = INTVAL (x);
12204 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12205 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12207 break;
12209 case 'y':
12210 case 'z':
12212 machine_mode mode = GET_MODE (x);
12214 if (!MEM_P (x)
12215 || (code == 'y'
12216 && maybe_ne (GET_MODE_SIZE (mode), 8)
12217 && maybe_ne (GET_MODE_SIZE (mode), 16)))
12219 output_operand_lossage ("invalid operand for '%%%c'", code);
12220 return;
12223 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12224 code == 'y'
12225 ? ADDR_QUERY_LDP_STP_N
12226 : ADDR_QUERY_LDP_STP))
12227 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12229 break;
12231 default:
12232 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12233 return;
12237 /* Print address 'x' of a memory access with mode 'mode'.
12238 'type' is the aarch64_addr_query_type context required by
12239 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for LDP/STP. */
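/* For example (illustrative; x0 is an arbitrary base): an SVE vector access
   at a byte offset of two vector lengths prints as "[x0, #2, mul vl]",
   while a 16-byte base-plus-offset LDP/STP address prints as "[x0, 32]".  */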
12240 static bool
12241 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12242 aarch64_addr_query_type type)
12244 struct aarch64_address_info addr;
12245 unsigned int size, vec_flags;
12247 /* Check all addresses are Pmode - including ILP32. */
12248 if (GET_MODE (x) != Pmode
12249 && (!CONST_INT_P (x)
12250 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12252 output_operand_lossage ("invalid address mode");
12253 return false;
12256 if (aarch64_classify_address (&addr, x, mode, true, type))
12257 switch (addr.type)
12259 case ADDRESS_REG_IMM:
12260 if (known_eq (addr.const_offset, 0))
12262 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12263 return true;
12266 vec_flags = aarch64_classify_vector_mode (mode);
12267 if (vec_flags & VEC_ANY_SVE)
12269 HOST_WIDE_INT vnum
12270 = exact_div (addr.const_offset,
12271 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12272 asm_fprintf (f, "[%s, #%wd, mul vl]",
12273 reg_names[REGNO (addr.base)], vnum);
12274 return true;
12277 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12278 INTVAL (addr.offset));
12279 return true;
12281 case ADDRESS_REG_REG:
12282 if (addr.shift == 0)
12283 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12284 reg_names [REGNO (addr.offset)]);
12285 else
12286 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12287 reg_names [REGNO (addr.offset)], addr.shift);
12288 return true;
12290 case ADDRESS_REG_UXTW:
12291 if (addr.shift == 0)
12292 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12293 REGNO (addr.offset) - R0_REGNUM);
12294 else
12295 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12296 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12297 return true;
12299 case ADDRESS_REG_SXTW:
12300 if (addr.shift == 0)
12301 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12302 REGNO (addr.offset) - R0_REGNUM);
12303 else
12304 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12305 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12306 return true;
12308 case ADDRESS_REG_WB:
12309 /* Writeback is only supported for fixed-width modes. */
12310 size = GET_MODE_SIZE (mode).to_constant ();
12311 switch (GET_CODE (x))
12313 case PRE_INC:
12314 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12315 return true;
12316 case POST_INC:
12317 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12318 return true;
12319 case PRE_DEC:
12320 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12321 return true;
12322 case POST_DEC:
12323 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12324 return true;
12325 case PRE_MODIFY:
12326 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12327 INTVAL (addr.offset));
12328 return true;
12329 case POST_MODIFY:
12330 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12331 INTVAL (addr.offset));
12332 return true;
12333 default:
12334 break;
12336 break;
12338 case ADDRESS_LO_SUM:
12339 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12340 output_addr_const (f, addr.offset);
12341 asm_fprintf (f, "]");
12342 return true;
12344 case ADDRESS_SYMBOLIC:
12345 output_addr_const (f, x);
12346 return true;
12349 return false;
12352 /* Print address 'x' of a memory access with mode 'mode'. */
12353 static void
12354 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12356 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12357 output_addr_const (f, x);
12360 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12362 static bool
12363 aarch64_output_addr_const_extra (FILE *file, rtx x)
12365 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12367 output_addr_const (file, XVECEXP (x, 0, 0));
12368 return true;
12370 return false;
12373 bool
12374 aarch64_label_mentioned_p (rtx x)
12376 const char *fmt;
12377 int i;
12379 if (LABEL_REF_P (x))
12380 return true;
12382 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12383 referencing instruction, but they are constant offsets, not
12384 symbols. */
12385 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12386 return false;
12388 fmt = GET_RTX_FORMAT (GET_CODE (x));
12389 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12391 if (fmt[i] == 'E')
12393 int j;
12395 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12396 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12397 return 1;
12399 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12400 return 1;
12403 return 0;
12406 /* Implement REGNO_REG_CLASS. */
12408 enum reg_class
12409 aarch64_regno_regclass (unsigned regno)
12411 if (STUB_REGNUM_P (regno))
12412 return STUB_REGS;
12414 if (GP_REGNUM_P (regno))
12415 return GENERAL_REGS;
12417 if (regno == SP_REGNUM)
12418 return STACK_REG;
12420 if (regno == FRAME_POINTER_REGNUM
12421 || regno == ARG_POINTER_REGNUM)
12422 return POINTER_REGS;
12424 if (FP_REGNUM_P (regno))
12425 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12426 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12428 if (PR_REGNUM_P (regno))
12429 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12431 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12432 return FFR_REGS;
12434 return NO_REGS;
12437 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12438 If OFFSET is out of range, return an offset of an anchor point
12439 that is in range. Return 0 otherwise. */
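/* Worked example (illustrative): a 4-byte access at offset 0x12344 is
   aligned, so the function returns 0x12344 & ~0x3fff == 0x10000; the
   remaining offset 0x2344 is within the scaled 12-bit range for SImode.  */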
12441 static HOST_WIDE_INT
12442 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12443 machine_mode mode)
12445 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12446 if (size > 16)
12447 return (offset + 0x400) & ~0x7f0;
12449 /* For offsets that aren't a multiple of the access size, the limit is
12450 -256...255. */
12451 if (offset & (size - 1))
12453 /* BLKmode typically uses LDP of X-registers. */
12454 if (mode == BLKmode)
12455 return (offset + 512) & ~0x3ff;
12456 return (offset + 0x100) & ~0x1ff;
12459 /* Small negative offsets are supported. */
12460 if (IN_RANGE (offset, -256, 0))
12461 return 0;
12463 if (mode == TImode || mode == TFmode || mode == TDmode)
12464 return (offset + 0x100) & ~0x1ff;
12466 /* Use the unsigned 12-bit offset, scaled by the access size. */
12467 return offset & (~0xfff * size);
12470 static rtx
12471 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12473 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12474 where mask is selected by alignment and size of the offset.
12475 We try to pick as large a range for the offset as possible to
12476 maximize the chance of a CSE. However, for aligned addresses
12477 we limit the range to 4k so that structures with different sized
12478 elements are likely to use the same base. We need to be careful
12479 not to split a CONST for some forms of address expression, otherwise
12480 it will generate sub-optimal code. */
12482 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12484 rtx base = XEXP (x, 0);
12485 rtx offset_rtx = XEXP (x, 1);
12486 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12488 if (GET_CODE (base) == PLUS)
12490 rtx op0 = XEXP (base, 0);
12491 rtx op1 = XEXP (base, 1);
12493 /* Force any scaling into a temp for CSE. */
12494 op0 = force_reg (Pmode, op0);
12495 op1 = force_reg (Pmode, op1);
12497 /* Let the pointer register be in op0. */
12498 if (REG_POINTER (op1))
12499 std::swap (op0, op1);
12501 /* If the pointer is virtual or frame related, then we know that
12502 virtual register instantiation or register elimination is going
12503 to apply a second constant. We want the two constants folded
12504 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12505 if (virt_or_elim_regno_p (REGNO (op0)))
12507 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12508 NULL_RTX, true, OPTAB_DIRECT);
12509 return gen_rtx_PLUS (Pmode, base, op1);
12512 /* Otherwise, in order to encourage CSE (and thence loop strength
12513 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12514 base = expand_binop (Pmode, add_optab, op0, op1,
12515 NULL_RTX, true, OPTAB_DIRECT);
12516 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12519 HOST_WIDE_INT size;
12520 if (GET_MODE_SIZE (mode).is_constant (&size))
12522 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12523 mode);
12524 if (base_offset != 0)
12526 base = plus_constant (Pmode, base, base_offset);
12527 base = force_operand (base, NULL_RTX);
12528 return plus_constant (Pmode, base, offset - base_offset);
12533 return x;
12536 static reg_class_t
12537 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12538 reg_class_t rclass,
12539 machine_mode mode,
12540 secondary_reload_info *sri)
12542 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12543 LDR and STR. See the comment at the head of aarch64-sve.md for
12544 more details about the big-endian handling. */
12545 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12546 if (reg_class_subset_p (rclass, FP_REGS)
12547 && !((REG_P (x) && HARD_REGISTER_P (x))
12548 || aarch64_simd_valid_immediate (x, NULL))
12549 && mode != VNx16QImode
12550 && (vec_flags & VEC_SVE_DATA)
12551 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12553 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12554 return NO_REGS;
12557 /* If we have to disable direct literal pool loads and stores because the
12558 function is too big, then we need a scratch register. */
12559 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12560 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12561 || targetm.vector_mode_supported_p (GET_MODE (x)))
12562 && !aarch64_pcrelative_literal_loads)
12564 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12565 return NO_REGS;
12568 /* Without the TARGET_SIMD instructions we cannot move a Q register
12569 to a Q register directly. We need a scratch. */
12570 if (REG_P (x)
12571 && (mode == TFmode
12572 || mode == TImode
12573 || mode == TDmode
12574 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12575 && mode == GET_MODE (x)
12576 && !TARGET_SIMD
12577 && FP_REGNUM_P (REGNO (x))
12578 && reg_class_subset_p (rclass, FP_REGS))
12580 sri->icode = code_for_aarch64_reload_mov (mode);
12581 return NO_REGS;
12584 /* A TFmode, TImode or TDmode memory access should be handled via FP_REGS
12585 because AArch64 has richer addressing modes for LDR/STR instructions
12586 than for LDP/STP instructions. */
12587 if (TARGET_FLOAT && rclass == GENERAL_REGS
12588 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12589 return FP_REGS;
12591 if (rclass == FP_REGS
12592 && (mode == TImode || mode == TFmode || mode == TDmode)
12593 && CONSTANT_P(x))
12594 return GENERAL_REGS;
12596 return NO_REGS;
12599 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12601 static bool
12602 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12603 reg_class_t class2)
12605 if (!TARGET_SIMD
12606 && reg_classes_intersect_p (class1, FP_REGS)
12607 && reg_classes_intersect_p (class2, FP_REGS))
12609 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12610 so we can't easily split a move involving tuples of 128-bit
12611 vectors. Force the copy through memory instead.
12613 (Tuples of 64-bit vectors are fine.) */
12614 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12615 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12616 return true;
12618 return false;
12621 static bool
12622 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12624 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12626 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12627 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12628 if (frame_pointer_needed)
12629 return to == HARD_FRAME_POINTER_REGNUM;
12630 return true;
12633 poly_int64
12634 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12636 if (to == HARD_FRAME_POINTER_REGNUM)
12638 if (from == ARG_POINTER_REGNUM)
12639 return cfun->machine->frame.hard_fp_offset;
12641 if (from == FRAME_POINTER_REGNUM)
12642 return cfun->machine->frame.hard_fp_offset
12643 - cfun->machine->frame.locals_offset;
12646 if (to == STACK_POINTER_REGNUM)
12648 if (from == FRAME_POINTER_REGNUM)
12649 return cfun->machine->frame.frame_size
12650 - cfun->machine->frame.locals_offset;
12653 return cfun->machine->frame.frame_size;
12657 /* Get return address without mangling. */
12660 aarch64_return_addr_rtx (void)
12662 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12663 /* Note: aarch64_return_address_signing_enabled only
12664 works after cfun->machine->frame.laid_out is set,
12665 so here we don't know if the return address will
12666 be signed or not. */
12667 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12668 emit_move_insn (lr, val);
12669 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12670 return lr;
12674 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12675 previous frame. */
12678 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12680 if (count != 0)
12681 return const0_rtx;
12682 return aarch64_return_addr_rtx ();
12685 static void
12686 aarch64_asm_trampoline_template (FILE *f)
12688 /* Even if the current function doesn't have branch protection, some
12689 later function might, so since this template is only generated once
12690 we have to add a BTI just in case. */
12691 asm_fprintf (f, "\thint\t34 // bti c\n");
12693 if (TARGET_ILP32)
12695 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12696 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12698 else
12700 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12701 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12703 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12705 /* We always emit a speculation barrier.
12706 This is because the same trampoline template is used for every nested
12707 function. Since nested functions are not particularly common or
12708 performance-critical we don't worry too much about the extra instructions
12709 that get copied around.
12710 This is not yet a problem, since we have not yet implemented function
12711 specific attributes to choose between hardening against straight line
12712 speculation or not, but such function specific attributes are likely to
12713 happen in the future. */
12714 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12716 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12717 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12720 static void
12721 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12723 rtx fnaddr, mem, a_tramp;
12724 const int tramp_code_sz = 24;
12726 /* Don't need to copy the trailing D-words, we fill those in below. */
12727 /* We create our own memory address in Pmode so that `emit_block_move` can
12728 use parts of the backend which expect Pmode addresses. */
12729 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12730 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12731 assemble_trampoline_template (),
12732 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12733 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12734 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12735 if (GET_MODE (fnaddr) != ptr_mode)
12736 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12737 emit_move_insn (mem, fnaddr);
12739 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12740 emit_move_insn (mem, chain_value);
12742 /* XXX We should really define a "clear_cache" pattern and use
12743 gen_clear_cache(). */
12744 a_tramp = XEXP (m_tramp, 0);
12745 maybe_emit_call_builtin___clear_cache (a_tramp,
12746 plus_constant (ptr_mode,
12747 a_tramp,
12748 TRAMPOLINE_SIZE));
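/* The resulting trampoline layout, as set up above: bytes [0, 24) hold a
   copy of the code template emitted by aarch64_asm_trampoline_template,
   the next POINTER_BYTES hold the target function address, and the
   POINTER_BYTES after that hold the static chain value; the whole
   TRAMPOLINE_SIZE region is then flushed through the __clear_cache
   machinery.  */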
12751 static unsigned char
12752 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12754 /* ??? Logically we should only need to provide a value when
12755 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12756 can hold MODE, but at the moment we need to handle all modes.
12757 Just ignore any runtime parts for registers that can't store them. */
12758 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12759 unsigned int nregs, vec_flags;
12760 switch (regclass)
12762 case STUB_REGS:
12763 case TAILCALL_ADDR_REGS:
12764 case POINTER_REGS:
12765 case GENERAL_REGS:
12766 case ALL_REGS:
12767 case POINTER_AND_FP_REGS:
12768 case FP_REGS:
12769 case FP_LO_REGS:
12770 case FP_LO8_REGS:
12771 vec_flags = aarch64_classify_vector_mode (mode);
12772 if ((vec_flags & VEC_SVE_DATA)
12773 && constant_multiple_p (GET_MODE_SIZE (mode),
12774 aarch64_vl_bytes (mode, vec_flags), &nregs))
12775 return nregs;
12776 return (vec_flags & VEC_ADVSIMD
12777 ? CEIL (lowest_size, UNITS_PER_VREG)
12778 : CEIL (lowest_size, UNITS_PER_WORD));
12779 case STACK_REG:
12780 case PR_REGS:
12781 case PR_LO_REGS:
12782 case PR_HI_REGS:
12783 case FFR_REGS:
12784 case PR_AND_FFR_REGS:
12785 return 1;
12787 case NO_REGS:
12788 return 0;
12790 default:
12791 break;
12793 gcc_unreachable ();
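/* For instance, with the LP64 defaults this gives 2 for TImode in
   GENERAL_REGS (CEIL (16, UNITS_PER_WORD)) and 2 for the Advanced SIMD
   pair mode V2x16QImode in FP_REGS (CEIL (32, UNITS_PER_VREG)), while
   predicate classes such as PR_REGS always report a single register.  */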
12796 static reg_class_t
12797 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12799 if (regclass == POINTER_REGS)
12800 return GENERAL_REGS;
12802 if (regclass == STACK_REG)
12804 if (REG_P(x)
12805 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12806 return regclass;
12808 return NO_REGS;
12811 /* Register elimination can result in a request for
12812 SP+constant->FP_REGS. We cannot support such operations, which
12813 use SP as source and an FP_REG as destination, so reject them
12814 outright. */
12815 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12817 rtx lhs = XEXP (x, 0);
12819 /* Look through a possible SUBREG introduced by ILP32. */
12820 if (SUBREG_P (lhs))
12821 lhs = SUBREG_REG (lhs);
12823 gcc_assert (REG_P (lhs));
12824 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12825 POINTER_REGS));
12826 return NO_REGS;
12829 return regclass;
12832 void
12833 aarch64_asm_output_labelref (FILE* f, const char *name)
12835 asm_fprintf (f, "%U%s", name);
12838 static void
12839 aarch64_elf_asm_constructor (rtx symbol, int priority)
12841 if (priority == DEFAULT_INIT_PRIORITY)
12842 default_ctor_section_asm_out_constructor (symbol, priority);
12843 else
12845 section *s;
12846 /* The priority is known to be in the range [0, 65535], so 18 bytes
12847 would be enough, but the compiler might not know that. To avoid
12848 a -Wformat-truncation false positive, use a larger size. */
12849 char buf[23];
12850 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12851 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12852 switch_to_section (s);
12853 assemble_align (POINTER_SIZE);
12854 assemble_aligned_integer (POINTER_BYTES, symbol);
12858 static void
12859 aarch64_elf_asm_destructor (rtx symbol, int priority)
12861 if (priority == DEFAULT_INIT_PRIORITY)
12862 default_dtor_section_asm_out_destructor (symbol, priority);
12863 else
12865 section *s;
12866 /* The priority is known to be in the range [0, 65535], so 18 bytes
12867 would be enough, but the compiler might not know that. To avoid
12868 a -Wformat-truncation false positive, use a larger size. */
12869 char buf[23];
12870 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12871 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12872 switch_to_section (s);
12873 assemble_align (POINTER_SIZE);
12874 assemble_aligned_integer (POINTER_BYTES, symbol);
12878 const char*
12879 aarch64_output_casesi (rtx *operands)
12881 char buf[100];
12882 char label[100];
12883 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12884 int index;
12885 static const char *const patterns[4][2] =
12888 "ldrb\t%w3, [%0,%w1,uxtw]",
12889 "add\t%3, %4, %w3, sxtb #2"
12892 "ldrh\t%w3, [%0,%w1,uxtw #1]",
12893 "add\t%3, %4, %w3, sxth #2"
12896 "ldr\t%w3, [%0,%w1,uxtw #2]",
12897 "add\t%3, %4, %w3, sxtw #2"
12899 /* We assume that DImode is only generated when not optimizing and
12900 that we don't really need 64-bit address offsets. That would
12901 imply an object file with 8GB of code in a single function! */
12903 "ldr\t%w3, [%0,%w1,uxtw #2]",
12904 "add\t%3, %4, %w3, sxtw #2"
12908 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
12910 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
12911 index = exact_log2 (GET_MODE_SIZE (mode));
12913 gcc_assert (index >= 0 && index <= 3);
12915 /* Need to implement table size reduction, by changing the code below. */
12916 output_asm_insn (patterns[index][0], operands);
12917 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
12918 snprintf (buf, sizeof (buf),
12919 "adr\t%%4, %s", targetm.strip_name_encoding (label));
12920 output_asm_insn (buf, operands);
12921 output_asm_insn (patterns[index][1], operands);
12922 output_asm_insn ("br\t%3", operands);
12923 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
12924 operands);
12925 assemble_label (asm_out_file, label);
12926 return "";
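/* For a 4-byte dispatch table (index 2 above) the sequence emitted here is
   roughly:
       ldr   %w3, [%0, %w1, uxtw #2]
       adr   %4, .Lrtx<N>
       add   %3, %4, %w3, sxtw #2
       br    %3
       <optional SLS speculation barrier>
   .Lrtx<N>:
   i.e. the loaded table entry, scaled by 4, is added to the address of the
   local label placed after the dispatch code to form the branch target.  */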
12930 /* Return size in bits of an arithmetic operand which is shifted/scaled and
12931 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
12932 operator. */
12935 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
12937 if (shift >= 0 && shift <= 3)
12939 int size;
12940 for (size = 8; size <= 32; size *= 2)
12942 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
12943 if (mask == bits << shift)
12944 return size;
12947 return 0;
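/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, because
   0xff << 2 == 0x3fc: an operand of the form (and (mult x 4) 0x3fc)
   therefore maps onto a UXTB with LSL #2 in an extended-register
   ADD/SUB.  */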
12950 /* Constant pools are per-function only when PC-relative
12951 literal loads are enabled or we are using the large memory
12952 model. */
12954 static inline bool
12955 aarch64_can_use_per_function_literal_pools_p (void)
12957 return (aarch64_pcrelative_literal_loads
12958 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
12961 static bool
12962 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
12964 /* We can't use blocks for constants when we're using a per-function
12965 constant pool. */
12966 return !aarch64_can_use_per_function_literal_pools_p ();
12969 /* Select appropriate section for constants depending
12970 on where we place literal pools. */
12972 static section *
12973 aarch64_select_rtx_section (machine_mode mode,
12974 rtx x,
12975 unsigned HOST_WIDE_INT align)
12977 if (aarch64_can_use_per_function_literal_pools_p ())
12978 return function_section (current_function_decl);
12980 return default_elf_select_rtx_section (mode, x, align);
12983 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
12984 void
12985 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
12986 HOST_WIDE_INT offset)
12988 /* When using per-function literal pools, we must ensure that any code
12989 section is aligned to the minimal instruction length, lest we get
12990 errors from the assembler about "unaligned instructions". */
12991 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
12992 ASM_OUTPUT_ALIGN (f, 2);
12995 /* Costs. */
12997 /* Helper function for rtx cost calculation. Strip a shift expression
12998 from X. Returns the inner operand if successful, or the original
12999 expression on failure. */
13000 static rtx
13001 aarch64_strip_shift (rtx x)
13003 rtx op = x;
13005 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13006 we can convert both to ROR during final output. */
13007 if ((GET_CODE (op) == ASHIFT
13008 || GET_CODE (op) == ASHIFTRT
13009 || GET_CODE (op) == LSHIFTRT
13010 || GET_CODE (op) == ROTATERT
13011 || GET_CODE (op) == ROTATE)
13012 && CONST_INT_P (XEXP (op, 1)))
13013 return XEXP (op, 0);
13015 if (GET_CODE (op) == MULT
13016 && CONST_INT_P (XEXP (op, 1))
13017 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13018 return XEXP (op, 0);
13020 return x;
13023 /* Helper function for rtx cost calculation. Strip an extend
13024 expression from X. Returns the inner operand if successful, or the
13025 original expression on failure. We deal with a number of possible
13026 canonicalization variations here. If STRIP_SHIFT is true, then
13027 we can strip off a shift also. */
13028 static rtx
13029 aarch64_strip_extend (rtx x, bool strip_shift)
13031 scalar_int_mode mode;
13032 rtx op = x;
13034 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13035 return op;
13037 if (GET_CODE (op) == AND
13038 && GET_CODE (XEXP (op, 0)) == MULT
13039 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13040 && CONST_INT_P (XEXP (op, 1))
13041 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13042 INTVAL (XEXP (op, 1))) != 0)
13043 return XEXP (XEXP (op, 0), 0);
13045 /* Now handle extended register, as this may also have an optional
13046 left shift by 1..4. */
13047 if (strip_shift
13048 && GET_CODE (op) == ASHIFT
13049 && CONST_INT_P (XEXP (op, 1))
13050 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13051 op = XEXP (op, 0);
13053 if (GET_CODE (op) == ZERO_EXTEND
13054 || GET_CODE (op) == SIGN_EXTEND)
13055 op = XEXP (op, 0);
13057 if (op != x)
13058 return op;
13060 return x;
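/* For example, with STRIP_SHIFT true the operand
   (ashift:DI (zero_extend:DI (reg:SI x)) (const_int 2)) is stripped down
   to (reg:SI x), matching the extended-register form
   "add x0, x1, w2, uxtw #2"; the extend and shift are then not costed
   twice by the callers.  */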
13063 /* Helper function for rtx cost calculation. Strip extension as well as any
13064 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13065 successful, or the original expression on failure. */
13066 static rtx
13067 aarch64_strip_extend_vec_half (rtx x)
13069 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13071 x = XEXP (x, 0);
13072 if (GET_CODE (x) == VEC_SELECT
13073 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13074 XEXP (x, 1)))
13075 x = XEXP (x, 0);
13077 return x;
13080 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13081 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13082 operand if successful, or the original expression on failure. */
13083 static rtx
13084 aarch64_strip_duplicate_vec_elt (rtx x)
13086 if (GET_CODE (x) == VEC_DUPLICATE
13087 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13089 x = XEXP (x, 0);
13090 if (GET_CODE (x) == VEC_SELECT)
13091 x = XEXP (x, 0);
13092 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13093 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13094 x = XEXP (XEXP (x, 0), 0);
13096 return x;
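/* For example, the by-element multiply operand
   (vec_duplicate:V4SI (vec_select:SI (reg:V4SI v) (parallel [(const_int 2)])))
   is stripped down to (reg:V4SI v), so the lane selection is not costed
   on top of the multiply itself.  */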
13099 /* Return true iff CODE is a shift supported in combination
13100 with arithmetic instructions. */
13102 static bool
13103 aarch64_shift_p (enum rtx_code code)
13105 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13109 /* Return true iff X is a cheap shift without a sign extend. */
13111 static bool
13112 aarch64_cheap_mult_shift_p (rtx x)
13114 rtx op0, op1;
13116 op0 = XEXP (x, 0);
13117 op1 = XEXP (x, 1);
13119 if (!(aarch64_tune_params.extra_tuning_flags
13120 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13121 return false;
13123 if (GET_CODE (op0) == SIGN_EXTEND)
13124 return false;
13126 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13127 && UINTVAL (op1) <= 4)
13128 return true;
13130 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13131 return false;
13133 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13135 if (l2 > 0 && l2 <= 4)
13136 return true;
13138 return false;
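/* In other words, on tunings with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND a
   left shift by at most 4 (or an equivalent multiply by 2..16) feeding a
   PLUS or MINUS adds no extra cost, matching e.g. "add x0, x1, x2, lsl #3",
   unless the shifted operand is itself a sign extension.  */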
13141 /* Helper function for rtx cost calculation. Calculate the cost of
13142 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13143 Return the calculated cost of the expression, recursing manually in to
13144 operands where needed. */
13146 static int
13147 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13149 rtx op0, op1;
13150 const struct cpu_cost_table *extra_cost
13151 = aarch64_tune_params.insn_extra_cost;
13152 int cost = 0;
13153 bool compound_p = (outer == PLUS || outer == MINUS);
13154 machine_mode mode = GET_MODE (x);
13156 gcc_checking_assert (code == MULT);
13158 op0 = XEXP (x, 0);
13159 op1 = XEXP (x, 1);
13161 if (VECTOR_MODE_P (mode))
13163 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13164 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13166 /* The select-operand-high-half versions of the instruction have the
13167 same cost as the three vector version - don't add the costs of the
13168 extension or selection into the costs of the multiply. */
13169 op0 = aarch64_strip_extend_vec_half (op0);
13170 op1 = aarch64_strip_extend_vec_half (op1);
13171 /* The by-element versions of the instruction have the same costs as
13172 the normal 3-vector version. We make an assumption that the input
13173 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13174 costing of a MUL by element pre RA is a bit optimistic. */
13175 op0 = aarch64_strip_duplicate_vec_elt (op0);
13176 op1 = aarch64_strip_duplicate_vec_elt (op1);
13178 cost += rtx_cost (op0, mode, MULT, 0, speed);
13179 cost += rtx_cost (op1, mode, MULT, 1, speed);
13180 if (speed)
13182 if (GET_CODE (x) == MULT)
13183 cost += extra_cost->vect.mult;
13184 /* This is to catch the SSRA costing currently flowing here. */
13185 else
13186 cost += extra_cost->vect.alu;
13188 return cost;
13191 /* Integer multiply/fma. */
13192 if (GET_MODE_CLASS (mode) == MODE_INT)
13194 /* The multiply will be canonicalized as a shift, cost it as such. */
13195 if (aarch64_shift_p (GET_CODE (x))
13196 || (CONST_INT_P (op1)
13197 && exact_log2 (INTVAL (op1)) > 0))
13199 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13200 || GET_CODE (op0) == SIGN_EXTEND;
13201 if (speed)
13203 if (compound_p)
13205 /* If the shift is considered cheap,
13206 then don't add any cost. */
13207 if (aarch64_cheap_mult_shift_p (x))
13209 else if (REG_P (op1))
13210 /* ARITH + shift-by-register. */
13211 cost += extra_cost->alu.arith_shift_reg;
13212 else if (is_extend)
13213 /* ARITH + extended register. We don't have a cost field
13214 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13215 cost += extra_cost->alu.extend_arith;
13216 else
13217 /* ARITH + shift-by-immediate. */
13218 cost += extra_cost->alu.arith_shift;
13220 else
13221 /* LSL (immediate). */
13222 cost += extra_cost->alu.shift;
13225 /* Strip extends as we will have costed them in the case above. */
13226 if (is_extend)
13227 op0 = aarch64_strip_extend (op0, true);
13229 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13231 return cost;
13234 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13235 compound and let the below cases handle it. After all, MNEG is a
13236 special-case alias of MSUB. */
13237 if (GET_CODE (op0) == NEG)
13239 op0 = XEXP (op0, 0);
13240 compound_p = true;
13243 /* Integer multiplies or FMAs have zero/sign extending variants. */
13244 if ((GET_CODE (op0) == ZERO_EXTEND
13245 && GET_CODE (op1) == ZERO_EXTEND)
13246 || (GET_CODE (op0) == SIGN_EXTEND
13247 && GET_CODE (op1) == SIGN_EXTEND))
13249 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13250 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13252 if (speed)
13254 if (compound_p)
13255 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13256 cost += extra_cost->mult[0].extend_add;
13257 else
13258 /* MUL/SMULL/UMULL. */
13259 cost += extra_cost->mult[0].extend;
13262 return cost;
13265 /* This is either an integer multiply or a MADD. In both cases
13266 we want to recurse and cost the operands. */
13267 cost += rtx_cost (op0, mode, MULT, 0, speed);
13268 cost += rtx_cost (op1, mode, MULT, 1, speed);
13270 if (speed)
13272 if (compound_p)
13273 /* MADD/MSUB. */
13274 cost += extra_cost->mult[mode == DImode].add;
13275 else
13276 /* MUL. */
13277 cost += extra_cost->mult[mode == DImode].simple;
13280 return cost;
13282 else
13284 if (speed)
13286 /* Floating-point FMA/FMUL can also support negations of the
13287 operands, unless the rounding mode is upward or downward in
13288 which case FNMUL is different from FMUL with operand negation. */
13289 bool neg0 = GET_CODE (op0) == NEG;
13290 bool neg1 = GET_CODE (op1) == NEG;
13291 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13293 if (neg0)
13294 op0 = XEXP (op0, 0);
13295 if (neg1)
13296 op1 = XEXP (op1, 0);
13299 if (compound_p)
13300 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13301 cost += extra_cost->fp[mode == DFmode].fma;
13302 else
13303 /* FMUL/FNMUL. */
13304 cost += extra_cost->fp[mode == DFmode].mult;
13307 cost += rtx_cost (op0, mode, MULT, 0, speed);
13308 cost += rtx_cost (op1, mode, MULT, 1, speed);
13309 return cost;
13313 static int
13314 aarch64_address_cost (rtx x,
13315 machine_mode mode,
13316 addr_space_t as ATTRIBUTE_UNUSED,
13317 bool speed)
13319 enum rtx_code c = GET_CODE (x);
13320 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13321 struct aarch64_address_info info;
13322 int cost = 0;
13323 info.shift = 0;
13325 if (!aarch64_classify_address (&info, x, mode, false))
13327 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13329 /* This is a CONST or SYMBOL ref which will be split
13330 in a different way depending on the code model in use.
13331 Cost it through the generic infrastructure. */
13332 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13333 /* Divide through by the cost of one instruction to
13334 bring it to the same units as the address costs. */
13335 cost_symbol_ref /= COSTS_N_INSNS (1);
13336 /* The cost is then the cost of preparing the address,
13337 followed by an immediate (possibly 0) offset. */
13338 return cost_symbol_ref + addr_cost->imm_offset;
13340 else
13342 /* This is most likely a jump table from a case
13343 statement. */
13344 return addr_cost->register_offset;
13348 switch (info.type)
13350 case ADDRESS_LO_SUM:
13351 case ADDRESS_SYMBOLIC:
13352 case ADDRESS_REG_IMM:
13353 cost += addr_cost->imm_offset;
13354 break;
13356 case ADDRESS_REG_WB:
13357 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13358 cost += addr_cost->pre_modify;
13359 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13361 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13362 if (nvectors == 3)
13363 cost += addr_cost->post_modify_ld3_st3;
13364 else if (nvectors == 4)
13365 cost += addr_cost->post_modify_ld4_st4;
13366 else
13367 cost += addr_cost->post_modify;
13369 else
13370 gcc_unreachable ();
13372 break;
13374 case ADDRESS_REG_REG:
13375 cost += addr_cost->register_offset;
13376 break;
13378 case ADDRESS_REG_SXTW:
13379 cost += addr_cost->register_sextend;
13380 break;
13382 case ADDRESS_REG_UXTW:
13383 cost += addr_cost->register_zextend;
13384 break;
13386 default:
13387 gcc_unreachable ();
13391 if (info.shift > 0)
13393 /* For the sake of calculating the cost of the shifted register
13394 component, we can treat same sized modes in the same way. */
13395 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13396 cost += addr_cost->addr_scale_costs.hi;
13397 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13398 cost += addr_cost->addr_scale_costs.si;
13399 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13400 cost += addr_cost->addr_scale_costs.di;
13401 else
13402 /* We can't tell, or this is a 128-bit vector. */
13403 cost += addr_cost->addr_scale_costs.ti;
13406 return cost;
13409 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13410 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13411 to be taken. */
13414 aarch64_branch_cost (bool speed_p, bool predictable_p)
13416 /* When optimizing for speed, use the cost of unpredictable branches. */
13417 const struct cpu_branch_cost *branch_costs =
13418 aarch64_tune_params.branch_costs;
13420 if (!speed_p || predictable_p)
13421 return branch_costs->predictable;
13422 else
13423 return branch_costs->unpredictable;
13426 /* Return true if X is a zero or sign extract
13427 usable in an ADD or SUB (extended register) instruction. */
13428 static bool
13429 aarch64_rtx_arith_op_extract_p (rtx x)
13431 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13432 No shift. */
13433 if (GET_CODE (x) == SIGN_EXTEND
13434 || GET_CODE (x) == ZERO_EXTEND)
13435 return REG_P (XEXP (x, 0));
13437 return false;
13440 static bool
13441 aarch64_frint_unspec_p (unsigned int u)
13443 switch (u)
13445 case UNSPEC_FRINTZ:
13446 case UNSPEC_FRINTP:
13447 case UNSPEC_FRINTM:
13448 case UNSPEC_FRINTA:
13449 case UNSPEC_FRINTN:
13450 case UNSPEC_FRINTX:
13451 case UNSPEC_FRINTI:
13452 return true;
13454 default:
13455 return false;
13459 /* Return true iff X is an rtx that will match an extr instruction
13460 i.e. as described in the *extr<mode>5_insn family of patterns.
13461 OP0 and OP1 will be set to the operands of the shifts involved
13462 on success and will be NULL_RTX otherwise. */
13464 static bool
13465 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13467 rtx op0, op1;
13468 scalar_int_mode mode;
13469 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13470 return false;
13472 *res_op0 = NULL_RTX;
13473 *res_op1 = NULL_RTX;
13475 if (GET_CODE (x) != IOR)
13476 return false;
13478 op0 = XEXP (x, 0);
13479 op1 = XEXP (x, 1);
13481 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13482 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13484 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13485 if (GET_CODE (op1) == ASHIFT)
13486 std::swap (op0, op1);
13488 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13489 return false;
13491 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13492 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13494 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13495 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13497 *res_op0 = XEXP (op0, 0);
13498 *res_op1 = XEXP (op1, 0);
13499 return true;
13503 return false;
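/* For example, in DImode (ior (ashift x (const_int 48))
   (lshiftrt y (const_int 16))) satisfies 48 + 16 == 64 and so matches
   the EXTR pattern, roughly "extr x0, x, y, #16".  */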
13506 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13507 storing it in *COST. Result is true if the total cost of the operation
13508 has now been calculated. */
13509 static bool
13510 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13512 rtx inner;
13513 rtx comparator;
13514 enum rtx_code cmpcode;
13515 const struct cpu_cost_table *extra_cost
13516 = aarch64_tune_params.insn_extra_cost;
13518 if (COMPARISON_P (op0))
13520 inner = XEXP (op0, 0);
13521 comparator = XEXP (op0, 1);
13522 cmpcode = GET_CODE (op0);
13524 else
13526 inner = op0;
13527 comparator = const0_rtx;
13528 cmpcode = NE;
13531 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13533 /* Conditional branch. */
13534 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13535 return true;
13536 else
13538 if (cmpcode == NE || cmpcode == EQ)
13540 if (comparator == const0_rtx)
13542 /* TBZ/TBNZ/CBZ/CBNZ. */
13543 if (GET_CODE (inner) == ZERO_EXTRACT)
13544 /* TBZ/TBNZ. */
13545 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13546 ZERO_EXTRACT, 0, speed);
13547 else
13548 /* CBZ/CBNZ. */
13549 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13551 return true;
13553 if (register_operand (inner, VOIDmode)
13554 && aarch64_imm24 (comparator, VOIDmode))
13556 /* SUB and SUBS. */
13557 *cost += COSTS_N_INSNS (2);
13558 if (speed)
13559 *cost += extra_cost->alu.arith * 2;
13560 return true;
13563 else if (cmpcode == LT || cmpcode == GE)
13565 /* TBZ/TBNZ. */
13566 if (comparator == const0_rtx)
13567 return true;
13571 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13573 /* CCMP. */
13574 if (GET_CODE (op1) == COMPARE)
13576 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13577 if (XEXP (op1, 1) == const0_rtx)
13578 *cost += 1;
13579 if (speed)
13581 machine_mode mode = GET_MODE (XEXP (op1, 0));
13583 if (GET_MODE_CLASS (mode) == MODE_INT)
13584 *cost += extra_cost->alu.arith;
13585 else
13586 *cost += extra_cost->fp[mode == DFmode].compare;
13588 return true;
13591 /* It's a conditional operation based on the status flags,
13592 so it must be some flavor of CSEL. */
13594 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13595 if (GET_CODE (op1) == NEG
13596 || GET_CODE (op1) == NOT
13597 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13598 op1 = XEXP (op1, 0);
13599 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13601 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13602 op1 = XEXP (op1, 0);
13603 op2 = XEXP (op2, 0);
13605 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13607 inner = XEXP (op1, 0);
13608 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13609 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13610 op1 = XEXP (inner, 0);
13613 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13614 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13615 return true;
13618 /* We don't know what this is, cost all operands. */
13619 return false;
13622 /* Check whether X is a bitfield operation of the form shift + extend that
13623 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13624 operand to which the bitfield operation is applied. Otherwise return
13625 NULL_RTX. */
13627 static rtx
13628 aarch64_extend_bitfield_pattern_p (rtx x)
13630 rtx_code outer_code = GET_CODE (x);
13631 machine_mode outer_mode = GET_MODE (x);
13633 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13634 && outer_mode != SImode && outer_mode != DImode)
13635 return NULL_RTX;
13637 rtx inner = XEXP (x, 0);
13638 rtx_code inner_code = GET_CODE (inner);
13639 machine_mode inner_mode = GET_MODE (inner);
13640 rtx op = NULL_RTX;
13642 switch (inner_code)
13644 case ASHIFT:
13645 if (CONST_INT_P (XEXP (inner, 1))
13646 && (inner_mode == QImode || inner_mode == HImode))
13647 op = XEXP (inner, 0);
13648 break;
13649 case LSHIFTRT:
13650 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13651 && (inner_mode == QImode || inner_mode == HImode))
13652 op = XEXP (inner, 0);
13653 break;
13654 case ASHIFTRT:
13655 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13656 && (inner_mode == QImode || inner_mode == HImode))
13657 op = XEXP (inner, 0);
13658 break;
13659 default:
13660 break;
13663 return op;
13666 /* Return true if the mask and a shift amount from an RTX of the form
13667 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13668 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13670 bool
13671 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13672 rtx shft_amnt)
13674 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13675 && INTVAL (mask) > 0
13676 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13677 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13678 && (UINTVAL (mask)
13679 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
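/* For example, in SImode a mask of 0x00ffff00 with a shift amount of 8
   passes these checks: (x << 8) & 0x00ffff00 keeps a contiguous 16-bit
   field and clears the low 8 bits, so it can be emitted as
   "ubfiz w0, w1, #8, #16".  */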
13682 /* Return true if the masks and a shift amount from an RTX of the form
13683 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13684 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
13686 bool
13687 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13688 unsigned HOST_WIDE_INT mask1,
13689 unsigned HOST_WIDE_INT shft_amnt,
13690 unsigned HOST_WIDE_INT mask2)
13692 unsigned HOST_WIDE_INT t;
13694 /* Verify that there is no overlap in what bits are set in the two masks. */
13695 if (mask1 != ~mask2)
13696 return false;
13698 /* Verify that mask2 is not all zeros or ones. */
13699 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13700 return false;
13702 /* The shift amount should always be less than the mode size. */
13703 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13705 /* Verify that the mask being shifted is contiguous and would be in the
13706 least significant bits after shifting by shft_amnt. */
13707 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13708 return (t == (t & -t));
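/* For example, with MASK2 == 0x0000ff00, SHFT_AMNT == 8 and
   MASK1 == ~0x0000ff00 the test above gives t == 0x10000, a power of two,
   so ((x & MASK1) | ((y << 8) & 0x0000ff00)) can become
   "bfi x0, x1, #8, #8" (x0 holding x, x1 holding y).  */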
13711 /* Calculate the cost of calculating X, storing it in *COST. Result
13712 is true if the total cost of the operation has now been calculated. */
13713 static bool
13714 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13715 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13717 rtx op0, op1, op2;
13718 const struct cpu_cost_table *extra_cost
13719 = aarch64_tune_params.insn_extra_cost;
13720 rtx_code code = GET_CODE (x);
13721 scalar_int_mode int_mode;
13723 /* By default, assume that everything has equivalent cost to the
13724 cheapest instruction. Any additional costs are applied as a delta
13725 above this default. */
13726 *cost = COSTS_N_INSNS (1);
13728 switch (code)
13730 case SET:
13731 /* The cost depends entirely on the operands to SET. */
13732 *cost = 0;
13733 op0 = SET_DEST (x);
13734 op1 = SET_SRC (x);
13736 switch (GET_CODE (op0))
13738 case MEM:
13739 if (speed)
13741 rtx address = XEXP (op0, 0);
13742 if (VECTOR_MODE_P (mode))
13743 *cost += extra_cost->ldst.storev;
13744 else if (GET_MODE_CLASS (mode) == MODE_INT)
13745 *cost += extra_cost->ldst.store;
13746 else if (mode == SFmode || mode == SDmode)
13747 *cost += extra_cost->ldst.storef;
13748 else if (mode == DFmode || mode == DDmode)
13749 *cost += extra_cost->ldst.stored;
13751 *cost +=
13752 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13753 0, speed));
13756 *cost += rtx_cost (op1, mode, SET, 1, speed);
13757 return true;
13759 case SUBREG:
13760 if (! REG_P (SUBREG_REG (op0)))
13761 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13763 /* Fall through. */
13764 case REG:
13765 /* The cost is one per vector-register copied. */
13766 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13768 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13769 *cost = COSTS_N_INSNS (nregs);
13771 /* const0_rtx is in general free, but we will use an
13772 instruction to set a register to 0. */
13773 else if (REG_P (op1) || op1 == const0_rtx)
13775 /* The cost is 1 per register copied. */
13776 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13777 *cost = COSTS_N_INSNS (nregs);
13779 else
13780 /* Cost is just the cost of the RHS of the set. */
13781 *cost += rtx_cost (op1, mode, SET, 1, speed);
13782 return true;
13784 case ZERO_EXTRACT:
13785 case SIGN_EXTRACT:
13786 /* Bit-field insertion. Strip any redundant widening of
13787 the RHS to meet the width of the target. */
13788 if (SUBREG_P (op1))
13789 op1 = SUBREG_REG (op1);
13790 if ((GET_CODE (op1) == ZERO_EXTEND
13791 || GET_CODE (op1) == SIGN_EXTEND)
13792 && CONST_INT_P (XEXP (op0, 1))
13793 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13794 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13795 op1 = XEXP (op1, 0);
13797 if (CONST_INT_P (op1))
13799 /* MOV immediate is assumed to always be cheap. */
13800 *cost = COSTS_N_INSNS (1);
13802 else
13804 /* BFM. */
13805 if (speed)
13806 *cost += extra_cost->alu.bfi;
13807 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13810 return true;
13812 default:
13813 /* We can't make sense of this, assume default cost. */
13814 *cost = COSTS_N_INSNS (1);
13815 return false;
13817 return false;
13819 case CONST_INT:
13820 /* If an instruction can incorporate a constant within the
13821 instruction, the instruction's expression avoids calling
13822 rtx_cost() on the constant. If rtx_cost() is called on a
13823 constant, then it is usually because the constant must be
13824 moved into a register by one or more instructions.
13826 The exception is constant 0, which can be expressed
13827 as XZR/WZR and is therefore free. The exception to this is
13828 if we have (set (reg) (const0_rtx)) in which case we must cost
13829 the move. However, we can catch that when we cost the SET, so
13830 we don't need to consider that here. */
13831 if (x == const0_rtx)
13832 *cost = 0;
13833 else
13835 /* To an approximation, building any other constant is
13836 proportionally expensive to the number of instructions
13837 required to build that constant. This is true whether we
13838 are compiling for SPEED or otherwise. */
13839 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
13840 ? SImode : DImode;
13841 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13842 (NULL_RTX, x, false, imode));
13844 return true;
13846 case CONST_DOUBLE:
13848 /* First determine number of instructions to do the move
13849 as an integer constant. */
13850 if (!aarch64_float_const_representable_p (x)
13851 && !aarch64_can_const_movi_rtx_p (x, mode)
13852 && aarch64_float_const_rtx_p (x))
13854 unsigned HOST_WIDE_INT ival;
13855 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13856 gcc_assert (succeed);
13858 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
13859 ? DImode : SImode;
13860 int ncost = aarch64_internal_mov_immediate
13861 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13862 *cost += COSTS_N_INSNS (ncost);
13863 return true;
13866 if (speed)
13868 /* mov[df,sf]_aarch64. */
13869 if (aarch64_float_const_representable_p (x))
13870 /* FMOV (scalar immediate). */
13871 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
13872 else if (!aarch64_float_const_zero_rtx_p (x))
13874 /* This will be a load from memory. */
13875 if (mode == DFmode || mode == DDmode)
13876 *cost += extra_cost->ldst.loadd;
13877 else
13878 *cost += extra_cost->ldst.loadf;
13880 else
13881 /* Otherwise this is +0.0. We get this using MOVI d0, #0
13882 or MOV v0.s[0], wzr - neither of which are modeled by the
13883 cost tables. Just use the default cost. */
13888 return true;
13890 case MEM:
13891 if (speed)
13893 /* For loads we want the base cost of a load, plus an
13894 approximation for the additional cost of the addressing
13895 mode. */
13896 rtx address = XEXP (x, 0);
13897 if (VECTOR_MODE_P (mode))
13898 *cost += extra_cost->ldst.loadv;
13899 else if (GET_MODE_CLASS (mode) == MODE_INT)
13900 *cost += extra_cost->ldst.load;
13901 else if (mode == SFmode || mode == SDmode)
13902 *cost += extra_cost->ldst.loadf;
13903 else if (mode == DFmode || mode == DDmode)
13904 *cost += extra_cost->ldst.loadd;
13906 *cost +=
13907 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13908 0, speed));
13911 return true;
13913 case NEG:
13914 op0 = XEXP (x, 0);
13916 if (VECTOR_MODE_P (mode))
13918 if (speed)
13920 /* FNEG. */
13921 *cost += extra_cost->vect.alu;
13923 return false;
13926 if (GET_MODE_CLASS (mode) == MODE_INT)
13928 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
13929 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
13931 /* CSETM. */
13932 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
13933 return true;
13936 /* Cost this as SUB wzr, X. */
13937 op0 = CONST0_RTX (mode);
13938 op1 = XEXP (x, 0);
13939 goto cost_minus;
13942 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13944 /* Support (neg(fma...)) as a single instruction only if
13945 sign of zeros is unimportant. This matches the decision
13946 making in aarch64.md. */
13947 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
13949 /* FNMADD. */
13950 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13951 return true;
13953 if (GET_CODE (op0) == MULT)
13955 /* FNMUL. */
13956 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13957 return true;
13959 if (speed)
13960 /* FNEG. */
13961 *cost += extra_cost->fp[mode == DFmode].neg;
13962 return false;
13965 return false;
13967 case CLRSB:
13968 case CLZ:
13969 if (speed)
13971 if (VECTOR_MODE_P (mode))
13972 *cost += extra_cost->vect.alu;
13973 else
13974 *cost += extra_cost->alu.clz;
13977 return false;
13979 case CTZ:
13980 *cost = COSTS_N_INSNS (2);
13982 if (speed)
13983 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
13984 return false;
13986 case COMPARE:
13987 op0 = XEXP (x, 0);
13988 op1 = XEXP (x, 1);
13990 if (op1 == const0_rtx
13991 && GET_CODE (op0) == AND)
13993 x = op0;
13994 mode = GET_MODE (op0);
13995 goto cost_logic;
13998 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14000 /* TODO: A write to the CC flags possibly costs extra, this
14001 needs encoding in the cost tables. */
14003 mode = GET_MODE (op0);
14004 /* ANDS. */
14005 if (GET_CODE (op0) == AND)
14007 x = op0;
14008 goto cost_logic;
14011 if (GET_CODE (op0) == PLUS)
14013 /* ADDS (and CMN alias). */
14014 x = op0;
14015 goto cost_plus;
14018 if (GET_CODE (op0) == MINUS)
14020 /* SUBS. */
14021 x = op0;
14022 goto cost_minus;
14025 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14026 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14027 && CONST_INT_P (XEXP (op0, 2)))
14029 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14030 Handle it here directly rather than going to cost_logic
14031 since we know the immediate generated for the TST is valid
14032 so we can avoid creating an intermediate rtx for it only
14033 for costing purposes. */
14034 if (speed)
14035 *cost += extra_cost->alu.logical;
14037 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14038 ZERO_EXTRACT, 0, speed);
14039 return true;
14042 if (GET_CODE (op1) == NEG)
14044 /* CMN. */
14045 if (speed)
14046 *cost += extra_cost->alu.arith;
14048 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14049 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14050 return true;
14053 /* CMP.
14055 Compare can freely swap the order of operands, and
14056 canonicalization puts the more complex operation first.
14057 But the integer MINUS logic expects the shift/extend
14058 operation in op1. */
14059 if (! (REG_P (op0)
14060 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14062 op0 = XEXP (x, 1);
14063 op1 = XEXP (x, 0);
14065 goto cost_minus;
14068 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14070 /* FCMP. */
14071 if (speed)
14072 *cost += extra_cost->fp[mode == DFmode].compare;
14074 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14076 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14077 /* FCMP supports constant 0.0 for no extra cost. */
14078 return true;
14080 return false;
14083 if (VECTOR_MODE_P (mode))
14085 /* Vector compare. */
14086 if (speed)
14087 *cost += extra_cost->vect.alu;
14089 if (aarch64_float_const_zero_rtx_p (op1))
14091 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14092 cost. */
14093 return true;
14095 return false;
14097 return false;
14099 case MINUS:
14101 op0 = XEXP (x, 0);
14102 op1 = XEXP (x, 1);
14104 cost_minus:
14105 if (VECTOR_MODE_P (mode))
14107 /* SUBL2 and SUBW2. */
14108 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14109 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14111 /* The select-operand-high-half versions of the sub instruction
14112 have the same cost as the regular three vector version -
14113 don't add the costs of the select into the costs of the sub. */
14115 op0 = aarch64_strip_extend_vec_half (op0);
14116 op1 = aarch64_strip_extend_vec_half (op1);
14120 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14122 /* Detect valid immediates. */
14123 if ((GET_MODE_CLASS (mode) == MODE_INT
14124 || (GET_MODE_CLASS (mode) == MODE_CC
14125 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14126 && CONST_INT_P (op1)
14127 && aarch64_uimm12_shift (INTVAL (op1)))
14129 if (speed)
14130 /* SUB(S) (immediate). */
14131 *cost += extra_cost->alu.arith;
14132 return true;
14135 /* Look for SUB (extended register). */
14136 if (is_a <scalar_int_mode> (mode)
14137 && aarch64_rtx_arith_op_extract_p (op1))
14139 if (speed)
14140 *cost += extra_cost->alu.extend_arith;
14142 op1 = aarch64_strip_extend (op1, true);
14143 *cost += rtx_cost (op1, VOIDmode,
14144 (enum rtx_code) GET_CODE (op1), 0, speed);
14145 return true;
14148 rtx new_op1 = aarch64_strip_extend (op1, false);
14150 /* Cost this as an FMA-alike operation. */
14151 if ((GET_CODE (new_op1) == MULT
14152 || aarch64_shift_p (GET_CODE (new_op1)))
14153 && code != COMPARE)
14155 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14156 (enum rtx_code) code,
14157 speed);
14158 return true;
14161 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14163 if (speed)
14165 if (VECTOR_MODE_P (mode))
14167 /* Vector SUB. */
14168 *cost += extra_cost->vect.alu;
14170 else if (GET_MODE_CLASS (mode) == MODE_INT)
14172 /* SUB(S). */
14173 *cost += extra_cost->alu.arith;
14175 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14177 /* FSUB. */
14178 *cost += extra_cost->fp[mode == DFmode].addsub;
14181 return true;
14184 case PLUS:
14186 rtx new_op0;
14188 op0 = XEXP (x, 0);
14189 op1 = XEXP (x, 1);
14191 cost_plus:
14192 if (VECTOR_MODE_P (mode))
14194 /* ADDL2 and ADDW2. */
14195 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14196 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14198 /* The select-operand-high-half versions of the add instruction
14199 have the same cost as the regular three vector version -
14200 don't add the costs of the select into the costs of the add. */
14202 op0 = aarch64_strip_extend_vec_half (op0);
14203 op1 = aarch64_strip_extend_vec_half (op1);
14207 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14208 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14210 /* CSINC. */
14211 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14212 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14213 return true;
14216 if (GET_MODE_CLASS (mode) == MODE_INT
14217 && (aarch64_plus_immediate (op1, mode)
14218 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14220 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14222 if (speed)
14224 /* ADD (immediate). */
14225 *cost += extra_cost->alu.arith;
14227 /* Some tunings prefer to not use the VL-based scalar ops.
14228 Increase the cost of the poly immediate to prevent their
14229 formation. */
14230 if (GET_CODE (op1) == CONST_POLY_INT
14231 && (aarch64_tune_params.extra_tuning_flags
14232 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14233 *cost += COSTS_N_INSNS (1);
14235 return true;
14238 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14240 /* Look for ADD (extended register). */
14241 if (is_a <scalar_int_mode> (mode)
14242 && aarch64_rtx_arith_op_extract_p (op0))
14244 if (speed)
14245 *cost += extra_cost->alu.extend_arith;
14247 op0 = aarch64_strip_extend (op0, true);
14248 *cost += rtx_cost (op0, VOIDmode,
14249 (enum rtx_code) GET_CODE (op0), 0, speed);
14250 return true;
14253 /* Strip any extend, leave shifts behind as we will
14254 cost them through mult_cost. */
14255 new_op0 = aarch64_strip_extend (op0, false);
14257 if (GET_CODE (new_op0) == MULT
14258 || aarch64_shift_p (GET_CODE (new_op0)))
14260 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14261 speed);
14262 return true;
14265 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14267 if (speed)
14269 if (VECTOR_MODE_P (mode))
14271 /* Vector ADD. */
14272 *cost += extra_cost->vect.alu;
14274 else if (GET_MODE_CLASS (mode) == MODE_INT)
14276 /* ADD. */
14277 *cost += extra_cost->alu.arith;
14279 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14281 /* FADD. */
14282 *cost += extra_cost->fp[mode == DFmode].addsub;
14285 return true;
14288 case BSWAP:
14289 *cost = COSTS_N_INSNS (1);
14291 if (speed)
14293 if (VECTOR_MODE_P (mode))
14294 *cost += extra_cost->vect.alu;
14295 else
14296 *cost += extra_cost->alu.rev;
14298 return false;
14300 case IOR:
14301 if (aarch_rev16_p (x))
14303 *cost = COSTS_N_INSNS (1);
14305 if (speed)
14307 if (VECTOR_MODE_P (mode))
14308 *cost += extra_cost->vect.alu;
14309 else
14310 *cost += extra_cost->alu.rev;
14312 return true;
14315 if (aarch64_extr_rtx_p (x, &op0, &op1))
14317 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14318 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14319 if (speed)
14320 *cost += extra_cost->alu.shift;
14322 return true;
14324 /* Fall through. */
14325 case XOR:
14326 case AND:
14327 cost_logic:
14328 op0 = XEXP (x, 0);
14329 op1 = XEXP (x, 1);
14331 if (VECTOR_MODE_P (mode))
14333 if (speed)
14334 *cost += extra_cost->vect.alu;
14335 return true;
14338 if (code == AND
14339 && GET_CODE (op0) == MULT
14340 && CONST_INT_P (XEXP (op0, 1))
14341 && CONST_INT_P (op1)
14342 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14343 INTVAL (op1)) != 0)
14345 /* This is a UBFM/SBFM. */
14346 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14347 if (speed)
14348 *cost += extra_cost->alu.bfx;
14349 return true;
14352 if (is_int_mode (mode, &int_mode))
14354 if (CONST_INT_P (op1))
14356 /* We have a mask + shift version of a UBFIZ
14357 i.e. the *andim_ashift<mode>_bfiz pattern. */
14358 if (GET_CODE (op0) == ASHIFT
14359 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14360 XEXP (op0, 1)))
14362 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14363 (enum rtx_code) code, 0, speed);
14364 if (speed)
14365 *cost += extra_cost->alu.bfx;
14367 return true;
14369 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14371 /* We possibly get the immediate for free, this is not
14372 modelled. */
14373 *cost += rtx_cost (op0, int_mode,
14374 (enum rtx_code) code, 0, speed);
14375 if (speed)
14376 *cost += extra_cost->alu.logical;
14378 return true;
14381 else
14383 rtx new_op0 = op0;
14385 /* Handle ORN, EON, or BIC. */
14386 if (GET_CODE (op0) == NOT)
14387 op0 = XEXP (op0, 0);
14389 new_op0 = aarch64_strip_shift (op0);
14391 /* If we had a shift on op0 then this is a logical-shift-
14392 by-register/immediate operation. Otherwise, this is just
14393 a logical operation. */
14394 if (speed)
14396 if (new_op0 != op0)
14398 /* Shift by immediate. */
14399 if (CONST_INT_P (XEXP (op0, 1)))
14400 *cost += extra_cost->alu.log_shift;
14401 else
14402 *cost += extra_cost->alu.log_shift_reg;
14404 else
14405 *cost += extra_cost->alu.logical;
14408 /* In both cases we want to cost both operands. */
14409 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14410 0, speed);
14411 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14412 1, speed);
14414 return true;
14417 return false;
14419 case NOT:
14420 x = XEXP (x, 0);
14421 op0 = aarch64_strip_shift (x);
14423 if (VECTOR_MODE_P (mode))
14425 /* Vector NOT. */
14426 *cost += extra_cost->vect.alu;
14427 return false;
14430 /* MVN-shifted-reg. */
14431 if (op0 != x)
14433 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14435 if (speed)
14436 *cost += extra_cost->alu.log_shift;
14438 return true;
14440 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14441 Handle the second form here taking care that 'a' in the above can
14442 be a shift. */
14443 else if (GET_CODE (op0) == XOR)
14445 rtx newop0 = XEXP (op0, 0);
14446 rtx newop1 = XEXP (op0, 1);
14447 rtx op0_stripped = aarch64_strip_shift (newop0);
14449 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14450 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14452 if (speed)
14454 if (op0_stripped != newop0)
14455 *cost += extra_cost->alu.log_shift;
14456 else
14457 *cost += extra_cost->alu.logical;
14460 return true;
14462 /* MVN. */
14463 if (speed)
14464 *cost += extra_cost->alu.logical;
14466 return false;
14468 case ZERO_EXTEND:
14470 op0 = XEXP (x, 0);
14471 /* If a value is written in SI mode, then zero extended to DI
14472 mode, the operation will in general be free as a write to
14473 a 'w' register implicitly zeroes the upper bits of an 'x'
14474 register. However, if this is
14476 (set (reg) (zero_extend (reg)))
14478 we must cost the explicit register move. */
14479 if (mode == DImode
14480 && GET_MODE (op0) == SImode)
14482 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14484 /* If OP_COST is non-zero, then the cost of the zero extend
14485 is effectively the cost of the inner operation. Otherwise
14486 we have a MOV instruction and we take the cost from the MOV
14487 itself. This is true independently of whether we are
14488 optimizing for space or time. */
14489 if (op_cost)
14490 *cost = op_cost;
14492 return true;
14494 else if (MEM_P (op0))
14496 /* All loads can zero extend to any size for free. */
14497 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14498 return true;
14501 op0 = aarch64_extend_bitfield_pattern_p (x);
14502 if (op0)
14504 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14505 if (speed)
14506 *cost += extra_cost->alu.bfx;
14507 return true;
14510 if (speed)
14512 if (VECTOR_MODE_P (mode))
14514 /* UMOV. */
14515 *cost += extra_cost->vect.alu;
14517 else
14519 /* We generate an AND instead of UXTB/UXTH. */
14520 *cost += extra_cost->alu.logical;
14523 return false;
14525 case SIGN_EXTEND:
14526 if (MEM_P (XEXP (x, 0)))
14528 /* LDRSH. */
14529 if (speed)
14531 rtx address = XEXP (XEXP (x, 0), 0);
14532 *cost += extra_cost->ldst.load_sign_extend;
14534 *cost +=
14535 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14536 0, speed));
14538 return true;
14541 op0 = aarch64_extend_bitfield_pattern_p (x);
14542 if (op0)
14544 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14545 if (speed)
14546 *cost += extra_cost->alu.bfx;
14547 return true;
14550 if (speed)
14552 if (VECTOR_MODE_P (mode))
14553 *cost += extra_cost->vect.alu;
14554 else
14555 *cost += extra_cost->alu.extend;
14557 return false;
14559 case ASHIFT:
14560 op0 = XEXP (x, 0);
14561 op1 = XEXP (x, 1);
14563 if (CONST_INT_P (op1))
14565 if (speed)
14567 if (VECTOR_MODE_P (mode))
14569 /* Vector shift (immediate). */
14570 *cost += extra_cost->vect.alu;
14572 else
14574 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
14575 aliases. */
14576 *cost += extra_cost->alu.shift;
14580 /* We can incorporate zero/sign extend for free. */
14581 if (GET_CODE (op0) == ZERO_EXTEND
14582 || GET_CODE (op0) == SIGN_EXTEND)
14583 op0 = XEXP (op0, 0);
14585 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14586 return true;
14588 else
14590 if (VECTOR_MODE_P (mode))
14592 if (speed)
14593 /* Vector shift (register). */
14594 *cost += extra_cost->vect.alu;
14596 else
14598 if (speed)
14599 /* LSLV. */
14600 *cost += extra_cost->alu.shift_reg;
14602 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14603 && CONST_INT_P (XEXP (op1, 1))
14604 && known_eq (INTVAL (XEXP (op1, 1)),
14605 GET_MODE_BITSIZE (mode) - 1))
14607 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14608 /* We already demanded XEXP (op1, 0) to be REG_P, so
14609 don't recurse into it. */
14610 return true;
14613 return false; /* All arguments need to be in registers. */
14616 case ROTATE:
14617 case ROTATERT:
14618 case LSHIFTRT:
14619 case ASHIFTRT:
14620 op0 = XEXP (x, 0);
14621 op1 = XEXP (x, 1);
14623 if (CONST_INT_P (op1))
14625 /* ASR (immediate) and friends. */
14626 if (speed)
14628 if (VECTOR_MODE_P (mode))
14629 *cost += extra_cost->vect.alu;
14630 else
14631 *cost += extra_cost->alu.shift;
14634 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14635 return true;
14637 else
14639 if (VECTOR_MODE_P (mode))
14641 if (speed)
14642 /* Vector shift (register). */
14643 *cost += extra_cost->vect.alu;
14645 else
14647 if (speed)
14648 /* ASR (register) and friends. */
14649 *cost += extra_cost->alu.shift_reg;
14651 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14652 && CONST_INT_P (XEXP (op1, 1))
14653 && known_eq (INTVAL (XEXP (op1, 1)),
14654 GET_MODE_BITSIZE (mode) - 1))
14656 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14657 /* We already demanded XEXP (op1, 0) to be REG_P, so
14658 don't recurse into it. */
14659 return true;
14662 return false; /* All arguments need to be in registers. */
14665 case SYMBOL_REF:
14667 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14668 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14670 /* LDR. */
14671 if (speed)
14672 *cost += extra_cost->ldst.load;
14674 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14675 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14677 /* ADRP, followed by ADD. */
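/* E.g. for the small code model:

	adrp	x0, sym
	add	x0, x0, :lo12:sym  */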
14678 *cost += COSTS_N_INSNS (1);
14679 if (speed)
14680 *cost += 2 * extra_cost->alu.arith;
14682 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14683 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14685 /* ADR. */
14686 if (speed)
14687 *cost += extra_cost->alu.arith;
14690 if (flag_pic)
14692 /* One extra load instruction, after accessing the GOT. */
14693 *cost += COSTS_N_INSNS (1);
14694 if (speed)
14695 *cost += extra_cost->ldst.load;
14697 return true;
14699 case HIGH:
14700 case LO_SUM:
14701 /* ADRP/ADD (immediate). */
14702 if (speed)
14703 *cost += extra_cost->alu.arith;
14704 return true;
14706 case ZERO_EXTRACT:
14707 case SIGN_EXTRACT:
14708 /* UBFX/SBFX. */
14709 if (speed)
14711 if (VECTOR_MODE_P (mode))
14712 *cost += extra_cost->vect.alu;
14713 else
14714 *cost += extra_cost->alu.bfx;
14717 /* We can trust that the immediates used will be correct (there
14718 are no by-register forms), so we need only cost op0. */
14719 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14720 return true;
14722 case MULT:
14723 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14724 /* aarch64_rtx_mult_cost always handles recursion to its
14725 operands. */
14726 return true;
14728 case MOD:
14729 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14730 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
14731 an unconditional negate. This case should only ever be reached through
14732 the set_smod_pow2_cheap check in expmed.cc. */
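/* For example, a signed "x % 4" can be emitted roughly as:

	negs	w1, w0
	and	w0, w0, 3
	and	w1, w1, 3
	csneg	w0, w0, w1, mi

   (register allocation and operand order may differ).  */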
14733 if (CONST_INT_P (XEXP (x, 1))
14734 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14735 && (mode == SImode || mode == DImode))
14737 /* We expand to 4 instructions. Reset the baseline. */
14738 *cost = COSTS_N_INSNS (4);
14740 if (speed)
14741 *cost += 2 * extra_cost->alu.logical
14742 + 2 * extra_cost->alu.arith;
14744 return true;
14747 /* Fall-through. */
14748 case UMOD:
14749 if (speed)
14751 /* Slightly prefer UMOD over SMOD. */
14752 if (VECTOR_MODE_P (mode))
14753 *cost += extra_cost->vect.alu;
14754 else if (GET_MODE_CLASS (mode) == MODE_INT)
14755 *cost += (extra_cost->mult[mode == DImode].add
14756 + extra_cost->mult[mode == DImode].idiv
14757 + (code == MOD ? 1 : 0));
14759 return false; /* All arguments need to be in registers. */
14761 case DIV:
14762 case UDIV:
14763 case SQRT:
14764 if (speed)
14766 if (VECTOR_MODE_P (mode))
14767 *cost += extra_cost->vect.alu;
14768 else if (GET_MODE_CLASS (mode) == MODE_INT)
14769 /* There is no integer SQRT, so only DIV and UDIV can get
14770 here. */
14771 *cost += (extra_cost->mult[mode == DImode].idiv
14772 /* Slightly prefer UDIV over SDIV. */
14773 + (code == DIV ? 1 : 0));
14774 else
14775 *cost += extra_cost->fp[mode == DFmode].div;
14777 return false; /* All arguments need to be in registers. */
14779 case IF_THEN_ELSE:
14780 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14781 XEXP (x, 2), cost, speed);
14783 case EQ:
14784 case NE:
14785 case GT:
14786 case GTU:
14787 case LT:
14788 case LTU:
14789 case GE:
14790 case GEU:
14791 case LE:
14792 case LEU:
14794 return false; /* All arguments must be in registers. */
14796 case FMA:
14797 op0 = XEXP (x, 0);
14798 op1 = XEXP (x, 1);
14799 op2 = XEXP (x, 2);
14801 if (speed)
14803 if (VECTOR_MODE_P (mode))
14804 *cost += extra_cost->vect.alu;
14805 else
14806 *cost += extra_cost->fp[mode == DFmode].fma;
14809 /* FMSUB, FNMADD, and FNMSUB are free. */
14810 if (GET_CODE (op0) == NEG)
14811 op0 = XEXP (op0, 0);
14813 if (GET_CODE (op2) == NEG)
14814 op2 = XEXP (op2, 0);
14816 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14817 and the by-element operand as operand 0. */
14818 if (GET_CODE (op1) == NEG)
14819 op1 = XEXP (op1, 0);
14821 /* Catch vector-by-element operations. The by-element operand can
14822 either be (vec_duplicate (vec_select (x))) or just
14823 (vec_select (x)), depending on whether we are multiplying by
14824 a vector or a scalar.
14826 Canonicalization is not very good in these cases: FMA4 will put the
14827 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
14828 if (GET_CODE (op0) == VEC_DUPLICATE)
14829 op0 = XEXP (op0, 0);
14830 else if (GET_CODE (op1) == VEC_DUPLICATE)
14831 op1 = XEXP (op1, 0);
14833 if (GET_CODE (op0) == VEC_SELECT)
14834 op0 = XEXP (op0, 0);
14835 else if (GET_CODE (op1) == VEC_SELECT)
14836 op1 = XEXP (op1, 0);
14838 /* If the remaining parameters are not registers,
14839 get the cost to put them into registers. */
14840 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14841 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14842 *cost += rtx_cost (op2, mode, FMA, 2, speed);
14843 return true;
14845 case FLOAT:
14846 case UNSIGNED_FLOAT:
14847 if (speed)
14848 *cost += extra_cost->fp[mode == DFmode].fromint;
14849 return false;
14851 case FLOAT_EXTEND:
14852 if (speed)
14854 if (VECTOR_MODE_P (mode))
14856 /* Vector widen. */
14857 *cost += extra_cost->vect.alu;
14859 else
14860 *cost += extra_cost->fp[mode == DFmode].widen;
14862 return false;
14864 case FLOAT_TRUNCATE:
14865 if (speed)
14867 if (VECTOR_MODE_P (mode))
14869 /* Vector narrow. */
14870 *cost += extra_cost->vect.alu;
14872 else
14873 *cost += extra_cost->fp[mode == DFmode].narrow;
14875 return false;
14877 case FIX:
14878 case UNSIGNED_FIX:
14879 x = XEXP (x, 0);
14880 /* Strip the rounding part. They will all be implemented
14881 by the fcvt* family of instructions anyway. */
14882 if (GET_CODE (x) == UNSPEC)
14884 unsigned int uns_code = XINT (x, 1);
14886 if (uns_code == UNSPEC_FRINTA
14887 || uns_code == UNSPEC_FRINTM
14888 || uns_code == UNSPEC_FRINTN
14889 || uns_code == UNSPEC_FRINTP
14890 || uns_code == UNSPEC_FRINTZ)
14891 x = XVECEXP (x, 0, 0);
14894 if (speed)
14896 if (VECTOR_MODE_P (mode))
14897 *cost += extra_cost->vect.alu;
14898 else
14899 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
14902 /* We can combine fmul by a power of 2 followed by a fcvt into a single
14903 fixed-point fcvt. */
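/* E.g. a multiply by 16.0 followed by a convert-to-integer can become a
   single "fcvtzs w0, s0, #4", which scales by 2^4 as part of the
   conversion.  */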
14904 if (GET_CODE (x) == MULT
14905 && ((VECTOR_MODE_P (mode)
14906 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
14907 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
14909 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
14910 0, speed);
14911 return true;
14914 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
14915 return true;
14917 case ABS:
14918 if (VECTOR_MODE_P (mode))
14920 /* ABS (vector). */
14921 if (speed)
14922 *cost += extra_cost->vect.alu;
14924 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14926 op0 = XEXP (x, 0);
14928 /* FABD, which is analogous to FADD. */
14929 if (GET_CODE (op0) == MINUS)
14931 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
14932 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
14933 if (speed)
14934 *cost += extra_cost->fp[mode == DFmode].addsub;
14936 return true;
14938 /* Simple FABS is analogous to FNEG. */
14939 if (speed)
14940 *cost += extra_cost->fp[mode == DFmode].neg;
14942 else
14944 /* Integer ABS will either be split into
14945 two arithmetic instructions, or will be an ABS
14946 (scalar), which we don't model. */
14947 *cost = COSTS_N_INSNS (2);
14948 if (speed)
14949 *cost += 2 * extra_cost->alu.arith;
14951 return false;
14953 case SMAX:
14954 case SMIN:
14955 if (speed)
14957 if (VECTOR_MODE_P (mode))
14958 *cost += extra_cost->vect.alu;
14959 else
14961 /* FMAXNM/FMINNM/FMAX/FMIN.
14962 TODO: This may not be accurate for all implementations, but
14963 we do not model this in the cost tables. */
14964 *cost += extra_cost->fp[mode == DFmode].addsub;
14967 return false;
14969 case UNSPEC:
14970 /* The floating point round to integer frint* instructions. */
14971 if (aarch64_frint_unspec_p (XINT (x, 1)))
14973 if (speed)
14974 *cost += extra_cost->fp[mode == DFmode].roundint;
14976 return false;
14979 if (XINT (x, 1) == UNSPEC_RBIT)
14981 if (speed)
14982 *cost += extra_cost->alu.rev;
14984 return false;
14986 break;
14988 case TRUNCATE:
14990 /* Decompose <su>muldi3_highpart. */
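/* The unsigned form of the pattern being matched is:

     (truncate:DI
       (lshiftrt:TI
	 (mult:TI (zero_extend:TI (reg:DI)) (zero_extend:TI (reg:DI)))
	 (const_int 64)))

   which corresponds to a single UMULH instruction (SMULH for the
   sign-extended form).  */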
14991 if (/* (truncate:DI */
14992 mode == DImode
14993 /* (lshiftrt:TI */
14994 && GET_MODE (XEXP (x, 0)) == TImode
14995 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
14996 /* (mult:TI */
14997 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
14998 /* (ANY_EXTEND:TI (reg:DI))
14999 (ANY_EXTEND:TI (reg:DI))) */
15000 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15001 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15002 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15003 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15004 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15005 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15006 /* (const_int 64) */
15007 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15008 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15010 /* UMULH/SMULH. */
15011 if (speed)
15012 *cost += extra_cost->mult[mode == DImode].extend;
15013 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15014 mode, MULT, 0, speed);
15015 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15016 mode, MULT, 1, speed);
15017 return true;
15019 break;
15020 case CONST_VECTOR:
15022 /* Load using MOVI/MVNI. */
15023 if (aarch64_simd_valid_immediate (x, NULL))
15024 *cost = extra_cost->vect.movi;
15025 else /* Load using constant pool. */
15026 *cost = extra_cost->ldst.load;
15027 break;
15029 case VEC_CONCAT:
15030 /* Depending on the operation, either a DUP or an INS.
15031 For now, keep the default costing. */
15032 break;
15033 case VEC_DUPLICATE:
15034 /* Load using a DUP. */
15035 *cost = extra_cost->vect.dup;
15036 return false;
15037 case VEC_SELECT:
15039 rtx op0 = XEXP (x, 0);
15040 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15042 /* Cost a lowpart select as a free subreg, a highpart select as a DUP, and anything else as an extract. */
15043 rtx op1 = XEXP (x, 1);
15044 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15046 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15047 *cost = extra_cost->vect.dup;
15048 else
15049 *cost = extra_cost->vect.extract;
15050 return true;
15052 default:
15053 break;
15056 if (dump_file
15057 && flag_aarch64_verbose_cost)
15058 fprintf (dump_file,
15059 "\nFailed to cost RTX. Assuming default cost.\n");
15061 return true;
15064 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15065 calculated for X. This cost is stored in *COST. Returns true
15066 if the total cost of X was calculated. */
15067 static bool
15068 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15069 int param, int *cost, bool speed)
15071 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15073 if (dump_file
15074 && flag_aarch64_verbose_cost)
15076 print_rtl_single (dump_file, x);
15077 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15078 speed ? "Hot" : "Cold",
15079 *cost, result ? "final" : "partial");
15082 return result;
15085 static int
15086 aarch64_register_move_cost (machine_mode mode,
15087 reg_class_t from_i, reg_class_t to_i)
15089 enum reg_class from = (enum reg_class) from_i;
15090 enum reg_class to = (enum reg_class) to_i;
15091 const struct cpu_regmove_cost *regmove_cost
15092 = aarch64_tune_params.regmove_cost;
15094 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
15095 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
15096 || to == STUB_REGS)
15097 to = GENERAL_REGS;
15099 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
15100 || from == STUB_REGS)
15101 from = GENERAL_REGS;
15103 /* Make RDFFR very expensive. In particular, if we know that the FFR
15104 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15105 as a way of obtaining a PTRUE. */
15106 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15107 && hard_reg_set_subset_p (reg_class_contents[from_i],
15108 reg_class_contents[FFR_REGS]))
15109 return 80;
15111 /* The cost of moving between GPRs and the stack register is the same as GP2GP. */
15112 if ((from == GENERAL_REGS && to == STACK_REG)
15113 || (to == GENERAL_REGS && from == STACK_REG))
15114 return regmove_cost->GP2GP;
15116 /* To/From the stack register, we move via the gprs. */
15117 if (to == STACK_REG || from == STACK_REG)
15118 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15119 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15121 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15122 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15123 && known_eq (GET_MODE_SIZE (mode), 16))
15125 /* 128-bit operations on general registers require 2 instructions. */
15126 if (from == GENERAL_REGS && to == GENERAL_REGS)
15127 return regmove_cost->GP2GP * 2;
15128 else if (from == GENERAL_REGS)
15129 return regmove_cost->GP2FP * 2;
15130 else if (to == GENERAL_REGS)
15131 return regmove_cost->FP2GP * 2;
15133 /* When AdvSIMD instructions are disabled it is not possible to move
15134 a 128-bit value directly between Q registers. This is handled in
15135 secondary reload. A general register is used as a scratch to move
15136 the upper DI value and the lower DI value is moved directly,
15137 hence the cost is the sum of three moves. */
15138 if (! TARGET_SIMD)
15139 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15141 return regmove_cost->FP2FP;
15144 if (from == GENERAL_REGS && to == GENERAL_REGS)
15145 return regmove_cost->GP2GP;
15146 else if (from == GENERAL_REGS)
15147 return regmove_cost->GP2FP;
15148 else if (to == GENERAL_REGS)
15149 return regmove_cost->FP2GP;
15151 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15153 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15154 The cost must be greater than 2 units to indicate that direct
15155 moves aren't possible. */
15156 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15157 + aarch64_tune_params.memmov_cost.store_fp);
15158 return MIN (CEIL (per_vector, 2), 4);
15161 return regmove_cost->FP2FP;
15164 /* Implements TARGET_MEMORY_MOVE_COST. */
15165 static int
15166 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15168 enum reg_class rclass = (enum reg_class) rclass_i;
15169 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15170 ? reg_classes_intersect_p (rclass, PR_REGS)
15171 : reg_class_subset_p (rclass, PR_REGS))
15172 return (in
15173 ? aarch64_tune_params.memmov_cost.load_pred
15174 : aarch64_tune_params.memmov_cost.store_pred);
15176 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15177 ? reg_classes_intersect_p (rclass, FP_REGS)
15178 : reg_class_subset_p (rclass, FP_REGS))
15179 return (in
15180 ? aarch64_tune_params.memmov_cost.load_fp
15181 : aarch64_tune_params.memmov_cost.store_fp);
15183 return (in
15184 ? aarch64_tune_params.memmov_cost.load_int
15185 : aarch64_tune_params.memmov_cost.store_int);
15188 /* Implement TARGET_INIT_BUILTINS. */
15189 static void
15190 aarch64_init_builtins ()
15192 aarch64_general_init_builtins ();
15193 aarch64_sve::init_builtins ();
15194 #ifdef SUBTARGET_INIT_BUILTINS
15195 SUBTARGET_INIT_BUILTINS;
15196 #endif
15199 /* Implement TARGET_FOLD_BUILTIN. */
15200 static tree
15201 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15203 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15204 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15205 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15206 switch (code & AARCH64_BUILTIN_CLASS)
15208 case AARCH64_BUILTIN_GENERAL:
15209 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15211 case AARCH64_BUILTIN_SVE:
15212 return NULL_TREE;
15214 gcc_unreachable ();
15217 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15218 static bool
15219 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15221 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15222 tree fndecl = gimple_call_fndecl (stmt);
15223 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15224 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15225 gimple *new_stmt = NULL;
15226 switch (code & AARCH64_BUILTIN_CLASS)
15228 case AARCH64_BUILTIN_GENERAL:
15229 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15230 break;
15232 case AARCH64_BUILTIN_SVE:
15233 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15234 break;
15237 if (!new_stmt)
15238 return false;
15240 gsi_replace (gsi, new_stmt, true);
15241 return true;
15244 /* Implement TARGET_EXPAND_BUILTIN. */
15245 static rtx
15246 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15248 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15249 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15250 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15251 switch (code & AARCH64_BUILTIN_CLASS)
15253 case AARCH64_BUILTIN_GENERAL:
15254 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15256 case AARCH64_BUILTIN_SVE:
15257 return aarch64_sve::expand_builtin (subcode, exp, target);
15259 gcc_unreachable ();
15262 /* Implement TARGET_BUILTIN_DECL. */
15263 static tree
15264 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15266 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15267 switch (code & AARCH64_BUILTIN_CLASS)
15269 case AARCH64_BUILTIN_GENERAL:
15270 return aarch64_general_builtin_decl (subcode, initialize_p);
15272 case AARCH64_BUILTIN_SVE:
15273 return aarch64_sve::builtin_decl (subcode, initialize_p);
15275 gcc_unreachable ();
15278 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15279 to optimize 1.0/sqrt. */
15281 static bool
15282 use_rsqrt_p (machine_mode mode)
15284 return (!flag_trapping_math
15285 && flag_unsafe_math_optimizations
15286 && ((aarch64_tune_params.approx_modes->recip_sqrt
15287 & AARCH64_APPROX_MODE (mode))
15288 || flag_mrecip_low_precision_sqrt));
15291 /* Function to decide when to use the approximate reciprocal square root
15292 builtin. */
15294 static tree
15295 aarch64_builtin_reciprocal (tree fndecl)
15297 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15299 if (!use_rsqrt_p (mode))
15300 return NULL_TREE;
15301 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15302 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15303 switch (code & AARCH64_BUILTIN_CLASS)
15305 case AARCH64_BUILTIN_GENERAL:
15306 return aarch64_general_builtin_rsqrt (subcode);
15308 case AARCH64_BUILTIN_SVE:
15309 return NULL_TREE;
15311 gcc_unreachable ();
15314 /* Emit code to perform the floating-point operation:
15316 DST = SRC1 * SRC2
15318 where all three operands are already known to be registers.
15319 If the operation is an SVE one, PTRUE is a suitable all-true
15320 predicate. */
15322 static void
15323 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15325 if (ptrue)
15326 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15327 dst, ptrue, src1, src2,
15328 gen_int_mode (SVE_RELAXED_GP, SImode)));
15329 else
15330 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15333 /* Emit instruction sequence to compute either the approximate square root
15334 or its approximate reciprocal, depending on the flag RECP, and return
15335 whether the sequence was emitted or not. */
15337 bool
15338 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15340 machine_mode mode = GET_MODE (dst);
15342 if (GET_MODE_INNER (mode) == HFmode)
15344 gcc_assert (!recp);
15345 return false;
15348 if (!recp)
15350 if (!(flag_mlow_precision_sqrt
15351 || (aarch64_tune_params.approx_modes->sqrt
15352 & AARCH64_APPROX_MODE (mode))))
15353 return false;
15355 if (!flag_finite_math_only
15356 || flag_trapping_math
15357 || !flag_unsafe_math_optimizations
15358 || optimize_function_for_size_p (cfun))
15359 return false;
15361 else
15362 /* Caller assumes we cannot fail. */
15363 gcc_assert (use_rsqrt_p (mode));
15365 rtx pg = NULL_RTX;
15366 if (aarch64_sve_mode_p (mode))
15367 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15368 machine_mode mmsk = (VECTOR_MODE_P (mode)
15369 ? related_int_vector_mode (mode).require ()
15370 : int_mode_for_mode (mode).require ());
15371 rtx xmsk = NULL_RTX;
15372 if (!recp)
15374 /* When calculating the approximate square root, compare the
15375 argument with 0.0 and create a mask. */
15376 rtx zero = CONST0_RTX (mode);
15377 if (pg)
15379 xmsk = gen_reg_rtx (GET_MODE (pg));
15380 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15381 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15382 xmsk, pg, hint, src, zero));
15384 else
15386 xmsk = gen_reg_rtx (mmsk);
15387 emit_insn (gen_rtx_SET (xmsk,
15388 gen_rtx_NEG (mmsk,
15389 gen_rtx_EQ (mmsk, src, zero))));
15393 /* Estimate the approximate reciprocal square root. */
15394 rtx xdst = gen_reg_rtx (mode);
15395 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15397 /* Iterate over the series twice for SF and thrice for DF. */
15398 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15400 /* Optionally iterate over the series once less for faster performance
15401 while sacrificing some accuracy. */
15402 if ((recp && flag_mrecip_low_precision_sqrt)
15403 || (!recp && flag_mlow_precision_sqrt))
15404 iterations--;
15406 /* Iterate over the series to calculate the approximate reciprocal square
15407 root. */
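/* Each iteration refines the estimate using the Newton-Raphson step

     x(n+1) = x(n) * (3 - src * x(n)^2) / 2

   with FRSQRTS computing (3 - a * b) / 2 and the multiplications
   emitted through aarch64_emit_mult.  */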
15408 rtx x1 = gen_reg_rtx (mode);
15409 while (iterations--)
15411 rtx x2 = gen_reg_rtx (mode);
15412 aarch64_emit_mult (x2, pg, xdst, xdst);
15414 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15416 if (iterations > 0)
15417 aarch64_emit_mult (xdst, pg, xdst, x1);
15420 if (!recp)
15422 if (pg)
15423 /* Multiply nonzero source values by the corresponding intermediate
15424 result elements, so that the final calculation is the approximate
15425 square root rather than its reciprocal. Select a zero result for
15426 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15427 otherwise. */
15428 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15429 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15430 else
15432 /* Qualify the approximate reciprocal square root when the
15433 argument is 0.0 by squashing the intermediate result to 0.0. */
15434 rtx xtmp = gen_reg_rtx (mmsk);
15435 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15436 gen_rtx_SUBREG (mmsk, xdst, 0)));
15437 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15439 /* Calculate the approximate square root. */
15440 aarch64_emit_mult (xdst, pg, xdst, src);
15444 /* Finalize the approximation. */
15445 aarch64_emit_mult (dst, pg, xdst, x1);
15447 return true;
15450 /* Emit the instruction sequence to compute the approximation for the division
15451 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
15453 bool
15454 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15456 machine_mode mode = GET_MODE (quo);
15458 if (GET_MODE_INNER (mode) == HFmode)
15459 return false;
15461 bool use_approx_division_p = (flag_mlow_precision_div
15462 || (aarch64_tune_params.approx_modes->division
15463 & AARCH64_APPROX_MODE (mode)));
15465 if (!flag_finite_math_only
15466 || flag_trapping_math
15467 || !flag_unsafe_math_optimizations
15468 || optimize_function_for_size_p (cfun)
15469 || !use_approx_division_p)
15470 return false;
15472 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15473 return false;
15475 rtx pg = NULL_RTX;
15476 if (aarch64_sve_mode_p (mode))
15477 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15479 /* Estimate the approximate reciprocal. */
15480 rtx xrcp = gen_reg_rtx (mode);
15481 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15483 /* Iterate over the series twice for SF and thrice for DF. */
15484 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15486 /* Optionally iterate over the series less for faster performance,
15487 while sacrificing some accuracy. The default is 2 for DF and 1 for SF. */
15488 if (flag_mlow_precision_div)
15489 iterations = (GET_MODE_INNER (mode) == DFmode
15490 ? aarch64_double_recp_precision
15491 : aarch64_float_recp_precision);
15493 /* Iterate over the series to calculate the approximate reciprocal. */
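/* Each iteration refines the estimate using the Newton-Raphson step

     x(n+1) = x(n) * (2 - den * x(n))

   with FRECPS computing 2 - a * b.  */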
15494 rtx xtmp = gen_reg_rtx (mode);
15495 while (iterations--)
15497 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15499 if (iterations > 0)
15500 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15503 if (num != CONST1_RTX (mode))
15505 /* As the approximate reciprocal of DEN is already calculated, only
15506 calculate the approximate division when NUM is not 1.0. */
15507 rtx xnum = force_reg (mode, num);
15508 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15511 /* Finalize the approximation. */
15512 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15513 return true;
15516 /* Return the number of instructions that can be issued per cycle. */
15517 static int
15518 aarch64_sched_issue_rate (void)
15520 return aarch64_tune_params.issue_rate;
15523 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15524 static int
15525 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15527 if (DEBUG_INSN_P (insn))
15528 return more;
15530 rtx_code code = GET_CODE (PATTERN (insn));
15531 if (code == USE || code == CLOBBER)
15532 return more;
15534 if (get_attr_type (insn) == TYPE_NO_INSN)
15535 return more;
15537 return more - 1;
15540 static int
15541 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15543 int issue_rate = aarch64_sched_issue_rate ();
15545 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15549 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15550 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15551 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15553 static int
15554 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15555 int ready_index)
15557 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15561 /* Vectorizer cost model target hooks. */
15563 /* Information about how the CPU would issue the scalar, Advanced SIMD
15564 or SVE version of a vector loop, using the scheme defined by the
15565 aarch64_base_vec_issue_info hierarchy of structures. */
15566 class aarch64_vec_op_count
15568 public:
15569 aarch64_vec_op_count () = default;
15570 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15571 unsigned int = 1);
15573 unsigned int vec_flags () const { return m_vec_flags; }
15574 unsigned int vf_factor () const { return m_vf_factor; }
15576 const aarch64_base_vec_issue_info *base_issue_info () const;
15577 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15578 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15580 fractional_cost rename_cycles_per_iter () const;
15581 fractional_cost min_nonpred_cycles_per_iter () const;
15582 fractional_cost min_pred_cycles_per_iter () const;
15583 fractional_cost min_cycles_per_iter () const;
15585 void dump () const;
15587 /* The number of individual "general" operations. See the comments
15588 in aarch64_base_vec_issue_info for details. */
15589 unsigned int general_ops = 0;
15591 /* The number of load and store operations, under the same scheme
15592 as above. */
15593 unsigned int loads = 0;
15594 unsigned int stores = 0;
15596 /* The minimum number of cycles needed to execute all loop-carried
15597 operations, which in the vector code become associated with
15598 reductions. */
15599 unsigned int reduction_latency = 0;
15601 /* The number of individual predicate operations. See the comments
15602 in aarch64_sve_vec_issue_info for details. */
15603 unsigned int pred_ops = 0;
15605 private:
15606 /* The issue information for the core. */
15607 const aarch64_vec_issue_info *m_issue_info = nullptr;
15609 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15610 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15611 Advanced SIMD code.
15612 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15613 SVE code. */
15614 unsigned int m_vec_flags = 0;
15616 /* Assume that, when the code is executing on the core described
15617 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15618 times more data than the vectorizer anticipates.
15620 This is only ever different from 1 for SVE. It allows us to consider
15621 what would happen on a 256-bit SVE target even when the -mtune
15622 parameters say that the “likely” SVE length is 128 bits. */
15623 unsigned int m_vf_factor = 1;
15626 aarch64_vec_op_count::
15627 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15628 unsigned int vec_flags, unsigned int vf_factor)
15629 : m_issue_info (issue_info),
15630 m_vec_flags (vec_flags),
15631 m_vf_factor (vf_factor)
15635 /* Return the base issue information (i.e. the parts that make sense
15636 for both scalar and vector code). Return null if we have no issue
15637 information. */
15638 const aarch64_base_vec_issue_info *
15639 aarch64_vec_op_count::base_issue_info () const
15641 if (auto *ret = simd_issue_info ())
15642 return ret;
15643 return m_issue_info->scalar;
15646 /* If the structure describes vector code and we have associated issue
15647 information, return that issue information, otherwise return null. */
15648 const aarch64_simd_vec_issue_info *
15649 aarch64_vec_op_count::simd_issue_info () const
15651 if (auto *ret = sve_issue_info ())
15652 return ret;
15653 if (m_vec_flags)
15654 return m_issue_info->advsimd;
15655 return nullptr;
15658 /* If the structure describes SVE code and we have associated issue
15659 information, return that issue information, otherwise return null. */
15660 const aarch64_sve_vec_issue_info *
15661 aarch64_vec_op_count::sve_issue_info () const
15663 if (m_vec_flags & VEC_ANY_SVE)
15664 return m_issue_info->sve;
15665 return nullptr;
15668 /* Estimate the minimum number of cycles per iteration needed to rename
15669 the instructions.
15671 ??? For now this is done inline rather than via cost tables, since it
15672 isn't clear how it should be parameterized for the general case. */
15673 fractional_cost
15674 aarch64_vec_op_count::rename_cycles_per_iter () const
15676 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15677 || sve_issue_info () == &neoversen2_sve_issue_info
15678 || sve_issue_info () == &neoversev2_sve_issue_info)
15679 /* + 1 for an addition. We've already counted a general op for each
15680 store, so we don't need to account for stores separately. The branch
15681 reads no registers and so does not need to be counted either.
15683 ??? This value is very much on the pessimistic side, but seems to work
15684 pretty well in practice. */
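/* I.e. assume these cores can rename roughly five such operations
   per cycle.  */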
15685 return { general_ops + loads + pred_ops + 1, 5 };
15687 return 0;
15690 /* Like min_cycles_per_iter, but excluding predicate operations. */
15691 fractional_cost
15692 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15694 auto *issue_info = base_issue_info ();
15696 fractional_cost cycles = MAX (reduction_latency, 1);
15697 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15698 cycles = std::max (cycles, { loads + stores,
15699 issue_info->loads_stores_per_cycle });
15700 cycles = std::max (cycles, { general_ops,
15701 issue_info->general_ops_per_cycle });
15702 cycles = std::max (cycles, rename_cycles_per_iter ());
15703 return cycles;
15706 /* Like min_cycles_per_iter, but including only the predicate operations. */
15707 fractional_cost
15708 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15710 if (auto *issue_info = sve_issue_info ())
15711 return { pred_ops, issue_info->pred_ops_per_cycle };
15712 return 0;
15715 /* Estimate the minimum number of cycles needed to issue the operations.
15716 This is a very simplistic model! */
15717 fractional_cost
15718 aarch64_vec_op_count::min_cycles_per_iter () const
15720 return std::max (min_nonpred_cycles_per_iter (),
15721 min_pred_cycles_per_iter ());
15724 /* Dump information about the structure. */
15725 void
15726 aarch64_vec_op_count::dump () const
15728 dump_printf_loc (MSG_NOTE, vect_location,
15729 " load operations = %d\n", loads);
15730 dump_printf_loc (MSG_NOTE, vect_location,
15731 " store operations = %d\n", stores);
15732 dump_printf_loc (MSG_NOTE, vect_location,
15733 " general operations = %d\n", general_ops);
15734 if (sve_issue_info ())
15735 dump_printf_loc (MSG_NOTE, vect_location,
15736 " predicate operations = %d\n", pred_ops);
15737 dump_printf_loc (MSG_NOTE, vect_location,
15738 " reduction latency = %d\n", reduction_latency);
15739 if (auto rcpi = rename_cycles_per_iter ())
15740 dump_printf_loc (MSG_NOTE, vect_location,
15741 " estimated cycles per iteration to rename = %f\n",
15742 rcpi.as_double ());
15743 if (auto pred_cpi = min_pred_cycles_per_iter ())
15745 dump_printf_loc (MSG_NOTE, vect_location,
15746 " estimated min cycles per iteration"
15747 " without predication = %f\n",
15748 min_nonpred_cycles_per_iter ().as_double ());
15749 dump_printf_loc (MSG_NOTE, vect_location,
15750 " estimated min cycles per iteration"
15751 " for predication = %f\n", pred_cpi.as_double ());
15753 if (auto cpi = min_cycles_per_iter ())
15754 dump_printf_loc (MSG_NOTE, vect_location,
15755 " estimated min cycles per iteration = %f\n",
15756 cpi.as_double ());
15759 /* Information about vector code that we're in the process of costing. */
15760 class aarch64_vector_costs : public vector_costs
15762 public:
15763 aarch64_vector_costs (vec_info *, bool);
15765 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15766 stmt_vec_info stmt_info, slp_tree, tree vectype,
15767 int misalign,
15768 vect_cost_model_location where) override;
15769 void finish_cost (const vector_costs *) override;
15770 bool better_main_loop_than_p (const vector_costs *other) const override;
15772 private:
15773 void record_potential_advsimd_unrolling (loop_vec_info);
15774 void analyze_loop_vinfo (loop_vec_info);
15775 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15776 aarch64_vec_op_count *);
15777 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15778 fractional_cost, unsigned int,
15779 unsigned int *, bool *);
15780 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15781 unsigned int);
15782 bool prefer_unrolled_loop () const;
15783 unsigned int determine_suggested_unroll_factor ();
15785 /* True if we have performed one-time initialization based on the
15786 vec_info. */
15787 bool m_analyzed_vinfo = false;
15789 /* This loop uses an average operation that is not supported by SVE, but is
15790 supported by Advanced SIMD and SVE2. */
15791 bool m_has_avg = false;
15793 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15794 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15795 SIMD code.
15796 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15797 unsigned int m_vec_flags = 0;
15799 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15800 This means that code such as:
15802 a[0] = x;
15803 a[1] = x;
15805 will be costed as two scalar instructions and two vector instructions
15806 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15807 wins if the costs are equal, because the vector costs
15808 include constant initializations whereas the scalar costs don't.
15809 We would therefore tend to vectorize the code above, even though
15810 the scalar version can use a single STP.
15812 We should eventually fix this and model LDP and STP in the main costs;
15813 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15814 Until then, we look specifically for code that does nothing more than
15815 STP-like operations. We cost them on that basis in addition to the
15816 normal latency-based costs.
15818 If the scalar or vector code could be a sequence of STPs +
15819 initialization, this variable counts the cost of the sequence,
15820 with 2 units per instruction. The variable is ~0U for other
15821 kinds of code. */
15822 unsigned int m_stp_sequence_cost = 0;
15824 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15825 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15826 situations, we try to predict whether an Advanced SIMD implementation
15827 of the loop could be completely unrolled and become straight-line code.
15828 If so, it is generally better to use the Advanced SIMD version rather
15829 than length-agnostic SVE, since the SVE loop would execute an unknown
15830 number of times and so could not be completely unrolled in the same way.
15832 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15833 number of Advanced SIMD loop iterations that would be unrolled and
15834 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15835 in the unrolled loop. Both values are zero if we're not applying
15836 the heuristic. */
15837 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15838 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15840 /* If we're vectorizing a loop that executes a constant number of times,
15841 this variable gives the number of times that the vector loop would
15842 iterate, otherwise it is zero. */
15843 uint64_t m_num_vector_iterations = 0;
15845 /* Used only when vectorizing loops. Estimates the number and kind of
15846 operations that would be needed by one iteration of the scalar
15847 or vector loop. There is one entry for each tuning option of
15848 interest. */
15849 auto_vec<aarch64_vec_op_count, 2> m_ops;
15852 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15853 bool costing_for_scalar)
15854 : vector_costs (vinfo, costing_for_scalar),
15855 m_vec_flags (costing_for_scalar ? 0
15856 : aarch64_classify_vector_mode (vinfo->vector_mode))
15858 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15860 m_ops.quick_push ({ issue_info, m_vec_flags });
15861 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15863 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15864 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15865 vf_factor });
15870 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15871 vector_costs *
15872 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15874 return new aarch64_vector_costs (vinfo, costing_for_scalar);
15877 /* Return true if the current CPU should use the new costs defined
15878 in GCC 11. This should be removed for GCC 12 and above, with the
15879 costs applying to all CPUs instead. */
15880 static bool
15881 aarch64_use_new_vector_costs_p ()
15883 return (aarch64_tune_params.extra_tuning_flags
15884 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
15887 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
15888 static const simd_vec_cost *
15889 aarch64_simd_vec_costs (tree vectype)
15891 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15892 if (vectype != NULL
15893 && aarch64_sve_mode_p (TYPE_MODE (vectype))
15894 && costs->sve != NULL)
15895 return costs->sve;
15896 return costs->advsimd;
15899 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
15900 static const simd_vec_cost *
15901 aarch64_simd_vec_costs_for_flags (unsigned int flags)
15903 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15904 if ((flags & VEC_ANY_SVE) && costs->sve)
15905 return costs->sve;
15906 return costs->advsimd;
15909 /* If STMT_INFO is a memory reference, return the scalar memory type,
15910 otherwise return null. */
15911 static tree
15912 aarch64_dr_type (stmt_vec_info stmt_info)
15914 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
15915 return TREE_TYPE (DR_REF (dr));
15916 return NULL_TREE;
15919 /* Decide whether to use the unrolling heuristic described above
15920 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
15921 describes the loop that we're vectorizing. */
15922 void
15923 aarch64_vector_costs::
15924 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
15926 /* The heuristic only makes sense on targets that have the same
15927 vector throughput for SVE and Advanced SIMD. */
15928 if (!(aarch64_tune_params.extra_tuning_flags
15929 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
15930 return;
15932 /* We only want to apply the heuristic if LOOP_VINFO is being
15933 vectorized for SVE. */
15934 if (!(m_vec_flags & VEC_ANY_SVE))
15935 return;
15937 /* Check whether it is possible in principle to use Advanced SIMD
15938 instead. */
15939 if (aarch64_autovec_preference == 2)
15940 return;
15942 /* We don't want to apply the heuristic to outer loops, since it's
15943 harder to track two levels of unrolling. */
15944 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
15945 return;
15947 /* Only handle cases in which the number of Advanced SIMD iterations
15948 would be known at compile time but the number of SVE iterations
15949 would not. */
15950 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
15951 || aarch64_sve_vg.is_constant ())
15952 return;
15954 /* Guess how many times the Advanced SIMD loop would iterate and make
15955 sure that it is within the complete unrolling limit. Even if the
15956 number of iterations is small enough, the number of statements might
15957 not be, which is why we need to estimate the number of statements too. */
15958 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
15959 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
15960 unsigned HOST_WIDE_INT unrolled_advsimd_niters
15961 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
15962 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
15963 return;
15965 /* Record that we're applying the heuristic and should try to estimate
15966 the number of statements in the Advanced SIMD loop. */
15967 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
15970 /* Do one-time initialization of the aarch64_vector_costs given that we're
15971 costing the loop vectorization described by LOOP_VINFO. */
15972 void
15973 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
15975 /* Record the number of times that the vector loop would execute,
15976 if known. */
15977 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
15978 auto scalar_niters = max_stmt_executions_int (loop);
15979 if (scalar_niters >= 0)
15981 unsigned int vf = vect_vf_for_cost (loop_vinfo);
15982 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
15983 m_num_vector_iterations = scalar_niters / vf;
15984 else
15985 m_num_vector_iterations = CEIL (scalar_niters, vf);
15988 /* Detect whether we're vectorizing for SVE and should apply the unrolling
15989 heuristic described above m_unrolled_advsimd_niters. */
15990 record_potential_advsimd_unrolling (loop_vinfo);
15992 /* Record the issue information for any SVE WHILE instructions that the
15993 loop needs. */
15994 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
15996 unsigned int num_masks = 0;
15997 rgroup_controls *rgm;
15998 unsigned int num_vectors_m1;
15999 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
16000 if (rgm->type)
16001 num_masks += num_vectors_m1 + 1;
16002 for (auto &ops : m_ops)
16003 if (auto *issue = ops.sve_issue_info ())
16004 ops.pred_ops += num_masks * issue->while_pred_ops;
16008 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16009 static int
16010 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16011 tree vectype,
16012 int misalign ATTRIBUTE_UNUSED)
16014 unsigned elements;
16015 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16016 bool fp = false;
16018 if (vectype != NULL)
16019 fp = FLOAT_TYPE_P (vectype);
16021 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16023 switch (type_of_cost)
16025 case scalar_stmt:
16026 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16028 case scalar_load:
16029 return costs->scalar_load_cost;
16031 case scalar_store:
16032 return costs->scalar_store_cost;
16034 case vector_stmt:
16035 return fp ? simd_costs->fp_stmt_cost
16036 : simd_costs->int_stmt_cost;
16038 case vector_load:
16039 return simd_costs->align_load_cost;
16041 case vector_store:
16042 return simd_costs->store_cost;
16044 case vec_to_scalar:
16045 return simd_costs->vec_to_scalar_cost;
16047 case scalar_to_vec:
16048 return simd_costs->scalar_to_vec_cost;
16050 case unaligned_load:
16051 case vector_gather_load:
16052 return simd_costs->unalign_load_cost;
16054 case unaligned_store:
16055 case vector_scatter_store:
16056 return simd_costs->unalign_store_cost;
16058 case cond_branch_taken:
16059 return costs->cond_taken_branch_cost;
16061 case cond_branch_not_taken:
16062 return costs->cond_not_taken_branch_cost;
16064 case vec_perm:
16065 return simd_costs->permute_cost;
16067 case vec_promote_demote:
16068 return fp ? simd_costs->fp_stmt_cost
16069 : simd_costs->int_stmt_cost;
16071 case vec_construct:
16072 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16073 return elements / 2 + 1;
16075 default:
16076 gcc_unreachable ();
16080 /* If an access of kind KIND for STMT_INFO represents one vector of an
16081 LD[234] or ST[234] operation, return the total number of vectors
16082 (2, 3 or 4), otherwise return a value outside that range. */
16083 static int
16084 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16086 if ((kind == vector_load
16087 || kind == unaligned_load
16088 || kind == vector_store
16089 || kind == unaligned_store)
16090 && STMT_VINFO_DATA_REF (stmt_info))
16092 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16093 if (stmt_info
16094 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16095 return DR_GROUP_SIZE (stmt_info);
16097 return 0;
16100 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16101 vectors would produce a series of LDP or STP operations. KIND is the
16102 kind of statement that STMT_INFO represents. */
16103 static bool
16104 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16105 stmt_vec_info stmt_info)
16107 switch (kind)
16109 case vector_load:
16110 case vector_store:
16111 case unaligned_load:
16112 case unaligned_store:
16113 break;
16115 default:
16116 return false;
16119 if (aarch64_tune_params.extra_tuning_flags
16120 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16121 return false;
16123 return is_gimple_assign (stmt_info->stmt);
16126 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16127 or multiply-subtract sequence that might be suitable for fusing into a
16128 single instruction. If VEC_FLAGS is zero, analyze the operation as
16129 a scalar one, otherwise analyze it as an operation on vectors with those
16130 VEC_* flags. */
16131 static bool
16132 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16133 unsigned int vec_flags)
16135 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16136 if (!assign)
16137 return false;
16138 tree_code code = gimple_assign_rhs_code (assign);
16139 if (code != PLUS_EXPR && code != MINUS_EXPR)
16140 return false;
16142 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
16143 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
16144 return false;
16146 for (int i = 1; i < 3; ++i)
16148 tree rhs = gimple_op (assign, i);
16149 /* ??? Should we try to check for a single use as well? */
16150 if (TREE_CODE (rhs) != SSA_NAME)
16151 continue;
16153 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16154 if (!def_stmt_info
16155 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16156 continue;
16157 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16158 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16159 continue;
16161 if (vec_flags & VEC_ADVSIMD)
16163 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16164 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16165 only supports MLA forms, so will require a move if the result
16166 cannot be tied to the accumulator. The most important case in
16167 which this is true is when the accumulator input is invariant. */
16168 rhs = gimple_op (assign, 3 - i);
16169 if (TREE_CODE (rhs) != SSA_NAME)
16170 return false;
16171 def_stmt_info = vinfo->lookup_def (rhs);
16172 if (!def_stmt_info
16173 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16174 return false;
16177 return true;
16179 return false;
16182 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16183 in-loop reduction that SVE supports directly, return its latency in cycles,
16184 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16185 instructions. */
16186 static unsigned int
16187 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16188 stmt_vec_info stmt_info,
16189 const sve_vec_cost *sve_costs)
16191 switch (vect_reduc_type (vinfo, stmt_info))
16193 case EXTRACT_LAST_REDUCTION:
16194 return sve_costs->clast_cost;
16196 case FOLD_LEFT_REDUCTION:
16197 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16199 case E_HFmode:
16200 case E_BFmode:
16201 return sve_costs->fadda_f16_cost;
16203 case E_SFmode:
16204 return sve_costs->fadda_f32_cost;
16206 case E_DFmode:
16207 return sve_costs->fadda_f64_cost;
16209 default:
16210 break;
16212 break;
16215 return 0;
16218 /* STMT_INFO describes a loop-carried operation in the original scalar code
16219 that we are considering implementing as a reduction. Return one of the
16220 following values, depending on VEC_FLAGS:
16222 - If VEC_FLAGS is zero, return the loop carry latency of the original
16223 scalar operation.
16225 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16226 Advanced SIMD implementation.
16228 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16229 SVE implementation. */
16230 static unsigned int
16231 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16232 unsigned int vec_flags)
16234 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16235 const sve_vec_cost *sve_costs = nullptr;
16236 if (vec_flags & VEC_ANY_SVE)
16237 sve_costs = aarch64_tune_params.vec_costs->sve;
16239 /* If the caller is asking for the SVE latency, check for forms of reduction
16240 that only SVE can handle directly. */
16241 if (sve_costs)
16243 unsigned int latency
16244 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16245 if (latency)
16246 return latency;
16249 /* Handle scalar costs. */
16250 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16251 if (vec_flags == 0)
16253 if (is_float)
16254 return vec_costs->scalar_fp_stmt_cost;
16255 return vec_costs->scalar_int_stmt_cost;
16258 /* Otherwise, the loop body just contains normal integer or FP operations,
16259 with a vector reduction outside the loop. */
16260 const simd_vec_cost *simd_costs
16261 = aarch64_simd_vec_costs_for_flags (vec_flags);
16262 if (is_float)
16263 return simd_costs->fp_stmt_cost;
16264 return simd_costs->int_stmt_cost;
16267 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16268 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16269 try to subdivide the target-independent categorization provided by KIND
16270 to get a more accurate cost. */
16271 static fractional_cost
16272 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16273 stmt_vec_info stmt_info,
16274 fractional_cost stmt_cost)
16276 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16277 the extension with the load. */
16278 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16279 return 0;
16281 return stmt_cost;
16284 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16285 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16286 when vectorized would operate on vector type VECTYPE. Try to subdivide
16287 the target-independent categorization provided by KIND to get a more
16288 accurate cost. WHERE specifies where the cost associated with KIND
16289 occurs. */
16290 static fractional_cost
16291 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16292 stmt_vec_info stmt_info, tree vectype,
16293 enum vect_cost_model_location where,
16294 fractional_cost stmt_cost)
16296 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16297 const sve_vec_cost *sve_costs = nullptr;
16298 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16299 sve_costs = aarch64_tune_params.vec_costs->sve;
16301 /* It's generally better to avoid costing inductions, since the induction
16302 will usually be hidden by other operations. This is particularly true
16303 for things like COND_REDUCTIONS. */
16304 if (is_a<gphi *> (stmt_info->stmt))
16305 return 0;
16307 /* Detect cases in which vec_to_scalar is describing the extraction of a
16308 vector element in preparation for a scalar store. The store itself is
16309 costed separately. */
16310 if (vect_is_store_elt_extraction (kind, stmt_info))
16311 return simd_costs->store_elt_extra_cost;
16313 /* Detect SVE gather loads, which are costed as a single scalar_load
16314 for each element. We therefore need to divide the full-instruction
16315 cost by the number of elements in the vector. */
16316 if (kind == scalar_load
16317 && sve_costs
16318 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16320 unsigned int nunits = vect_nunits_for_cost (vectype);
16321 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16322 return { sve_costs->gather_load_x64_cost, nunits };
16323 return { sve_costs->gather_load_x32_cost, nunits };
16326 /* Detect cases in which a scalar_store is really storing one element
16327 in a scatter operation. */
16328 if (kind == scalar_store
16329 && sve_costs
16330 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16331 return sve_costs->scatter_store_elt_cost;
16333 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16334 if (kind == vec_to_scalar
16335 && where == vect_body
16336 && sve_costs)
16338 unsigned int latency
16339 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16340 if (latency)
16341 return latency;
16344 /* Detect cases in which vec_to_scalar represents a single reduction
16345 instruction like FADDP or MAXV. */
16346 if (kind == vec_to_scalar
16347 && where == vect_epilogue
16348 && vect_is_reduction (stmt_info))
16349 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16351 case E_QImode:
16352 return simd_costs->reduc_i8_cost;
16354 case E_HImode:
16355 return simd_costs->reduc_i16_cost;
16357 case E_SImode:
16358 return simd_costs->reduc_i32_cost;
16360 case E_DImode:
16361 return simd_costs->reduc_i64_cost;
16363 case E_HFmode:
16364 case E_BFmode:
16365 return simd_costs->reduc_f16_cost;
16367 case E_SFmode:
16368 return simd_costs->reduc_f32_cost;
16370 case E_DFmode:
16371 return simd_costs->reduc_f64_cost;
16373 default:
16374 break;
16377 /* Otherwise stick with the original categorization. */
16378 return stmt_cost;
16381 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16382 for STMT_INFO, which has cost kind KIND and which when vectorized would
16383 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16384 targets. */
16385 static fractional_cost
16386 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16387 stmt_vec_info stmt_info, tree vectype,
16388 fractional_cost stmt_cost)
16390 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16391 vector register size or number of units. Integer promotions of this
16392 type therefore map to SXT[BHW] or UXT[BHW].
16394 Most loads have extending forms that can do the sign or zero extension
16395 on the fly. Optimistically assume that a load followed by an extension
16396 will fold to this form during combine, and that the extension therefore
16397 comes for free. */
16398 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16399 stmt_cost = 0;
16401 /* For similar reasons, vector_stmt integer truncations are a no-op,
16402 because we can just ignore the unused upper bits of the source. */
16403 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16404 stmt_cost = 0;
16406 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16407 but there are no equivalent instructions for SVE. This means that
16408 (all other things being equal) 128-bit SVE needs twice as many load
16409 and store instructions as Advanced SIMD in order to process vector pairs.
16411 Also, scalar code can often use LDP and STP to access pairs of values,
16412 so it is too simplistic to say that one SVE load or store replaces
16413 VF scalar loads and stores.
16415 Ideally we would account for this in the scalar and Advanced SIMD
16416 costs by making suitable load/store pairs as cheap as a single
16417 load/store. However, that would be a very invasive change and in
16418 practice it tends to stress other parts of the cost model too much.
16419 E.g. stores of scalar constants currently count just a store,
16420 whereas stores of vector constants count a store and a vec_init.
16421 This is an artificial distinction for AArch64, where stores of
16422 nonzero scalar constants need the same kind of register invariant
16423 as vector stores.
16425 An alternative would be to double the cost of any SVE loads and stores
16426 that could be paired in Advanced SIMD (and possibly also paired in
16427 scalar code). But this tends to stress other parts of the cost model
16428 in the same way. It also means that we can fall back to Advanced SIMD
16429 even if full-loop predication would have been useful.
16431 Here we go for a more conservative version: double the costs of SVE
16432 loads and stores if one iteration of the scalar loop processes enough
16433 elements for it to use a whole number of Advanced SIMD LDP or STP
16434 instructions. This makes it very likely that the VF would be 1 for
16435 Advanced SIMD, and so no epilogue should be needed. */
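/* Worked example (hypothetical layout): a grouped access of four DImode
   elements per scalar iteration gives count * elt_bits == 4 * 64 == 256
   bits, a whole number of Advanced SIMD LDP/STP pairs, so the SVE
   load/store cost is doubled below (assuming aarch64_advsimd_ldp_stp_p
   also holds).  A group of three SImode elements (96 bits) is not a
   multiple of 256 bits and is left unchanged.  */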
16436 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16438 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16439 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16440 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16441 if (multiple_p (count * elt_bits, 256)
16442 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16443 stmt_cost *= 2;
16446 return stmt_cost;
16449 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16450 and which when vectorized would operate on vector type VECTYPE. Add the
16451 cost of any embedded operations. */
16452 static fractional_cost
16453 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16454 tree vectype, fractional_cost stmt_cost)
16456 if (vectype)
16458 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16460 /* Detect cases in which a vector load or store represents an
16461 LD[234] or ST[234] instruction. */
16462 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16464 case 2:
16465 stmt_cost += simd_costs->ld2_st2_permute_cost;
16466 break;
16468 case 3:
16469 stmt_cost += simd_costs->ld3_st3_permute_cost;
16470 break;
16472 case 4:
16473 stmt_cost += simd_costs->ld4_st4_permute_cost;
16474 break;
16477 if (kind == vector_stmt || kind == vec_to_scalar)
16478 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16480 if (FLOAT_TYPE_P (cmp_type))
16481 stmt_cost += simd_costs->fp_stmt_cost;
16482 else
16483 stmt_cost += simd_costs->int_stmt_cost;
16487 if (kind == scalar_stmt)
16488 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16490 if (FLOAT_TYPE_P (cmp_type))
16491 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16492 else
16493 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16496 return stmt_cost;
16499 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16500 and they describe an operation in the body of a vector loop. Record issue
16501 information relating to the vector operation in OPS. */
16502 void
16503 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16504 stmt_vec_info stmt_info,
16505 aarch64_vec_op_count *ops)
16507 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16508 if (!base_issue)
16509 return;
16510 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16511 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16513 /* Calculate the minimum cycles per iteration imposed by a reduction
16514 operation. */
16515 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16516 && vect_is_reduction (stmt_info))
16518 unsigned int base
16519 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16521 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16522 that's not yet the case. */
16523 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
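/* Illustrative example (hypothetical latencies): if the in-loop reduction
   has a latency of 4 cycles and COUNT is 2, the loop cannot complete an
   iteration in fewer than 8 cycles, so the MAX above raises
   reduction_latency to at least 8.  */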
16526 /* Assume that multiply-adds will become a single operation. */
16527 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16528 return;
16530 /* Count the basic operation cost associated with KIND. */
16531 switch (kind)
16533 case cond_branch_taken:
16534 case cond_branch_not_taken:
16535 case vector_gather_load:
16536 case vector_scatter_store:
16537 /* We currently don't expect these to be used in a loop body. */
16538 break;
16540 case vec_perm:
16541 case vec_promote_demote:
16542 case vec_construct:
16543 case vec_to_scalar:
16544 case scalar_to_vec:
16545 case vector_stmt:
16546 case scalar_stmt:
16547 ops->general_ops += count;
16548 break;
16550 case scalar_load:
16551 case vector_load:
16552 case unaligned_load:
16553 ops->loads += count;
16554 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16555 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16556 break;
16558 case vector_store:
16559 case unaligned_store:
16560 case scalar_store:
16561 ops->stores += count;
16562 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16563 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16564 break;
16567 /* Add any embedded comparison operations. */
16568 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16569 && vect_embedded_comparison_type (stmt_info))
16570 ops->general_ops += count;
16572 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16573 have only accounted for one. */
16574 if ((kind == vector_stmt || kind == vec_to_scalar)
16575 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16576 ops->general_ops += count;
16578 /* Count the predicate operations needed by an SVE comparison. */
16579 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16580 if (tree type = vect_comparison_type (stmt_info))
16582 unsigned int base = (FLOAT_TYPE_P (type)
16583 ? sve_issue->fp_cmp_pred_ops
16584 : sve_issue->int_cmp_pred_ops);
16585 ops->pred_ops += base * count;
16588 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16589 if (simd_issue)
16590 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16592 case 2:
16593 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16594 break;
16596 case 3:
16597 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16598 break;
16600 case 4:
16601 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16602 break;
16605 /* Add any overhead associated with gather loads and scatter stores. */
16606 if (sve_issue
16607 && (kind == scalar_load || kind == scalar_store)
16608 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16610 unsigned int pairs = CEIL (count, 2);
16611 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16612 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
16616 /* Return true if STMT_INFO contains a memory access and if the constant
16617 component of the memory address is aligned to SIZE bytes. */
16618 static bool
16619 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16620 poly_uint64 size)
16622 if (!STMT_VINFO_DATA_REF (stmt_info))
16623 return false;
16625 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16626 stmt_info = first_stmt;
16627 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16628 /* Needed for gathers & scatters, for example. */
16629 if (!constant_offset)
16630 return false;
16632 return multiple_p (wi::to_poly_offset (constant_offset), size);
16635 /* Check if a scalar or vector stmt could be part of a region of code
16636 that does nothing more than store values to memory, in the scalar
16637 case using STP. Return the cost of the stmt if so, counting 2 for
16638 one instruction. Return ~0U otherwise.
16640 The arguments are a subset of those passed to add_stmt_cost. */
16641 unsigned int
16642 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16643 stmt_vec_info stmt_info, tree vectype)
16645 /* Code that stores vector constants uses a vector_load to create
16646 the constant. We don't apply the heuristic to that case for two
16647 main reasons:
16649 - At the moment, STPs are only formed via peephole2, and the
16650 constant scalar moves would often come between STRs and so
16651 prevent STP formation.
16653 - The scalar code also has to load the constant somehow, and that
16654 isn't costed. */
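/* Illustrative examples of the "2 units per instruction" convention used
   below (hypothetical element counts): a scalar_to_vec from a GPR counts
   4 units (2 instructions) and one from an FPR counts 2 units (1
   instruction), while a vec_construct of a 4-element FP vector counts
   (4 - 1) * 2 == 6 units, i.e. at most 3 INS instructions.  */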
16655 switch (kind)
16657 case scalar_to_vec:
16658 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16659 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16661 case vec_construct:
16662 if (FLOAT_TYPE_P (vectype))
16663 /* Count 1 insn for the maximum number of FP->SIMD INS
16664 instructions. */
16665 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16667 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16668 maximum number of GPR->SIMD INS instructions. */
16669 return vect_nunits_for_cost (vectype) * 4 * count;
16671 case vector_store:
16672 case unaligned_store:
16673 /* Count 1 insn per vector if we can't form STP Q pairs. */
16674 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16675 return count * 2;
16676 if (aarch64_tune_params.extra_tuning_flags
16677 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16678 return count * 2;
16680 if (stmt_info)
16682 /* Assume we won't be able to use STP if the constant offset
16683 component of the address is misaligned. ??? This could be
16684 removed if we formed STP pairs earlier, rather than relying
16685 on peephole2. */
16686 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16687 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16688 return count * 2;
16690 return CEIL (count, 2) * 2;
16692 case scalar_store:
16693 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16695 /* Check for a mode in which STP pairs can be formed. */
16696 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16697 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16698 return ~0U;
16700 /* Assume we won't be able to use STP if the constant offset
16701 component of the address is misaligned. ??? This could be
16702 removed if we formed STP pairs earlier, rather than relying
16703 on peephole2. */
16704 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16705 return ~0U;
16707 return count;
16709 default:
16710 return ~0U;
16714 unsigned
16715 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16716 stmt_vec_info stmt_info, slp_tree,
16717 tree vectype, int misalign,
16718 vect_cost_model_location where)
16720 fractional_cost stmt_cost
16721 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16723 bool in_inner_loop_p = (where == vect_body
16724 && stmt_info
16725 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16727 /* Do one-time initialization based on the vinfo. */
16728 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16729 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16731 if (loop_vinfo)
16732 analyze_loop_vinfo (loop_vinfo);
16734 m_analyzed_vinfo = true;
16737 /* Apply the heuristic described above m_stp_sequence_cost. */
16738 if (m_stp_sequence_cost != ~0U)
16740 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16741 stmt_info, vectype);
16742 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16745 /* Try to get a more accurate cost by looking at STMT_INFO instead
16746 of just looking at KIND. */
16747 if (stmt_info && aarch64_use_new_vector_costs_p ())
16749 /* If we scalarize a strided store, the vectorizer costs one
16750 vec_to_scalar for each element. However, we can store the first
16751 element using an FP store without a separate extract step. */
16752 if (vect_is_store_elt_extraction (kind, stmt_info))
16753 count -= 1;
16755 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16756 stmt_info, stmt_cost);
16758 if (vectype && m_vec_flags)
16759 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16760 stmt_info, vectype,
16761 where, stmt_cost);
16764 /* Do any SVE-specific adjustments to the cost. */
16765 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16766 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16767 vectype, stmt_cost);
16769 if (stmt_info && aarch64_use_new_vector_costs_p ())
16771 /* Account for any extra "embedded" costs that apply additively
16772 to the base cost calculated above. */
16773 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16774 stmt_cost);
16776 /* If we're recording a nonzero vector loop body cost for the
16777 innermost loop, also estimate the operations that would need
16778 to be issued by all relevant implementations of the loop. */
16779 if (loop_vinfo
16780 && (m_costing_for_scalar || where == vect_body)
16781 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16782 && stmt_cost != 0)
16783 for (auto &ops : m_ops)
16784 count_ops (count, kind, stmt_info, &ops);
16786 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16787 estimate the number of statements in the unrolled Advanced SIMD
16788 loop. For simplicity, we assume that one iteration of the
16789 Advanced SIMD loop would need the same number of statements
16790 as one iteration of the SVE loop. */
16791 if (where == vect_body && m_unrolled_advsimd_niters)
16792 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16794 /* Detect the use of an averaging operation. */
16795 gimple *stmt = stmt_info->stmt;
16796 if (is_gimple_call (stmt)
16797 && gimple_call_internal_p (stmt))
16799 switch (gimple_call_internal_fn (stmt))
16801 case IFN_AVG_FLOOR:
16802 case IFN_AVG_CEIL:
16803 m_has_avg = true;
16804 default:
16805 break;
16809 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16812 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16813 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16814 says that we should prefer the Advanced SIMD loop. */
16815 bool
16816 aarch64_vector_costs::prefer_unrolled_loop () const
16818 if (!m_unrolled_advsimd_stmts)
16819 return false;
16821 if (dump_enabled_p ())
16822 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16823 " unrolled Advanced SIMD loop = "
16824 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
16825 m_unrolled_advsimd_stmts);
16827 /* The balance here is tricky. On the one hand, we can't be sure whether
16828 the code is vectorizable with Advanced SIMD or not. However, even if
16829 it isn't vectorizable with Advanced SIMD, there's a possibility that
16830 the scalar code could also be unrolled. Some of the code might then
16831 benefit from SLP, or from using LDP and STP. We therefore apply
16832 the heuristic regardless of can_use_advsimd_p. */
16833 return (m_unrolled_advsimd_stmts
16834 && (m_unrolled_advsimd_stmts
16835 <= (unsigned int) param_max_completely_peeled_insns));
16838 /* Subroutine of adjust_body_cost for handling SVE. Use OPS to work out
16839 how fast the SVE code can be issued and compare it to the equivalent
16840 value for scalar code (SCALAR_CYCLES_PER_ITER).
16844 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16845 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16846 is true if we think the loop body is too expensive. */
16848 fractional_cost
16849 aarch64_vector_costs::
16850 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16851 fractional_cost scalar_cycles_per_iter,
16852 unsigned int orig_body_cost, unsigned int *body_cost,
16853 bool *should_disparage)
16855 if (dump_enabled_p ())
16856 ops->dump ();
16858 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16859 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16861 /* If the scalar version of the loop could issue at least as
16862 quickly as the predicate parts of the SVE loop, make the SVE loop
16863 prohibitively expensive. In this case vectorization is adding an
16864 overhead that the original scalar code didn't have.
16866 This is mostly intended to detect cases in which WHILELOs dominate
16867 for very tight loops, which is something that normal latency-based
16868 costs would not model. Adding this kind of cliffedge would be
16869 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16870 code in the caller handles that case in a more conservative way. */
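/* Example with made-up numbers: if the scalar code could issue an
   iteration in 3 cycles (after scaling by the VF) while the SVE predicate
   operations alone need 3 cycles per iteration, sve_estimate below is 4,
   3 < 4 holds, and the body cost is raised to at least
   orig_body_cost * the estimated number of bytes per SVE vector
   (16 for a 128-bit vector length).  */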
16871 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
16872 if (scalar_cycles_per_iter < sve_estimate)
16874 unsigned int min_cost
16875 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
16876 if (*body_cost < min_cost)
16878 if (dump_enabled_p ())
16879 dump_printf_loc (MSG_NOTE, vect_location,
16880 "Increasing body cost to %d because the"
16881 " scalar code could issue within the limit"
16882 " imposed by predicate operations\n",
16883 min_cost);
16884 *body_cost = min_cost;
16885 *should_disparage = true;
16889 return sve_cycles_per_iter;
16892 unsigned int
16893 aarch64_vector_costs::determine_suggested_unroll_factor ()
16895 bool sve = m_vec_flags & VEC_ANY_SVE;
16896 /* If we are trying to unroll an Advanced SIMD main loop that contains
16897 an averaging operation that we do not support with SVE, and we might use
16898 a predicated epilogue, we need to be conservative and block unrolling,
16899 as this might lead to a less optimal loop for the first and only
16900 epilogue, which uses the original loop's vectorization factor.
16901 TODO: Remove this constraint when we add support for multiple epilogue
16902 vectorization. */
16903 if (!sve && !TARGET_SVE2 && m_has_avg)
16904 return 1;
16906 unsigned int max_unroll_factor = 1;
16907 for (auto vec_ops : m_ops)
16909 aarch64_simd_vec_issue_info const *vec_issue
16910 = vec_ops.simd_issue_info ();
16911 if (!vec_issue)
16912 return 1;
16913 /* Limit the unroll factor to a value adjustable by the user; the default
16914 value is 4. */
16915 unsigned int unroll_factor = aarch64_vect_unroll_limit;
16916 unsigned int factor
16917 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
16918 unsigned int temp;
16920 /* Sanity check; this should never happen. */
16921 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
16922 return 1;
16924 /* Check stores. */
16925 if (vec_ops.stores > 0)
16927 temp = CEIL (factor * vec_issue->stores_per_cycle,
16928 vec_ops.stores);
16929 unroll_factor = MIN (unroll_factor, temp);
16932 /* Check loads + stores. */
16933 if (vec_ops.loads > 0)
16935 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
16936 vec_ops.loads + vec_ops.stores);
16937 unroll_factor = MIN (unroll_factor, temp);
16940 /* Check general ops. */
16941 if (vec_ops.general_ops > 0)
16943 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
16944 vec_ops.general_ops);
16945 unroll_factor = MIN (unroll_factor, temp);
16947 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
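/* Worked example (hypothetical issue rates): with reduction_latency <= 1
   (factor == 1), 1 store, 2 loads, 4 general ops, and an issue rate of
   2 stores, 3 loads+stores and 4 general ops per cycle, the checks above
   give CEIL (2, 1) == 2, CEIL (3, 3) == 1 and CEIL (4, 4) == 1, so this
   (sub)tuning limits the unroll factor to 1.  */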
16950 /* Make sure unroll factor is power of 2. */
16951 return 1 << ceil_log2 (max_unroll_factor);
16954 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
16955 and return the new cost. */
16956 unsigned int
16957 aarch64_vector_costs::
16958 adjust_body_cost (loop_vec_info loop_vinfo,
16959 const aarch64_vector_costs *scalar_costs,
16960 unsigned int body_cost)
16962 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
16963 return body_cost;
16965 const auto &scalar_ops = scalar_costs->m_ops[0];
16966 const auto &vector_ops = m_ops[0];
16967 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
16968 unsigned int orig_body_cost = body_cost;
16969 bool should_disparage = false;
16971 if (dump_enabled_p ())
16972 dump_printf_loc (MSG_NOTE, vect_location,
16973 "Original vector body cost = %d\n", body_cost);
16975 fractional_cost scalar_cycles_per_iter
16976 = scalar_ops.min_cycles_per_iter () * estimated_vf;
16978 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
16980 if (dump_enabled_p ())
16982 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
16983 dump_printf_loc (MSG_NOTE, vect_location,
16984 "Vector loop iterates at most %wd times\n",
16985 m_num_vector_iterations);
16986 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
16987 scalar_ops.dump ();
16988 dump_printf_loc (MSG_NOTE, vect_location,
16989 " estimated cycles per vector iteration"
16990 " (for VF %d) = %f\n",
16991 estimated_vf, scalar_cycles_per_iter.as_double ());
16994 if (vector_ops.sve_issue_info ())
16996 if (dump_enabled_p ())
16997 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
16998 vector_cycles_per_iter
16999 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17000 orig_body_cost, &body_cost, &should_disparage);
17002 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17004 /* Also take Neoverse V1 tuning into account, doubling the
17005 scalar and Advanced SIMD estimates to account for the
17006 doubling in SVE vector length. */
17007 if (dump_enabled_p ())
17008 dump_printf_loc (MSG_NOTE, vect_location,
17009 "Neoverse V1 estimate:\n");
17010 auto vf_factor = m_ops[1].vf_factor ();
17011 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17012 orig_body_cost, &body_cost, &should_disparage);
17015 else
17017 if (dump_enabled_p ())
17019 dump_printf_loc (MSG_NOTE, vect_location,
17020 "Vector issue estimate:\n");
17021 vector_ops.dump ();
17025 /* Decide whether to stick to latency-based costs or whether to try to
17026 take issue rates into account. */
17027 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17028 if (m_vec_flags & VEC_ANY_SVE)
17029 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17031 if (m_num_vector_iterations >= 1
17032 && m_num_vector_iterations < threshold)
17034 if (dump_enabled_p ())
17035 dump_printf_loc (MSG_NOTE, vect_location,
17036 "Low iteration count, so using pure latency"
17037 " costs\n");
17039 /* Increase the cost of the vector code if it looks like the scalar code
17040 could issue more quickly. These values are only rough estimates,
17041 so minor differences should only result in minor changes. */
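/* For instance (illustrative numbers): a body cost of 20 with an estimated
   4 vector cycles per iteration against 3 scalar cycles per iteration is
   scaled below to roughly 20 * 4 / 3 ~= 27, subject to rounding.  */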
17042 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17044 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17045 scalar_cycles_per_iter);
17046 if (dump_enabled_p ())
17047 dump_printf_loc (MSG_NOTE, vect_location,
17048 "Increasing body cost to %d because scalar code"
17049 " would issue more quickly\n", body_cost);
17051 /* In general, it's expected that the proposed vector code would be able
17052 to issue more quickly than the original scalar code. This should
17053 already be reflected to some extent in the latency-based costs.
17055 However, the latency-based costs effectively assume that the scalar
17056 code and the vector code execute serially, which tends to underplay
17057 one important case: if the real (non-serialized) execution time of
17058 a scalar iteration is dominated by loop-carried dependencies,
17059 and if the vector code is able to reduce both the length of
17060 the loop-carried dependencies *and* the number of cycles needed
17061 to issue the code in general, we can be more confident that the
17062 vector code is an improvement, even if adding the other (non-loop-carried)
17063 latencies tends to hide this saving. We therefore reduce the cost of the
17064 vector loop body in proportion to the saving. */
17065 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17066 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17067 && scalar_cycles_per_iter > vector_cycles_per_iter
17068 && !should_disparage)
17070 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17071 scalar_cycles_per_iter);
17072 if (dump_enabled_p ())
17073 dump_printf_loc (MSG_NOTE, vect_location,
17074 "Decreasing body cost to %d account for smaller"
17075 " reduction latency\n", body_cost);
17078 return body_cost;
17081 void
17082 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17084 auto *scalar_costs
17085 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17086 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17087 if (loop_vinfo
17088 && m_vec_flags
17089 && aarch64_use_new_vector_costs_p ())
17091 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17092 m_costs[vect_body]);
17093 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17096 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17097 the scalar code in the event of a tie, since there is more chance
17098 of scalar code being optimized with surrounding operations. */
17099 if (!loop_vinfo
17100 && scalar_costs
17101 && m_stp_sequence_cost != ~0U
17102 && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17103 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17105 vector_costs::finish_cost (scalar_costs);
17108 bool
17109 aarch64_vector_costs::
17110 better_main_loop_than_p (const vector_costs *uncast_other) const
17112 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17114 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17115 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17117 if (dump_enabled_p ())
17118 dump_printf_loc (MSG_NOTE, vect_location,
17119 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17120 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17121 vect_vf_for_cost (this_loop_vinfo),
17122 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17123 vect_vf_for_cost (other_loop_vinfo));
17125 /* Apply the unrolling heuristic described above
17126 m_unrolled_advsimd_niters. */
17127 if (bool (m_unrolled_advsimd_stmts)
17128 != bool (other->m_unrolled_advsimd_stmts))
17130 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17131 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17132 if (this_prefer_unrolled != other_prefer_unrolled)
17134 if (dump_enabled_p ())
17135 dump_printf_loc (MSG_NOTE, vect_location,
17136 "Preferring Advanced SIMD loop because"
17137 " it can be unrolled\n");
17138 return other_prefer_unrolled;
17142 for (unsigned int i = 0; i < m_ops.length (); ++i)
17144 if (dump_enabled_p ())
17146 if (i)
17147 dump_printf_loc (MSG_NOTE, vect_location,
17148 "Reconsidering with subtuning %d\n", i);
17149 dump_printf_loc (MSG_NOTE, vect_location,
17150 "Issue info for %s loop:\n",
17151 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17152 this->m_ops[i].dump ();
17153 dump_printf_loc (MSG_NOTE, vect_location,
17154 "Issue info for %s loop:\n",
17155 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17156 other->m_ops[i].dump ();
17159 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17160 * this->m_ops[i].vf_factor ());
17161 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17162 * other->m_ops[i].vf_factor ());
17164 /* If it appears that one loop could process the same amount of data
17165 in fewer cycles, prefer that loop over the other one. */
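/* The cross-multiplication below compares cycles per element without
   dividing: e.g. (hypothetical numbers) 4 cycles/iteration at VF 8 versus
   3 cycles/iteration at VF 4 gives 4 * 4 == 16 against 3 * 8 == 24, so
   the first loop is preferred.  */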
17166 fractional_cost this_cost
17167 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17168 fractional_cost other_cost
17169 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17170 if (dump_enabled_p ())
17172 dump_printf_loc (MSG_NOTE, vect_location,
17173 "Weighted cycles per iteration of %s loop ~= %f\n",
17174 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17175 this_cost.as_double ());
17176 dump_printf_loc (MSG_NOTE, vect_location,
17177 "Weighted cycles per iteration of %s loop ~= %f\n",
17178 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17179 other_cost.as_double ());
17181 if (this_cost != other_cost)
17183 if (dump_enabled_p ())
17184 dump_printf_loc (MSG_NOTE, vect_location,
17185 "Preferring loop with lower cycles"
17186 " per iteration\n");
17187 return this_cost < other_cost;
17190 /* If the issue rate of SVE code is limited by predicate operations
17191 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17192 and if Advanced SIMD code could issue within the limit imposed
17193 by the predicate operations, the predicate operations are adding an
17194 overhead that the original code didn't have and so we should prefer
17195 the Advanced SIMD version. */
17196 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17197 const aarch64_vec_op_count &b) -> bool
17199 if (a.pred_ops == 0
17200 && (b.min_pred_cycles_per_iter ()
17201 > b.min_nonpred_cycles_per_iter ()))
17203 if (dump_enabled_p ())
17204 dump_printf_loc (MSG_NOTE, vect_location,
17205 "Preferring Advanced SIMD loop since"
17206 " SVE loop is predicate-limited\n");
17207 return true;
17209 return false;
17211 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17212 return true;
17213 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17214 return false;
17217 return vector_costs::better_main_loop_than_p (other);
17220 static void initialize_aarch64_code_model (struct gcc_options *);
17222 /* Parse the TO_PARSE string and put the architecture struct that it
17223 selects into RES and the architectural features into ISA_FLAGS.
17224 Return an aarch64_parse_opt_result describing the parse result.
17225 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17226 When the TO_PARSE string contains an invalid extension,
17227 a copy of the string is created and stored to INVALID_EXTENSION. */
17229 static enum aarch64_parse_opt_result
17230 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17231 aarch64_feature_flags *isa_flags,
17232 std::string *invalid_extension)
17234 const char *ext;
17235 const struct processor *arch;
17236 size_t len;
17238 ext = strchr (to_parse, '+');
17240 if (ext != NULL)
17241 len = ext - to_parse;
17242 else
17243 len = strlen (to_parse);
17245 if (len == 0)
17246 return AARCH64_PARSE_MISSING_ARG;
17249 /* Loop through the list of supported ARCHes to find a match. */
17250 for (arch = all_architectures; arch->name != NULL; arch++)
17252 if (strlen (arch->name) == len
17253 && strncmp (arch->name, to_parse, len) == 0)
17255 auto isa_temp = arch->flags;
17257 if (ext != NULL)
17259 /* TO_PARSE string contains at least one extension. */
17260 enum aarch64_parse_opt_result ext_res
17261 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17263 if (ext_res != AARCH64_PARSE_OK)
17264 return ext_res;
17266 /* Extension parsing was successful. Confirm the result
17267 arch and ISA flags. */
17268 *res = arch;
17269 *isa_flags = isa_temp;
17270 return AARCH64_PARSE_OK;
17274 /* ARCH name not found in list. */
17275 return AARCH64_PARSE_INVALID_ARG;
17278 /* Parse the TO_PARSE string and put the result tuning in RES and the
17279 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
17280 describing the parse result. If there is an error parsing, RES and
17281 ISA_FLAGS are left unchanged.
17282 When the TO_PARSE string contains an invalid extension,
17283 a copy of the string is created and stored to INVALID_EXTENSION. */
17285 static enum aarch64_parse_opt_result
17286 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17287 aarch64_feature_flags *isa_flags,
17288 std::string *invalid_extension)
17290 const char *ext;
17291 const struct processor *cpu;
17292 size_t len;
17294 ext = strchr (to_parse, '+');
17296 if (ext != NULL)
17297 len = ext - to_parse;
17298 else
17299 len = strlen (to_parse);
17301 if (len == 0)
17302 return AARCH64_PARSE_MISSING_ARG;
17305 /* Loop through the list of supported CPUs to find a match. */
17306 for (cpu = all_cores; cpu->name != NULL; cpu++)
17308 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17310 auto isa_temp = cpu->flags;
17312 if (ext != NULL)
17314 /* TO_PARSE string contains at least one extension. */
17315 enum aarch64_parse_opt_result ext_res
17316 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17318 if (ext_res != AARCH64_PARSE_OK)
17319 return ext_res;
17321 /* Extension parsing was successful. Confirm the result
17322 cpu and ISA flags. */
17323 *res = cpu;
17324 *isa_flags = isa_temp;
17325 return AARCH64_PARSE_OK;
17329 /* CPU name not found in list. */
17330 return AARCH64_PARSE_INVALID_ARG;
17333 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17334 Return an aarch64_parse_opt_result describing the parse result.
17335 If parsing fails, RES does not change. */
17337 static enum aarch64_parse_opt_result
17338 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17340 const struct processor *cpu;
17342 /* Loop through the list of supported CPUs to find a match. */
17343 for (cpu = all_cores; cpu->name != NULL; cpu++)
17345 if (strcmp (cpu->name, to_parse) == 0)
17347 *res = cpu;
17348 return AARCH64_PARSE_OK;
17352 /* CPU name not found in list. */
17353 return AARCH64_PARSE_INVALID_ARG;
17356 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17357 described in FLAG. If it is, return the index bit for that fusion type.
17358 If not, report an error (printing OPTION_NAME) and return zero. */
17360 static unsigned int
17361 aarch64_parse_one_option_token (const char *token,
17362 size_t length,
17363 const struct aarch64_flag_desc *flag,
17364 const char *option_name)
17366 for (; flag->name != NULL; flag++)
17368 if (length == strlen (flag->name)
17369 && !strncmp (flag->name, token, length))
17370 return flag->flag;
17373 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17374 return 0;
17377 /* Parse OPTION which is a comma-separated list of flags to enable.
17378 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17379 default state we inherit from the CPU tuning structures. OPTION_NAME
17380 gives the top-level option we are parsing in the -moverride string,
17381 for use in error messages. */
17383 static unsigned int
17384 aarch64_parse_boolean_options (const char *option,
17385 const struct aarch64_flag_desc *flags,
17386 unsigned int initial_state,
17387 const char *option_name)
17389 const char separator = '.';
17390 const char* specs = option;
17391 const char* ntoken = option;
17392 unsigned int found_flags = initial_state;
17394 while ((ntoken = strchr (specs, separator)))
17396 size_t token_length = ntoken - specs;
17397 unsigned token_ops = aarch64_parse_one_option_token (specs,
17398 token_length,
17399 flags,
17400 option_name);
17401 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17402 in the token stream, reset the supported operations. So:
17404 adrp+add.cmp+branch.none.adrp+add
17406 would have the result of turning on only adrp+add fusion. */
17407 if (!token_ops)
17408 found_flags = 0;
17410 found_flags |= token_ops;
17411 specs = ++ntoken;
17414 /* We ended with a trailing separator; report an error. */
17415 if (!(*specs))
17417 error ("%qs string ill-formed", option_name);
17418 return 0;
17421 /* We still have one more token to parse. */
17422 size_t token_length = strlen (specs);
17423 unsigned token_ops = aarch64_parse_one_option_token (specs,
17424 token_length,
17425 flags,
17426 option_name);
17427 if (!token_ops)
17428 found_flags = 0;
17430 found_flags |= token_ops;
17431 return found_flags;
17434 /* Support for overriding instruction fusion. */
17436 static void
17437 aarch64_parse_fuse_string (const char *fuse_string,
17438 struct tune_params *tune)
17440 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17441 aarch64_fusible_pairs,
17442 tune->fusible_ops,
17443 "fuse=");
17446 /* Support for overriding other tuning flags. */
17448 static void
17449 aarch64_parse_tune_string (const char *tune_string,
17450 struct tune_params *tune)
17452 tune->extra_tuning_flags
17453 = aarch64_parse_boolean_options (tune_string,
17454 aarch64_tuning_flags,
17455 tune->extra_tuning_flags,
17456 "tune=");
17459 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17460 Accept the valid SVE vector widths allowed by
17461 aarch64_sve_vector_bits_enum and use it to override sve_width
17462 in TUNE. */
17464 static void
17465 aarch64_parse_sve_width_string (const char *tune_string,
17466 struct tune_params *tune)
17468 int width = -1;
17470 int n = sscanf (tune_string, "%d", &width);
17471 if (n == EOF)
17473 error ("invalid format for %<sve_width%>");
17474 return;
17476 switch (width)
17478 case SVE_128:
17479 case SVE_256:
17480 case SVE_512:
17481 case SVE_1024:
17482 case SVE_2048:
17483 break;
17484 default:
17485 error ("invalid %<sve_width%> value: %d", width);
17487 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17490 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
17491 we understand. If it is, extract the option string and hand it off to
17492 the appropriate function. */
17494 void
17495 aarch64_parse_one_override_token (const char* token,
17496 size_t length,
17497 struct tune_params *tune)
17499 const struct aarch64_tuning_override_function *fn
17500 = aarch64_tuning_override_functions;
17502 const char *option_part = strchr (token, '=');
17503 if (!option_part)
17505 error ("tuning string missing in option (%s)", token);
17506 return;
17509 /* Get the length of the option name. */
17510 length = option_part - token;
17511 /* Skip the '=' to get to the option string. */
17512 option_part++;
17514 for (; fn->name != NULL; fn++)
17516 if (!strncmp (fn->name, token, length))
17518 fn->parse_override (option_part, tune);
17519 return;
17523 error ("unknown tuning option (%s)",token);
17524 return;
17527 /* Set the default TLS size and clamp it according to the code model. */
17529 static void
17530 initialize_aarch64_tls_size (struct gcc_options *opts)
17532 if (aarch64_tls_size == 0)
17533 aarch64_tls_size = 24;
17535 switch (opts->x_aarch64_cmodel_var)
17537 case AARCH64_CMODEL_TINY:
17538 /* Both the default and maximum TLS size allowed under tiny are 1M, which
17539 needs two instructions to address, so we clamp the size to 24. */
17540 if (aarch64_tls_size > 24)
17541 aarch64_tls_size = 24;
17542 break;
17543 case AARCH64_CMODEL_SMALL:
17544 /* The maximum TLS size allowed under small is 4G. */
17545 if (aarch64_tls_size > 32)
17546 aarch64_tls_size = 32;
17547 break;
17548 case AARCH64_CMODEL_LARGE:
17549 /* The maximum TLS size allowed under large is 16E.
17550 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
17551 if (aarch64_tls_size > 48)
17552 aarch64_tls_size = 48;
17553 break;
17554 default:
17555 gcc_unreachable ();
17558 return;
17561 /* Return the CPU corresponding to the enum CPU. */
17563 static const struct processor *
17564 aarch64_get_tune_cpu (enum aarch64_processor cpu)
17566 gcc_assert (cpu != aarch64_none);
17568 return &all_cores[cpu];
17571 /* Return the architecture corresponding to the enum ARCH. */
17573 static const struct processor *
17574 aarch64_get_arch (enum aarch64_arch arch)
17576 gcc_assert (arch != aarch64_no_arch);
17578 return &all_architectures[arch];
17581 /* Parse STRING looking for options in the format:
17582 string :: option:string
17583 option :: name=substring
17584 name :: {a-z}
17585 substring :: defined by option. */
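/* For instance, an override string such as
   -moverride=fuse=adrp+add.cmp+branch:sve_width=256
   is split on ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256",
   each of which is handed to the matching parser above; the fuse tokens
   themselves are separated by '.'.  (Illustrative only; the accepted
   option names are defined by aarch64_tuning_override_functions.)  */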
17587 static void
17588 aarch64_parse_override_string (const char* input_string,
17589 struct tune_params* tune)
17591 const char separator = ':';
17592 size_t string_length = strlen (input_string) + 1;
17593 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17594 char *string = string_root;
17595 strncpy (string, input_string, string_length);
17596 string[string_length - 1] = '\0';
17598 char* ntoken = string;
17600 while ((ntoken = strchr (string, separator)))
17602 size_t token_length = ntoken - string;
17603 /* Make this substring look like a string. */
17604 *ntoken = '\0';
17605 aarch64_parse_one_override_token (string, token_length, tune);
17606 string = ++ntoken;
17609 /* One last option to parse. */
17610 aarch64_parse_one_override_token (string, strlen (string), tune);
17611 free (string_root);
17614 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17615 are best for a generic target with the currently-enabled architecture
17616 extensions. */
17617 static void
17618 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17620 /* Neoverse V1 is the only core that is known to benefit from
17621 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17622 point enabling it for SVE2 and above. */
17623 if (TARGET_SVE2)
17624 current_tune.extra_tuning_flags
17625 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17628 static void
17629 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17631 if (accepted_branch_protection_string)
17633 opts->x_aarch64_branch_protection_string
17634 = xstrdup (accepted_branch_protection_string);
17637 /* PR 70044: We have to be careful about being called multiple times for the
17638 same function. This means all changes should be repeatable. */
17640 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17641 Disable the frame pointer flag so the mid-end will not use a frame
17642 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17643 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17644 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17645 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17646 if (opts->x_flag_omit_frame_pointer == 0)
17647 opts->x_flag_omit_frame_pointer = 2;
17649 /* If not optimizing for size, set the default
17650 alignment to what the target wants. */
17651 if (!opts->x_optimize_size)
17653 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17654 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17655 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17656 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17657 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17658 opts->x_str_align_functions = aarch64_tune_params.function_align;
17661 /* We default to no pc-relative literal loads. */
17663 aarch64_pcrelative_literal_loads = false;
17665 /* If -mpc-relative-literal-loads is set on the command line, this
17666 implies that the user asked for PC relative literal loads. */
17667 if (opts->x_pcrelative_literal_loads == 1)
17668 aarch64_pcrelative_literal_loads = true;
17670 /* In the tiny memory model it makes no sense to disallow PC relative
17671 literal pool loads. */
17672 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17673 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17674 aarch64_pcrelative_literal_loads = true;
17676 /* When enabling the lower precision Newton series for the square root, also
17677 enable it for the reciprocal square root, since the latter is an
17678 intermediary step for the former. */
17679 if (flag_mlow_precision_sqrt)
17680 flag_mrecip_low_precision_sqrt = true;
17683 /* 'Unpack' the internal tuning structs and update the options
17684 in OPTS. The caller must have set up selected_tune and selected_arch
17685 as all the other target-specific codegen decisions are
17686 derived from them. */
17688 void
17689 aarch64_override_options_internal (struct gcc_options *opts)
17691 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17692 aarch64_tune_flags = tune->flags;
17693 aarch64_tune = tune->sched_core;
17694 /* Make a copy of the tuning parameters attached to the core, which
17695 we may later overwrite. */
17696 aarch64_tune_params = *(tune->tune);
17697 if (tune->tune == &generic_tunings)
17698 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17700 if (opts->x_aarch64_override_tune_string)
17701 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17702 &aarch64_tune_params);
17704 /* This target defaults to strict volatile bitfields. */
17705 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17706 opts->x_flag_strict_volatile_bitfields = 1;
17708 if (aarch64_stack_protector_guard == SSP_GLOBAL
17709 && opts->x_aarch64_stack_protector_guard_offset_str)
17711 error ("incompatible options %<-mstack-protector-guard=global%> and "
17712 "%<-mstack-protector-guard-offset=%s%>",
17713 aarch64_stack_protector_guard_offset_str);
17716 if (aarch64_stack_protector_guard == SSP_SYSREG
17717 && !(opts->x_aarch64_stack_protector_guard_offset_str
17718 && opts->x_aarch64_stack_protector_guard_reg_str))
17720 error ("both %<-mstack-protector-guard-offset%> and "
17721 "%<-mstack-protector-guard-reg%> must be used "
17722 "with %<-mstack-protector-guard=sysreg%>");
17725 if (opts->x_aarch64_stack_protector_guard_reg_str)
17727 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17728 error ("specify a system register with a small string length");
17731 if (opts->x_aarch64_stack_protector_guard_offset_str)
17733 char *end;
17734 const char *str = aarch64_stack_protector_guard_offset_str;
17735 errno = 0;
17736 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17737 if (!*str || *end || errno)
17738 error ("%qs is not a valid offset in %qs", str,
17739 "-mstack-protector-guard-offset=");
17740 aarch64_stack_protector_guard_offset = offs;
17743 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17744 && !fixed_regs[R18_REGNUM])
17745 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17747 initialize_aarch64_code_model (opts);
17748 initialize_aarch64_tls_size (opts);
17750 int queue_depth = 0;
17751 switch (aarch64_tune_params.autoprefetcher_model)
17753 case tune_params::AUTOPREFETCHER_OFF:
17754 queue_depth = -1;
17755 break;
17756 case tune_params::AUTOPREFETCHER_WEAK:
17757 queue_depth = 0;
17758 break;
17759 case tune_params::AUTOPREFETCHER_STRONG:
17760 queue_depth = max_insn_queue_index + 1;
17761 break;
17762 default:
17763 gcc_unreachable ();
17766 /* We don't mind passing in global_options_set here as we don't use
17767 the *options_set structs anyway. */
17768 SET_OPTION_IF_UNSET (opts, &global_options_set,
17769 param_sched_autopref_queue_depth, queue_depth);
17771 /* If using only Advanced SIMD for autovectorization, disable the SVE
17772 vector cost comparison. */
17773 if (aarch64_autovec_preference == 1)
17774 SET_OPTION_IF_UNSET (opts, &global_options_set,
17775 aarch64_sve_compare_costs, 0);
17777 /* Set up parameters to be used in prefetching algorithm. Do not
17778 override the defaults unless we are tuning for a core we have
17779 researched values for. */
17780 if (aarch64_tune_params.prefetch->num_slots > 0)
17781 SET_OPTION_IF_UNSET (opts, &global_options_set,
17782 param_simultaneous_prefetches,
17783 aarch64_tune_params.prefetch->num_slots);
17784 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17785 SET_OPTION_IF_UNSET (opts, &global_options_set,
17786 param_l1_cache_size,
17787 aarch64_tune_params.prefetch->l1_cache_size);
17788 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17789 SET_OPTION_IF_UNSET (opts, &global_options_set,
17790 param_l1_cache_line_size,
17791 aarch64_tune_params.prefetch->l1_cache_line_size);
17793 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17795 SET_OPTION_IF_UNSET (opts, &global_options_set,
17796 param_destruct_interfere_size,
17797 aarch64_tune_params.prefetch->l1_cache_line_size);
17798 SET_OPTION_IF_UNSET (opts, &global_options_set,
17799 param_construct_interfere_size,
17800 aarch64_tune_params.prefetch->l1_cache_line_size);
17802 else
17804 /* For a generic AArch64 target, cover the current range of cache line
17805 sizes. */
17806 SET_OPTION_IF_UNSET (opts, &global_options_set,
17807 param_destruct_interfere_size,
17808 256);
17809 SET_OPTION_IF_UNSET (opts, &global_options_set,
17810 param_construct_interfere_size,
17811 64);
17814 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17815 SET_OPTION_IF_UNSET (opts, &global_options_set,
17816 param_l2_cache_size,
17817 aarch64_tune_params.prefetch->l2_cache_size);
17818 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17819 SET_OPTION_IF_UNSET (opts, &global_options_set,
17820 param_prefetch_dynamic_strides, 0);
17821 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17822 SET_OPTION_IF_UNSET (opts, &global_options_set,
17823 param_prefetch_minimum_stride,
17824 aarch64_tune_params.prefetch->minimum_stride);
17826 /* Use the alternative scheduling-pressure algorithm by default. */
17827 SET_OPTION_IF_UNSET (opts, &global_options_set,
17828 param_sched_pressure_algorithm,
17829 SCHED_PRESSURE_MODEL);
17831 /* Validate the guard size. */
17832 int guard_size = param_stack_clash_protection_guard_size;
17834 if (guard_size != 12 && guard_size != 16)
17835 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17836 "size. Given value %d (%llu KB) is out of range",
17837 guard_size, (1ULL << guard_size) / 1024ULL);
17839 /* Enforce that the probing interval is the same as the guard size so the
17840 mid-end does the right thing. */
17841 SET_OPTION_IF_UNSET (opts, &global_options_set,
17842 param_stack_clash_protection_probe_interval,
17843 guard_size);
17845 /* The maybe_set calls won't update the value if the user has explicitly set
17846 one. This means we need to validate that the probing interval and guard size
17847 are equal. */
17848 int probe_interval
17849 = param_stack_clash_protection_probe_interval;
17850 if (guard_size != probe_interval)
17851 error ("stack clash guard size %<%d%> must be equal to probing interval "
17852 "%<%d%>", guard_size, probe_interval);
17854 /* Enable software prefetching at the specified optimization level for
17855 CPUs that have prefetch. Lower the optimization level threshold by 1
17856 when profiling is enabled. */
17857 if (opts->x_flag_prefetch_loop_arrays < 0
17858 && !opts->x_optimize_size
17859 && aarch64_tune_params.prefetch->default_opt_level >= 0
17860 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
17861 opts->x_flag_prefetch_loop_arrays = 1;
17863 aarch64_override_options_after_change_1 (opts);
17866 /* Print a hint with a suggestion for a core or architecture name that
17867 most closely resembles what the user passed in STR. ARCH is true if
17868 the user is asking for an architecture name. ARCH is false if the user
17869 is asking for a core name. */
17871 static void
17872 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
17874 auto_vec<const char *> candidates;
17875 const struct processor *entry = arch ? all_architectures : all_cores;
17876 for (; entry->name != NULL; entry++)
17877 candidates.safe_push (entry->name);
17879 #ifdef HAVE_LOCAL_CPU_DETECT
17880 /* Also add "native" as a possible value. */
17881 if (arch)
17882 candidates.safe_push ("native");
17883 #endif
17885 char *s;
17886 const char *hint = candidates_list_and_hint (str, s, candidates);
17887 if (hint)
17888 inform (input_location, "valid arguments are: %s;"
17889 " did you mean %qs?", s, hint);
17890 else
17891 inform (input_location, "valid arguments are: %s", s);
17893 XDELETEVEC (s);
17896 /* Print a hint with a suggestion for a core name that most closely resembles
17897 what the user passed in STR. */
17899 inline static void
17900 aarch64_print_hint_for_core (const char *str)
17902 aarch64_print_hint_for_core_or_arch (str, false);
17905 /* Print a hint with a suggestion for an architecture name that most closely
17906 resembles what the user passed in STR. */
17908 inline static void
17909 aarch64_print_hint_for_arch (const char *str)
17911 aarch64_print_hint_for_core_or_arch (str, true);
17915 /* Print a hint with a suggestion for an extension name
17916 that most closely resembles what the user passed in STR. */
17918 void
17919 aarch64_print_hint_for_extensions (const std::string &str)
17921 auto_vec<const char *> candidates;
17922 aarch64_get_all_extension_candidates (&candidates);
17923 char *s;
17924 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
17925 if (hint)
17926 inform (input_location, "valid arguments are: %s;"
17927 " did you mean %qs?", s, hint);
17928 else
17929 inform (input_location, "valid arguments are: %s", s);
17931 XDELETEVEC (s);
17934 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
17935 specified in STR and throw errors if appropriate. Put the results if
17936 they are valid in RES and ISA_FLAGS. Return whether the option is
17937 valid. */
17939 static bool
17940 aarch64_validate_mcpu (const char *str, const struct processor **res,
17941 aarch64_feature_flags *isa_flags)
17943 std::string invalid_extension;
17944 enum aarch64_parse_opt_result parse_res
17945 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
17947 if (parse_res == AARCH64_PARSE_OK)
17948 return true;
17950 switch (parse_res)
17952 case AARCH64_PARSE_MISSING_ARG:
17953 error ("missing cpu name in %<-mcpu=%s%>", str);
17954 break;
17955 case AARCH64_PARSE_INVALID_ARG:
17956 error ("unknown value %qs for %<-mcpu%>", str);
17957 aarch64_print_hint_for_core (str);
17958 break;
17959 case AARCH64_PARSE_INVALID_FEATURE:
17960 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
17961 invalid_extension.c_str (), str);
17962 aarch64_print_hint_for_extensions (invalid_extension);
17963 break;
17964 default:
17965 gcc_unreachable ();
17968 return false;
17971 /* Straight line speculation indicators. */
17972 enum aarch64_sls_hardening_type
17974 SLS_NONE = 0,
17975 SLS_RETBR = 1,
17976 SLS_BLR = 2,
17977 SLS_ALL = 3,
17979 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
17981 /* Return whether we should mitigate Straight Line Speculation for the RET
17982 and BR instructions. */
17983 bool
17984 aarch64_harden_sls_retbr_p (void)
17986 return aarch64_sls_hardening & SLS_RETBR;
17989 /* Return whether we should mitigate Straight Line Speculation for the BLR
17990 instruction. */
17991 bool
17992 aarch64_harden_sls_blr_p (void)
17994 return aarch64_sls_hardening & SLS_BLR;
17997 /* For now we only allow setting these options globally; in the future we may
17998 allow setting them per function. */
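/* For example, -mharden-sls=retbr,blr enables both mitigations and is
   equivalent to -mharden-sls=all (SLS_RETBR | SLS_BLR == SLS_ALL), while
   "none" and "all" must appear on their own, as checked below.  */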
17999 static void
18000 aarch64_validate_sls_mitigation (const char *const_str)
18002 char *token_save = NULL;
18003 char *str = NULL;
18005 if (strcmp (const_str, "none") == 0)
18007 aarch64_sls_hardening = SLS_NONE;
18008 return;
18010 if (strcmp (const_str, "all") == 0)
18012 aarch64_sls_hardening = SLS_ALL;
18013 return;
18016 char *str_root = xstrdup (const_str);
18017 str = strtok_r (str_root, ",", &token_save);
18018 if (!str)
18019 error ("invalid argument given to %<-mharden-sls=%>");
18021 int temp = SLS_NONE;
18022 while (str)
18024 if (strcmp (str, "blr") == 0)
18025 temp |= SLS_BLR;
18026 else if (strcmp (str, "retbr") == 0)
18027 temp |= SLS_RETBR;
18028 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18030 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18031 break;
18033 else
18035 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18036 break;
18038 str = strtok_r (NULL, ",", &token_save);
18040 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18041 free (str_root);
18044 /* Parses CONST_STR for branch protection features specified in
18045    aarch64_branch_protect_types, and sets any global variables required.  Returns
18046 the parsing result and assigns LAST_STR to the last processed token from
18047 CONST_STR so that it can be used for error reporting. */
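/* Illustrative example of the grammar handled below (assuming the usual
   entries in aarch64_branch_protect_types): "pac-ret+leaf+bti" is split at
   each '+'; "pac-ret" matches a top-level type and its handler runs, "leaf"
   is then consumed as a subtype of "pac-ret", and "bti" falls back to the
   top-level table again.  */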
18049 static enum
18050 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
18051 char** last_str)
18053 char *str_root = xstrdup (const_str);
18054 char* token_save = NULL;
18055 char *str = strtok_r (str_root, "+", &token_save);
18056 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
18057 if (!str)
18058 res = AARCH64_PARSE_MISSING_ARG;
18059 else
18061 char *next_str = strtok_r (NULL, "+", &token_save);
18062 /* Reset the branch protection features to their defaults. */
18063 aarch64_handle_no_branch_protection (NULL, NULL);
18065 while (str && res == AARCH64_PARSE_OK)
18067 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
18068 bool found = false;
18069 /* Search for this type. */
18070 while (type && type->name && !found && res == AARCH64_PARSE_OK)
18072 if (strcmp (str, type->name) == 0)
18074 found = true;
18075 res = type->handler (str, next_str);
18076 str = next_str;
18077 next_str = strtok_r (NULL, "+", &token_save);
18079 else
18080 type++;
18082 if (found && res == AARCH64_PARSE_OK)
18084 bool found_subtype = true;
18085 /* Loop through each token until we find one that isn't a
18086 subtype. */
18087 while (found_subtype)
18089 found_subtype = false;
18090 const aarch64_branch_protect_type *subtype = type->subtypes;
18091 /* Search for the subtype. */
18092 while (str && subtype && subtype->name && !found_subtype
18093 && res == AARCH64_PARSE_OK)
18095 if (strcmp (str, subtype->name) == 0)
18097 found_subtype = true;
18098 res = subtype->handler (str, next_str);
18099 str = next_str;
18100 next_str = strtok_r (NULL, "+", &token_save);
18102 else
18103 subtype++;
18107 else if (!found)
18108 res = AARCH64_PARSE_INVALID_ARG;
18111 /* Copy the last processed token into the argument to pass it back.
18112 Used by option and attribute validation to print the offending token. */
18113 if (last_str)
18115 if (str) strcpy (*last_str, str);
18116 else *last_str = NULL;
18118 if (res == AARCH64_PARSE_OK)
18120 /* If needed, alloc the accepted string then copy in const_str.
18121 Used by override_option_after_change_1. */
18122 if (!accepted_branch_protection_string)
18123 accepted_branch_protection_string = (char *) xmalloc (
18124 BRANCH_PROTECT_STR_MAX
18125 + 1);
18126 strncpy (accepted_branch_protection_string, const_str,
18127 BRANCH_PROTECT_STR_MAX + 1);
18128 /* Forcibly null-terminate. */
18129 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
18131 return res;
18134 static bool
18135 aarch64_validate_mbranch_protection (const char *const_str)
18137   char *str = (char *) xmalloc (strlen (const_str) + 1); /* +1 for the terminating NUL.  */
18138 enum aarch64_parse_opt_result res =
18139 aarch64_parse_branch_protection (const_str, &str);
18140 if (res == AARCH64_PARSE_INVALID_ARG)
18141 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
18142 else if (res == AARCH64_PARSE_MISSING_ARG)
18143 error ("missing argument for %<-mbranch-protection=%>");
18144 free (str);
18145 return res == AARCH64_PARSE_OK;
18148 /* Validate a command-line -march option. Parse the arch and extensions
18149 (if any) specified in STR and throw errors if appropriate. Put the
18150 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18151 option is valid. */
18153 static bool
18154 aarch64_validate_march (const char *str, const struct processor **res,
18155 aarch64_feature_flags *isa_flags)
18157 std::string invalid_extension;
18158 enum aarch64_parse_opt_result parse_res
18159 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18161 if (parse_res == AARCH64_PARSE_OK)
18162 return true;
18164 switch (parse_res)
18166 case AARCH64_PARSE_MISSING_ARG:
18167 error ("missing arch name in %<-march=%s%>", str);
18168 break;
18169 case AARCH64_PARSE_INVALID_ARG:
18170 error ("unknown value %qs for %<-march%>", str);
18171 aarch64_print_hint_for_arch (str);
18172 /* A common user error is confusing -march and -mcpu.
18173 If the -march string matches a known CPU suggest -mcpu. */
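      /* e.g. "-march=cortex-a53" is not an architecture name, but because it
	 parses successfully as a CPU the user is pointed at
	 "-mcpu=cortex-a53" instead.  */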
18174 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18175 if (parse_res == AARCH64_PARSE_OK)
18176 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18177 break;
18178 case AARCH64_PARSE_INVALID_FEATURE:
18179 error ("invalid feature modifier %qs in %<-march=%s%>",
18180 invalid_extension.c_str (), str);
18181 aarch64_print_hint_for_extensions (invalid_extension);
18182 break;
18183 default:
18184 gcc_unreachable ();
18187 return false;
18190 /* Validate a command-line -mtune option. Parse the cpu
18191 specified in STR and throw errors if appropriate. Put the
18192 result, if it is valid, in RES. Return whether the option is
18193 valid. */
18195 static bool
18196 aarch64_validate_mtune (const char *str, const struct processor **res)
18198 enum aarch64_parse_opt_result parse_res
18199 = aarch64_parse_tune (str, res);
18201 if (parse_res == AARCH64_PARSE_OK)
18202 return true;
18204 switch (parse_res)
18206 case AARCH64_PARSE_MISSING_ARG:
18207 error ("missing cpu name in %<-mtune=%s%>", str);
18208 break;
18209 case AARCH64_PARSE_INVALID_ARG:
18210 error ("unknown value %qs for %<-mtune%>", str);
18211 aarch64_print_hint_for_core (str);
18212 break;
18213 default:
18214 gcc_unreachable ();
18216 return false;
18219 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18221 static poly_uint16
18222 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18224 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18225 on big-endian targets, so we would need to forbid subregs that convert
18226 from one to the other. By default a reinterpret sequence would then
18227 involve a store to memory in one mode and a load back in the other.
18228 Even if we optimize that sequence using reverse instructions,
18229 it would still be a significant potential overhead.
18231 For now, it seems better to generate length-agnostic code for that
18232 case instead. */
18233 if (value == SVE_SCALABLE
18234 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18235 return poly_uint16 (2, 2);
18236 else
18237 return (int) value / 64;
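/* A short worked example (assuming the enum values encode the vector width
   in bits, as the division above implies): -msve-vector-bits=256 yields
   VG = 256 / 64 = 4, i.e. four 64-bit granules per SVE vector.  */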
18240 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18241 aarch64_isa_flags accordingly. */
18243 void
18244 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18246 aarch64_set_asm_isa_flags (&global_options, flags);
18249 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18250 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18251 tuning structs. In particular it must set selected_tune and
18252 aarch64_asm_isa_flags that define the available ISA features and tuning
18253 decisions. It must also set selected_arch as this will be used to
18254 output the .arch asm tags for each function. */
18256 static void
18257 aarch64_override_options (void)
18259 aarch64_feature_flags cpu_isa = 0;
18260 aarch64_feature_flags arch_isa = 0;
18261 aarch64_set_asm_isa_flags (0);
18263 const struct processor *cpu = NULL;
18264 const struct processor *arch = NULL;
18265 const struct processor *tune = NULL;
18267 if (aarch64_harden_sls_string)
18268 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18270 if (aarch64_branch_protection_string)
18271 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
18273 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18274 If either of -march or -mtune is given, they override their
18275 respective component of -mcpu. */
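  /* For example (illustrative only): "-mcpu=cortex-a53 -mtune=cortex-a72"
     takes the architecture and ISA flags from cortex-a53 but the tuning
     decisions from cortex-a72, whereas "-mcpu=cortex-a53 -march=armv8.2-a"
     takes the ISA from -march and only the tuning from -mcpu.  */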
18276 if (aarch64_cpu_string)
18277 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18279 if (aarch64_arch_string)
18280 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18282 if (aarch64_tune_string)
18283 aarch64_validate_mtune (aarch64_tune_string, &tune);
18285 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18286 SUBTARGET_OVERRIDE_OPTIONS;
18287 #endif
18289 if (cpu && arch)
18291 /* If both -mcpu and -march are specified, warn if they are not
18292 architecturally compatible and prefer the -march ISA flags. */
18293 if (arch->arch != cpu->arch)
18295 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18296 aarch64_cpu_string,
18297 aarch64_arch_string);
18300 selected_arch = arch->arch;
18301 aarch64_set_asm_isa_flags (arch_isa);
18303 else if (cpu)
18305 selected_arch = cpu->arch;
18306 aarch64_set_asm_isa_flags (cpu_isa);
18308 else if (arch)
18310 cpu = &all_cores[arch->ident];
18311 selected_arch = arch->arch;
18312 aarch64_set_asm_isa_flags (arch_isa);
18314 else
18316 /* No -mcpu or -march specified, so use the default CPU. */
18317 cpu = &all_cores[TARGET_CPU_DEFAULT];
18318 selected_arch = cpu->arch;
18319 aarch64_set_asm_isa_flags (cpu->flags);
18322 selected_tune = tune ? tune->ident : cpu->ident;
18324 if (aarch64_enable_bti == 2)
18326 #ifdef TARGET_ENABLE_BTI
18327 aarch64_enable_bti = 1;
18328 #else
18329 aarch64_enable_bti = 0;
18330 #endif
18333 /* Return address signing is currently not supported for ILP32 targets. For
18334 LP64 targets use the configured option in the absence of a command-line
18335 option for -mbranch-protection. */
18336 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18338 #ifdef TARGET_ENABLE_PAC_RET
18339 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
18340 #else
18341 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
18342 #endif
18345 #ifndef HAVE_AS_MABI_OPTION
18346 /* The compiler may have been configured with 2.23.* binutils, which does
18347 not have support for ILP32. */
18348 if (TARGET_ILP32)
18349 error ("assembler does not support %<-mabi=ilp32%>");
18350 #endif
18352 /* Convert -msve-vector-bits to a VG count. */
18353 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18355 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
18356 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18358 /* The pass to insert speculation tracking runs before
18359 shrink-wrapping and the latter does not know how to update the
18360      tracking status.  So disable shrink-wrapping in this case.  */
18361 if (aarch64_track_speculation)
18362 flag_shrink_wrap = 0;
18364 aarch64_override_options_internal (&global_options);
18366 /* Save these options as the default ones in case we push and pop them later
18367 while processing functions with potential target attributes. */
18368 target_option_default_node = target_option_current_node
18369 = build_target_option_node (&global_options, &global_options_set);
18372 /* Implement targetm.override_options_after_change. */
18374 static void
18375 aarch64_override_options_after_change (void)
18377 aarch64_override_options_after_change_1 (&global_options);
18380 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18381 static char *
18382 aarch64_offload_options (void)
18384 if (TARGET_ILP32)
18385 return xstrdup ("-foffload-abi=ilp32");
18386 else
18387 return xstrdup ("-foffload-abi=lp64");
18390 static struct machine_function *
18391 aarch64_init_machine_status (void)
18393 struct machine_function *machine;
18394 machine = ggc_cleared_alloc<machine_function> ();
18395 return machine;
18398 void
18399 aarch64_init_expanders (void)
18401 init_machine_status = aarch64_init_machine_status;
18404 /* A checking mechanism for the implementation of the various code models. */
18405 static void
18406 initialize_aarch64_code_model (struct gcc_options *opts)
18408 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18409 switch (opts->x_aarch64_cmodel_var)
18411 case AARCH64_CMODEL_TINY:
18412 if (opts->x_flag_pic)
18413 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18414 break;
18415 case AARCH64_CMODEL_SMALL:
18416 if (opts->x_flag_pic)
18418 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18419 aarch64_cmodel = (flag_pic == 2
18420 ? AARCH64_CMODEL_SMALL_PIC
18421 : AARCH64_CMODEL_SMALL_SPIC);
18422 #else
18423 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18424 #endif
18426 break;
18427 case AARCH64_CMODEL_LARGE:
18428 if (opts->x_flag_pic)
18429 sorry ("code model %qs with %<-f%s%>", "large",
18430 opts->x_flag_pic > 1 ? "PIC" : "pic");
18431 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18432 sorry ("code model %qs not supported in ilp32 mode", "large");
18433 break;
18434 case AARCH64_CMODEL_TINY_PIC:
18435 case AARCH64_CMODEL_SMALL_PIC:
18436 case AARCH64_CMODEL_SMALL_SPIC:
18437 gcc_unreachable ();
18441 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18442 using the information saved in PTR. */
18444 static void
18445 aarch64_option_restore (struct gcc_options *opts,
18446 struct gcc_options * /* opts_set */,
18447 struct cl_target_option * /* ptr */)
18449 aarch64_override_options_internal (opts);
18452 /* Implement TARGET_OPTION_PRINT. */
18454 static void
18455 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18457 const struct processor *cpu
18458 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18459 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18460 std::string extension
18461 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
18462 arch->flags);
18464 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18465 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18466 arch->name, extension.c_str ());
18469 static GTY(()) tree aarch64_previous_fndecl;
18471 void
18472 aarch64_reset_previous_fndecl (void)
18474 aarch64_previous_fndecl = NULL;
18477 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18478 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18479 make sure optab availability predicates are recomputed when necessary. */
18481 void
18482 aarch64_save_restore_target_globals (tree new_tree)
18484 if (TREE_TARGET_GLOBALS (new_tree))
18485 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18486 else if (new_tree == target_option_default_node)
18487 restore_target_globals (&default_target_globals);
18488 else
18489 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18492 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18493 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18494 of the function, if such exists. This function may be called multiple
18495 times on a single function so use aarch64_previous_fndecl to avoid
18496 setting up identical state. */
18498 static void
18499 aarch64_set_current_function (tree fndecl)
18501 if (!fndecl || fndecl == aarch64_previous_fndecl)
18502 return;
18504 tree old_tree = (aarch64_previous_fndecl
18505 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18506 : NULL_TREE);
18508 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18510 /* If current function has no attributes but the previous one did,
18511 use the default node. */
18512 if (!new_tree && old_tree)
18513 new_tree = target_option_default_node;
18515 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18516 the default have been handled by aarch64_save_restore_target_globals from
18517 aarch64_pragma_target_parse. */
18518 if (old_tree == new_tree)
18519 return;
18521 aarch64_previous_fndecl = fndecl;
18523 /* First set the target options. */
18524 cl_target_option_restore (&global_options, &global_options_set,
18525 TREE_TARGET_OPTION (new_tree));
18527 aarch64_save_restore_target_globals (new_tree);
18530 /* Enum describing the various ways we can handle attributes.
18531 In many cases we can reuse the generic option handling machinery. */
18533 enum aarch64_attr_opt_type
18535 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18536 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18537 aarch64_attr_enum, /* Attribute sets an enum variable. */
18538 aarch64_attr_custom /* Attribute requires a custom handling function. */
18541 /* All the information needed to handle a target attribute.
18542 NAME is the name of the attribute.
18543 ATTR_TYPE specifies the type of behavior of the attribute as described
18544 in the definition of enum aarch64_attr_opt_type.
18545 ALLOW_NEG is true if the attribute supports a "no-" form.
18546 HANDLER is the function that takes the attribute string as an argument
18547 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18548 OPT_NUM is the enum specifying the option that the attribute modifies.
18549 This is needed for attributes that mirror the behavior of a command-line
18550 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18551 aarch64_attr_enum. */
18553 struct aarch64_attribute_info
18555 const char *name;
18556 enum aarch64_attr_opt_type attr_type;
18557 bool allow_neg;
18558 bool (*handler) (const char *);
18559 enum opt_code opt_num;
18562 /* Handle the ARCH_STR argument to the arch= target attribute. */
18564 static bool
18565 aarch64_handle_attr_arch (const char *str)
18567 const struct processor *tmp_arch = NULL;
18568 std::string invalid_extension;
18569 aarch64_feature_flags tmp_flags;
18570 enum aarch64_parse_opt_result parse_res
18571 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
18573 if (parse_res == AARCH64_PARSE_OK)
18575 gcc_assert (tmp_arch);
18576 selected_arch = tmp_arch->arch;
18577 aarch64_set_asm_isa_flags (tmp_flags);
18578 return true;
18581 switch (parse_res)
18583 case AARCH64_PARSE_MISSING_ARG:
18584 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18585 break;
18586 case AARCH64_PARSE_INVALID_ARG:
18587 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18588 aarch64_print_hint_for_arch (str);
18589 break;
18590 case AARCH64_PARSE_INVALID_FEATURE:
18591 error ("invalid feature modifier %s of value %qs in "
18592 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18593 aarch64_print_hint_for_extensions (invalid_extension);
18594 break;
18595 default:
18596 gcc_unreachable ();
18599 return false;
18602 /* Handle the argument CPU_STR to the cpu= target attribute. */
18604 static bool
18605 aarch64_handle_attr_cpu (const char *str)
18607 const struct processor *tmp_cpu = NULL;
18608 std::string invalid_extension;
18609 aarch64_feature_flags tmp_flags;
18610 enum aarch64_parse_opt_result parse_res
18611 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
18613 if (parse_res == AARCH64_PARSE_OK)
18615 gcc_assert (tmp_cpu);
18616 selected_tune = tmp_cpu->ident;
18617 selected_arch = tmp_cpu->arch;
18618 aarch64_set_asm_isa_flags (tmp_flags);
18619 return true;
18622 switch (parse_res)
18624 case AARCH64_PARSE_MISSING_ARG:
18625 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18626 break;
18627 case AARCH64_PARSE_INVALID_ARG:
18628 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18629 aarch64_print_hint_for_core (str);
18630 break;
18631 case AARCH64_PARSE_INVALID_FEATURE:
18632 error ("invalid feature modifier %qs of value %qs in "
18633 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18634 aarch64_print_hint_for_extensions (invalid_extension);
18635 break;
18636 default:
18637 gcc_unreachable ();
18640 return false;
18643 /* Handle the argument STR to the branch-protection= attribute. */
18645 static bool
18646 aarch64_handle_attr_branch_protection (const char* str)
18648 char *err_str = (char *) xmalloc (strlen (str) + 1);
18649 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
18650 &err_str);
18651 bool success = false;
18652 switch (res)
18654 case AARCH64_PARSE_MISSING_ARG:
18655 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18656 " attribute");
18657 break;
18658 case AARCH64_PARSE_INVALID_ARG:
18659 error ("invalid protection type %qs in %<target(\"branch-protection"
18660 "=\")%> pragma or attribute", err_str);
18661 break;
18662 case AARCH64_PARSE_OK:
18663 success = true;
18664 /* Fall through. */
18665 case AARCH64_PARSE_INVALID_FEATURE:
18666 break;
18667 default:
18668 gcc_unreachable ();
18670 free (err_str);
18671 return success;
18674 /* Handle the argument STR to the tune= target attribute. */
18676 static bool
18677 aarch64_handle_attr_tune (const char *str)
18679 const struct processor *tmp_tune = NULL;
18680 enum aarch64_parse_opt_result parse_res
18681 = aarch64_parse_tune (str, &tmp_tune);
18683 if (parse_res == AARCH64_PARSE_OK)
18685 gcc_assert (tmp_tune);
18686 selected_tune = tmp_tune->ident;
18687 return true;
18690 switch (parse_res)
18692 case AARCH64_PARSE_INVALID_ARG:
18693 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18694 aarch64_print_hint_for_core (str);
18695 break;
18696 default:
18697 gcc_unreachable ();
18700 return false;
18703 /* Parse an architecture extensions target attribute string specified in STR.
18704 For example "+fp+nosimd". Show any errors if needed. Return TRUE
18705 if successful. Update aarch64_isa_flags to reflect the ISA features
18706 modified. */
18708 static bool
18709 aarch64_handle_attr_isa_flags (char *str)
18711 enum aarch64_parse_opt_result parse_res;
18712 auto isa_flags = aarch64_asm_isa_flags;
18714 /* We allow "+nothing" in the beginning to clear out all architectural
18715 features if the user wants to handpick specific features. */
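  /* For instance (a rough illustration): __attribute__ ((target ("+nothing+fp")))
     starts from an empty feature set and then enables only what the "+fp"
     modifier (and anything it implies) turns on.  */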
18716 if (strncmp ("+nothing", str, 8) == 0)
18718 isa_flags = 0;
18719 str += 8;
18722 std::string invalid_extension;
18723 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18725 if (parse_res == AARCH64_PARSE_OK)
18727 aarch64_set_asm_isa_flags (isa_flags);
18728 return true;
18731 switch (parse_res)
18733 case AARCH64_PARSE_MISSING_ARG:
18734 error ("missing value in %<target()%> pragma or attribute");
18735 break;
18737 case AARCH64_PARSE_INVALID_FEATURE:
18738 error ("invalid feature modifier %qs of value %qs in "
18739 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18740 break;
18742 default:
18743 gcc_unreachable ();
18746 return false;
18749 /* The target attributes that we support. On top of these we also support just
18750 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18751 handled explicitly in aarch64_process_one_target_attr. */
18753 static const struct aarch64_attribute_info aarch64_attributes[] =
18755 { "general-regs-only", aarch64_attr_mask, false, NULL,
18756 OPT_mgeneral_regs_only },
18757 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18758 OPT_mfix_cortex_a53_835769 },
18759 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18760 OPT_mfix_cortex_a53_843419 },
18761 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18762 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18763 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18764 OPT_momit_leaf_frame_pointer },
18765 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18766 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18767 OPT_march_ },
18768 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18769 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18770 OPT_mtune_ },
18771 { "branch-protection", aarch64_attr_custom, false,
18772 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18773 { "sign-return-address", aarch64_attr_enum, false, NULL,
18774 OPT_msign_return_address_ },
18775 { "outline-atomics", aarch64_attr_bool, true, NULL,
18776 OPT_moutline_atomics},
18777 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
18780 /* Parse ARG_STR which contains the definition of one target attribute.
18781 Show appropriate errors if any or return true if the attribute is valid. */
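/* For example (illustrative only): "arch=armv8.2-a" is dispatched to
   aarch64_handle_attr_arch with ARG "armv8.2-a", while
   "no-omit-leaf-frame-pointer" matches the "omit-leaf-frame-pointer" entry
   with INVERT set.  */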
18783 static bool
18784 aarch64_process_one_target_attr (char *arg_str)
18786 bool invert = false;
18788 size_t len = strlen (arg_str);
18790 if (len == 0)
18792 error ("malformed %<target()%> pragma or attribute");
18793 return false;
18796 char *str_to_check = (char *) alloca (len + 1);
18797 strcpy (str_to_check, arg_str);
18799 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18800 It is easier to detect and handle it explicitly here rather than going
18801 through the machinery for the rest of the target attributes in this
18802 function. */
18803 if (*str_to_check == '+')
18804 return aarch64_handle_attr_isa_flags (str_to_check);
18806 if (len > 3 && startswith (str_to_check, "no-"))
18808 invert = true;
18809 str_to_check += 3;
18811 char *arg = strchr (str_to_check, '=');
18813 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18814 and point ARG to "foo". */
18815 if (arg)
18817 *arg = '\0';
18818 arg++;
18820 const struct aarch64_attribute_info *p_attr;
18821 bool found = false;
18822 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18824 /* If the names don't match up, or the user has given an argument
18825 to an attribute that doesn't accept one, or didn't give an argument
18826 to an attribute that expects one, fail to match. */
18827 if (strcmp (str_to_check, p_attr->name) != 0)
18828 continue;
18830 found = true;
18831 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18832 || p_attr->attr_type == aarch64_attr_enum;
18834 if (attr_need_arg_p ^ (arg != NULL))
18836 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18837 return false;
18840 /* If the name matches but the attribute does not allow "no-" versions
18841 then we can't match. */
18842 if (invert && !p_attr->allow_neg)
18844 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18845 return false;
18848 switch (p_attr->attr_type)
18850 /* Has a custom handler registered.
18851 For example, cpu=, arch=, tune=. */
18852 case aarch64_attr_custom:
18853 gcc_assert (p_attr->handler);
18854 if (!p_attr->handler (arg))
18855 return false;
18856 break;
18858 /* Either set or unset a boolean option. */
18859 case aarch64_attr_bool:
18861 struct cl_decoded_option decoded;
18863 generate_option (p_attr->opt_num, NULL, !invert,
18864 CL_TARGET, &decoded);
18865 aarch64_handle_option (&global_options, &global_options_set,
18866 &decoded, input_location);
18867 break;
18869 /* Set or unset a bit in the target_flags. aarch64_handle_option
18870 should know what mask to apply given the option number. */
18871 case aarch64_attr_mask:
18873 struct cl_decoded_option decoded;
18874 /* We only need to specify the option number.
18875 aarch64_handle_option will know which mask to apply. */
18876 decoded.opt_index = p_attr->opt_num;
18877 decoded.value = !invert;
18878 aarch64_handle_option (&global_options, &global_options_set,
18879 &decoded, input_location);
18880 break;
18882 /* Use the option setting machinery to set an option to an enum. */
18883 case aarch64_attr_enum:
18885 gcc_assert (arg);
18886 bool valid;
18887 int value;
18888 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
18889 &value, CL_TARGET);
18890 if (valid)
18892 set_option (&global_options, NULL, p_attr->opt_num, value,
18893 NULL, DK_UNSPECIFIED, input_location,
18894 global_dc);
18896 else
18898 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
18900 break;
18902 default:
18903 gcc_unreachable ();
18907 /* If we reached here we either have found an attribute and validated
18908 it or didn't match any. If we matched an attribute but its arguments
18909 were malformed we will have returned false already. */
18910 return found;
18913 /* Count how many times the character C appears in
18914 NULL-terminated string STR. */
18916 static unsigned int
18917 num_occurences_in_str (char c, char *str)
18919 unsigned int res = 0;
18920 while (*str != '\0')
18922 if (*str == c)
18923 res++;
18925 str++;
18928 return res;
18931 /* Parse the tree in ARGS that contains the target attribute information
18932 and update the global target options space. */
18934 bool
18935 aarch64_process_target_attr (tree args)
18937 if (TREE_CODE (args) == TREE_LIST)
18941 tree head = TREE_VALUE (args);
18942 if (head)
18944 if (!aarch64_process_target_attr (head))
18945 return false;
18947 args = TREE_CHAIN (args);
18948 } while (args);
18950 return true;
18953 if (TREE_CODE (args) != STRING_CST)
18955 error ("attribute %<target%> argument not a string");
18956 return false;
18959 size_t len = strlen (TREE_STRING_POINTER (args));
18960 char *str_to_check = (char *) alloca (len + 1);
18961 strcpy (str_to_check, TREE_STRING_POINTER (args));
18963 if (len == 0)
18965 error ("malformed %<target()%> pragma or attribute");
18966 return false;
18969 /* Used to catch empty spaces between commas i.e.
18970 attribute ((target ("attr1,,attr2"))). */
18971 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
18973 /* Handle multiple target attributes separated by ','. */
18974 char *token = strtok_r (str_to_check, ",", &str_to_check);
18976 unsigned int num_attrs = 0;
18977 while (token)
18979 num_attrs++;
18980 if (!aarch64_process_one_target_attr (token))
18982 /* Check if token is possibly an arch extension without
18983 leading '+'. */
18984 aarch64_feature_flags isa_temp = 0;
18985 auto with_plus = std::string ("+") + token;
18986 enum aarch64_parse_opt_result ext_res
18987 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
18989 if (ext_res == AARCH64_PARSE_OK)
18990 error ("arch extension %<%s%> should be prefixed by %<+%>",
18991 token);
18992 else
18993 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
18994 return false;
18997 token = strtok_r (NULL, ",", &str_to_check);
19000 if (num_attrs != num_commas + 1)
19002 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19003 return false;
19006 return true;
19009 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19010 process attribute ((target ("..."))). */
19012 static bool
19013 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19015 struct cl_target_option cur_target;
19016 bool ret;
19017 tree old_optimize;
19018 tree new_target, new_optimize;
19019 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19021 /* If what we're processing is the current pragma string then the
19022 target option node is already stored in target_option_current_node
19023 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19024 having to re-parse the string. This is especially useful to keep
19025 arm_neon.h compile times down since that header contains a lot
19026 of intrinsics enclosed in pragmas. */
19027 if (!existing_target && args == current_target_pragma)
19029 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19030 return true;
19032 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19034 old_optimize
19035 = build_optimization_node (&global_options, &global_options_set);
19036 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19038 /* If the function changed the optimization levels as well as setting
19039 target options, start with the optimizations specified. */
19040 if (func_optimize && func_optimize != old_optimize)
19041 cl_optimization_restore (&global_options, &global_options_set,
19042 TREE_OPTIMIZATION (func_optimize));
19044 /* Save the current target options to restore at the end. */
19045 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19047 /* If fndecl already has some target attributes applied to it, unpack
19048 them so that we add this attribute on top of them, rather than
19049 overwriting them. */
19050 if (existing_target)
19052 struct cl_target_option *existing_options
19053 = TREE_TARGET_OPTION (existing_target);
19055 if (existing_options)
19056 cl_target_option_restore (&global_options, &global_options_set,
19057 existing_options);
19059 else
19060 cl_target_option_restore (&global_options, &global_options_set,
19061 TREE_TARGET_OPTION (target_option_current_node));
19063 ret = aarch64_process_target_attr (args);
19065 /* Set up any additional state. */
19066 if (ret)
19068 aarch64_override_options_internal (&global_options);
19069 new_target = build_target_option_node (&global_options,
19070 &global_options_set);
19072 else
19073 new_target = NULL;
19075 new_optimize = build_optimization_node (&global_options,
19076 &global_options_set);
19078 if (fndecl && ret)
19080 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19082 if (old_optimize != new_optimize)
19083 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19086 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19088 if (old_optimize != new_optimize)
19089 cl_optimization_restore (&global_options, &global_options_set,
19090 TREE_OPTIMIZATION (old_optimize));
19091 return ret;
19094 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
19095 tri-bool options (yes, no, don't care) and the default value is
19096 DEF, determine whether to reject inlining. */
19098 static bool
19099 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
19100 int dont_care, int def)
19102 /* If the callee doesn't care, always allow inlining. */
19103 if (callee == dont_care)
19104 return true;
19106 /* If the caller doesn't care, always allow inlining. */
19107 if (caller == dont_care)
19108 return true;
19110 /* Otherwise, allow inlining if either the callee and caller values
19111 agree, or if the callee is using the default value. */
19112 return (callee == caller || callee == def);
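/* A small illustration of the rule above, using the call sites' convention
   of 2 for "don't care":
     callee == don't care          -> inlining allowed
     caller == don't care          -> inlining allowed
     caller == callee              -> inlining allowed
     caller != callee, both set    -> allowed only if callee == DEF.  */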
19115 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
19116 to inline CALLEE into CALLER based on target-specific info.
19117 Make sure that the caller and callee have compatible architectural
19118 features. Then go through the other possible target attributes
19119 and see if they can block inlining. Try not to reject always_inline
19120 callees unless they are incompatible architecturally. */
19122 static bool
19123 aarch64_can_inline_p (tree caller, tree callee)
19125 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
19126 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
19128 struct cl_target_option *caller_opts
19129 = TREE_TARGET_OPTION (caller_tree ? caller_tree
19130 : target_option_default_node);
19132 struct cl_target_option *callee_opts
19133 = TREE_TARGET_OPTION (callee_tree ? callee_tree
19134 : target_option_default_node);
19136 /* Callee's ISA flags should be a subset of the caller's. */
19137 if ((caller_opts->x_aarch64_asm_isa_flags
19138 & callee_opts->x_aarch64_asm_isa_flags)
19139 != callee_opts->x_aarch64_asm_isa_flags)
19140 return false;
19141 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
19142 != callee_opts->x_aarch64_isa_flags)
19143 return false;
19145 /* Allow non-strict aligned functions inlining into strict
19146 aligned ones. */
19147 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
19148 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
19149 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
19150 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
19151 return false;
19153 bool always_inline = lookup_attribute ("always_inline",
19154 DECL_ATTRIBUTES (callee));
19156 /* If the architectural features match up and the callee is always_inline
19157 then the other attributes don't matter. */
19158 if (always_inline)
19159 return true;
19161 if (caller_opts->x_aarch64_cmodel_var
19162 != callee_opts->x_aarch64_cmodel_var)
19163 return false;
19165 if (caller_opts->x_aarch64_tls_dialect
19166 != callee_opts->x_aarch64_tls_dialect)
19167 return false;
19169 /* Honour explicit requests to workaround errata. */
19170 if (!aarch64_tribools_ok_for_inlining_p (
19171 caller_opts->x_aarch64_fix_a53_err835769,
19172 callee_opts->x_aarch64_fix_a53_err835769,
19173 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19174 return false;
19176 if (!aarch64_tribools_ok_for_inlining_p (
19177 caller_opts->x_aarch64_fix_a53_err843419,
19178 callee_opts->x_aarch64_fix_a53_err843419,
19179 2, TARGET_FIX_ERR_A53_843419))
19180 return false;
19182 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19183      caller and callee and they don't match up, reject inlining.  */
19184 if (!aarch64_tribools_ok_for_inlining_p (
19185 caller_opts->x_flag_omit_leaf_frame_pointer,
19186 callee_opts->x_flag_omit_leaf_frame_pointer,
19187 2, 1))
19188 return false;
19190 /* If the callee has specific tuning overrides, respect them. */
19191 if (callee_opts->x_aarch64_override_tune_string != NULL
19192 && caller_opts->x_aarch64_override_tune_string == NULL)
19193 return false;
19195 /* If the user specified tuning override strings for the
19196 caller and callee and they don't match up, reject inlining.
19197 We just do a string compare here, we don't analyze the meaning
19198 of the string, as it would be too costly for little gain. */
19199 if (callee_opts->x_aarch64_override_tune_string
19200 && caller_opts->x_aarch64_override_tune_string
19201 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19202 caller_opts->x_aarch64_override_tune_string) != 0))
19203 return false;
19205 return true;
19208 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
19209    hasn't been already.  */
19211 unsigned int
19212 aarch64_tlsdesc_abi_id ()
19214 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19215 if (!tlsdesc_abi.initialized_p ())
19217 HARD_REG_SET full_reg_clobbers;
19218 CLEAR_HARD_REG_SET (full_reg_clobbers);
19219 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19220 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19221 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19222 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19223 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19225 return tlsdesc_abi.id ();
19228 /* Return true if SYMBOL_REF X binds locally. */
19230 static bool
19231 aarch64_symbol_binds_local_p (const_rtx x)
19233 return (SYMBOL_REF_DECL (x)
19234 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19235 : SYMBOL_REF_LOCAL_P (x));
19238 /* Return true if SYMBOL_REF X is thread local */
19239 static bool
19240 aarch64_tls_symbol_p (rtx x)
19242 if (! TARGET_HAVE_TLS)
19243 return false;
19245 x = strip_salt (x);
19246 if (!SYMBOL_REF_P (x))
19247 return false;
19249 return SYMBOL_REF_TLS_MODEL (x) != 0;
19252 /* Classify a TLS symbol into one of the TLS kinds. */
19253 enum aarch64_symbol_type
19254 aarch64_classify_tls_symbol (rtx x)
19256 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19258 switch (tls_kind)
19260 case TLS_MODEL_GLOBAL_DYNAMIC:
19261 case TLS_MODEL_LOCAL_DYNAMIC:
19262 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19264 case TLS_MODEL_INITIAL_EXEC:
19265 switch (aarch64_cmodel)
19267 case AARCH64_CMODEL_TINY:
19268 case AARCH64_CMODEL_TINY_PIC:
19269 return SYMBOL_TINY_TLSIE;
19270 default:
19271 return SYMBOL_SMALL_TLSIE;
19274 case TLS_MODEL_LOCAL_EXEC:
19275 if (aarch64_tls_size == 12)
19276 return SYMBOL_TLSLE12;
19277 else if (aarch64_tls_size == 24)
19278 return SYMBOL_TLSLE24;
19279 else if (aarch64_tls_size == 32)
19280 return SYMBOL_TLSLE32;
19281 else if (aarch64_tls_size == 48)
19282 return SYMBOL_TLSLE48;
19283 else
19284 gcc_unreachable ();
19286 case TLS_MODEL_EMULATED:
19287 case TLS_MODEL_NONE:
19288 return SYMBOL_FORCE_TO_MEM;
19290 default:
19291 gcc_unreachable ();
19295 /* Return the correct method for accessing X + OFFSET, where X is either
19296 a SYMBOL_REF or LABEL_REF. */
19298 enum aarch64_symbol_type
19299 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19301 x = strip_salt (x);
19303 if (LABEL_REF_P (x))
19305 switch (aarch64_cmodel)
19307 case AARCH64_CMODEL_LARGE:
19308 return SYMBOL_FORCE_TO_MEM;
19310 case AARCH64_CMODEL_TINY_PIC:
19311 case AARCH64_CMODEL_TINY:
19312 return SYMBOL_TINY_ABSOLUTE;
19314 case AARCH64_CMODEL_SMALL_SPIC:
19315 case AARCH64_CMODEL_SMALL_PIC:
19316 case AARCH64_CMODEL_SMALL:
19317 return SYMBOL_SMALL_ABSOLUTE;
19319 default:
19320 gcc_unreachable ();
19324 if (SYMBOL_REF_P (x))
19326 if (aarch64_tls_symbol_p (x))
19327 return aarch64_classify_tls_symbol (x);
19329 switch (aarch64_cmodel)
19331 case AARCH64_CMODEL_TINY_PIC:
19332 case AARCH64_CMODEL_TINY:
19333 /* With -fPIC non-local symbols use the GOT. For orthogonality
19334 always use the GOT for extern weak symbols. */
19335 if ((flag_pic || SYMBOL_REF_WEAK (x))
19336 && !aarch64_symbol_binds_local_p (x))
19337 return SYMBOL_TINY_GOT;
19339 /* When we retrieve symbol + offset address, we have to make sure
19340 the offset does not cause overflow of the final address. But
19341 	 we have no way of knowing the address of the symbol at compile time,
19342 	 so we can't accurately say if the distance between the PC and
19343 	 symbol + offset is outside the addressable range of +/-1MB in the
19344 TINY code model. So we limit the maximum offset to +/-64KB and
19345 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19346 If offset_within_block_p is true we allow larger offsets. */
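	/* A concrete example: an offset of 0x20000 (128KB) exceeds the 64KB
	   cap, so unless offset_within_block_p shows the address stays inside
	   the symbol's own block, the constant is forced to memory.  */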
19347 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19348 || offset_within_block_p (x, offset)))
19349 return SYMBOL_FORCE_TO_MEM;
19351 return SYMBOL_TINY_ABSOLUTE;
19354 case AARCH64_CMODEL_SMALL_SPIC:
19355 case AARCH64_CMODEL_SMALL_PIC:
19356 case AARCH64_CMODEL_SMALL:
19357 if ((flag_pic || SYMBOL_REF_WEAK (x))
19358 && !aarch64_symbol_binds_local_p (x))
19359 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19360 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19362 /* Same reasoning as the tiny code model, but the offset cap here is
19363 1MB, allowing +/-3.9GB for the offset to the symbol. */
19364 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19365 || offset_within_block_p (x, offset)))
19366 return SYMBOL_FORCE_TO_MEM;
19368 return SYMBOL_SMALL_ABSOLUTE;
19370 case AARCH64_CMODEL_LARGE:
19371 /* This is alright even in PIC code as the constant
19372 pool reference is always PC relative and within
19373 the same translation unit. */
19374 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19375 return SYMBOL_SMALL_ABSOLUTE;
19376 else
19377 return SYMBOL_FORCE_TO_MEM;
19379 default:
19380 gcc_unreachable ();
19384 /* By default push everything into the constant pool. */
19385 return SYMBOL_FORCE_TO_MEM;
19388 bool
19389 aarch64_constant_address_p (rtx x)
19391 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19394 bool
19395 aarch64_legitimate_pic_operand_p (rtx x)
19397 poly_int64 offset;
19398 x = strip_offset_and_salt (x, &offset);
19399 if (SYMBOL_REF_P (x))
19400 return false;
19402 return true;
19405 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19406 that should be rematerialized rather than spilled. */
19408 static bool
19409 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19411 /* Support CSE and rematerialization of common constants. */
19412 if (CONST_INT_P (x)
19413 || CONST_DOUBLE_P (x))
19414 return true;
19416 /* Only accept variable-length vector constants if they can be
19417 handled directly.
19419 ??? It would be possible (but complex) to handle rematerialization
19420 of other constants via secondary reloads. */
19421 if (!GET_MODE_SIZE (mode).is_constant ())
19422 return aarch64_simd_valid_immediate (x, NULL);
19424 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19425 least be forced to memory and loaded from there. */
19426 if (CONST_VECTOR_P (x))
19427 return !targetm.cannot_force_const_mem (mode, x);
19429 /* Do not allow vector struct mode constants for Advanced SIMD.
19430 We could support 0 and -1 easily, but they need support in
19431 aarch64-simd.md. */
19432 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19433 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19434 return false;
19436 if (GET_CODE (x) == HIGH)
19437 x = XEXP (x, 0);
19439 /* Accept polynomial constants that can be calculated by using the
19440 destination of a move as the sole temporary. Constants that
19441 require a second temporary cannot be rematerialized (they can't be
19442 forced to memory and also aren't legitimate constants). */
19443 poly_int64 offset;
19444 if (poly_int_rtx_p (x, &offset))
19445 return aarch64_offset_temporaries (false, offset) <= 1;
19447 /* If an offset is being added to something else, we need to allow the
19448 base to be moved into the destination register, meaning that there
19449 are no free temporaries for the offset. */
19450 x = strip_offset_and_salt (x, &offset);
19451 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19452 return false;
19454 /* Do not allow const (plus (anchor_symbol, const_int)). */
19455 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19456 return false;
19458 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19459 so spilling them is better than rematerialization. */
19460 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19461 return true;
19463 /* Label references are always constant. */
19464 if (LABEL_REF_P (x))
19465 return true;
19467 return false;
19471 aarch64_load_tp (rtx target)
19473 if (!target
19474 || GET_MODE (target) != Pmode
19475 || !register_operand (target, Pmode))
19476 target = gen_reg_rtx (Pmode);
19478 /* Can return in any reg. */
19479 emit_insn (gen_aarch64_load_tp_hard (target));
19480 return target;
19483 /* On AAPCS systems, this is the "struct __va_list". */
19484 static GTY(()) tree va_list_type;
19486 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19487 Return the type to use as __builtin_va_list.
19489 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19491 struct __va_list
19493 void *__stack;
19494 void *__gr_top;
19495 void *__vr_top;
19496 int __gr_offs;
19497 int __vr_offs;
19498 }; */
19500 static tree
19501 aarch64_build_builtin_va_list (void)
19503 tree va_list_name;
19504 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19506 /* Create the type. */
19507 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19508 /* Give it the required name. */
19509 va_list_name = build_decl (BUILTINS_LOCATION,
19510 TYPE_DECL,
19511 get_identifier ("__va_list"),
19512 va_list_type);
19513 DECL_ARTIFICIAL (va_list_name) = 1;
19514 TYPE_NAME (va_list_type) = va_list_name;
19515 TYPE_STUB_DECL (va_list_type) = va_list_name;
19517 /* Create the fields. */
19518 f_stack = build_decl (BUILTINS_LOCATION,
19519 FIELD_DECL, get_identifier ("__stack"),
19520 ptr_type_node);
19521 f_grtop = build_decl (BUILTINS_LOCATION,
19522 FIELD_DECL, get_identifier ("__gr_top"),
19523 ptr_type_node);
19524 f_vrtop = build_decl (BUILTINS_LOCATION,
19525 FIELD_DECL, get_identifier ("__vr_top"),
19526 ptr_type_node);
19527 f_groff = build_decl (BUILTINS_LOCATION,
19528 FIELD_DECL, get_identifier ("__gr_offs"),
19529 integer_type_node);
19530 f_vroff = build_decl (BUILTINS_LOCATION,
19531 FIELD_DECL, get_identifier ("__vr_offs"),
19532 integer_type_node);
19534   /* Tell the tree-stdarg pass about our internal offset fields.
19535      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19536      purposes, to identify whether the code is updating va_list internal
19537      offset fields in an irregular way.  */
19538 va_list_gpr_counter_field = f_groff;
19539 va_list_fpr_counter_field = f_vroff;
19541 DECL_ARTIFICIAL (f_stack) = 1;
19542 DECL_ARTIFICIAL (f_grtop) = 1;
19543 DECL_ARTIFICIAL (f_vrtop) = 1;
19544 DECL_ARTIFICIAL (f_groff) = 1;
19545 DECL_ARTIFICIAL (f_vroff) = 1;
19547 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19548 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19549 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19550 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19551 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19553 TYPE_FIELDS (va_list_type) = f_stack;
19554 DECL_CHAIN (f_stack) = f_grtop;
19555 DECL_CHAIN (f_grtop) = f_vrtop;
19556 DECL_CHAIN (f_vrtop) = f_groff;
19557 DECL_CHAIN (f_groff) = f_vroff;
19559 /* Compute its layout. */
19560 layout_type (va_list_type);
19562 return va_list_type;
19565 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19566 static void
19567 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19569 const CUMULATIVE_ARGS *cum;
19570 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19571 tree stack, grtop, vrtop, groff, vroff;
19572 tree t;
19573 int gr_save_area_size = cfun->va_list_gpr_size;
19574 int vr_save_area_size = cfun->va_list_fpr_size;
19575 int vr_offset;
19577 cum = &crtl->args.info;
19578 if (cfun->va_list_gpr_size)
19579 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19580 cfun->va_list_gpr_size);
19581 if (cfun->va_list_fpr_size)
19582 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19583 * UNITS_PER_VREG, cfun->va_list_fpr_size);
19585 if (!TARGET_FLOAT)
19587 gcc_assert (cum->aapcs_nvrn == 0);
19588 vr_save_area_size = 0;
19591 f_stack = TYPE_FIELDS (va_list_type_node);
19592 f_grtop = DECL_CHAIN (f_stack);
19593 f_vrtop = DECL_CHAIN (f_grtop);
19594 f_groff = DECL_CHAIN (f_vrtop);
19595 f_vroff = DECL_CHAIN (f_groff);
19597 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19598 NULL_TREE);
19599 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19600 NULL_TREE);
19601 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19602 NULL_TREE);
19603 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19604 NULL_TREE);
19605 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19606 NULL_TREE);
19608 /* Emit code to initialize STACK, which points to the next varargs stack
19609 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19610 by named arguments. STACK is 8-byte aligned. */
19611 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19612 if (cum->aapcs_stack_size > 0)
19613 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19614 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19615 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19617 /* Emit code to initialize GRTOP, the top of the GR save area.
19618 virtual_incoming_args_rtx should have been 16 byte aligned. */
19619 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19620 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19621 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19623 /* Emit code to initialize VRTOP, the top of the VR save area.
19624 This address is gr_save_area_bytes below GRTOP, rounded
19625 down to the next 16-byte boundary. */
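  /* For instance (assuming STACK_BOUNDARY is 128 bits, i.e. 16 bytes): a GR
     save area of 40 bytes gives vr_offset = ROUND_UP (40, 16) = 48, so VRTOP
     sits 48 bytes below GRTOP.  */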
19626 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19627 vr_offset = ROUND_UP (gr_save_area_size,
19628 STACK_BOUNDARY / BITS_PER_UNIT);
19630 if (vr_offset)
19631 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19632 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19633 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19635 /* Emit code to initialize GROFF, the offset from GRTOP of the
19636 next GPR argument. */
19637 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19638 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19639 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19641   /* Likewise emit code to initialize VROFF, the offset from VRTOP
19642      of the next VR argument.  */
19643 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19644 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19645 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19648 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19650 static tree
19651 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19652 gimple_seq *post_p ATTRIBUTE_UNUSED)
19654 tree addr;
19655 bool indirect_p;
19656 bool is_ha; /* is HFA or HVA. */
19657 bool dw_align; /* double-word align. */
19658 machine_mode ag_mode = VOIDmode;
19659 int nregs;
19660 machine_mode mode;
19662 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19663 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19664 HOST_WIDE_INT size, rsize, adjust, align;
19665 tree t, u, cond1, cond2;
19667 indirect_p = pass_va_arg_by_reference (type);
19668 if (indirect_p)
19669 type = build_pointer_type (type);
19671 mode = TYPE_MODE (type);
19673 f_stack = TYPE_FIELDS (va_list_type_node);
19674 f_grtop = DECL_CHAIN (f_stack);
19675 f_vrtop = DECL_CHAIN (f_grtop);
19676 f_groff = DECL_CHAIN (f_vrtop);
19677 f_vroff = DECL_CHAIN (f_groff);
19679 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19680 f_stack, NULL_TREE);
19681 size = int_size_in_bytes (type);
19683 unsigned int abi_break;
19684 align
19685 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
19687 dw_align = false;
19688 adjust = 0;
19689 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19690 &is_ha, false))
19692 /* No frontends can create types with variable-sized modes, so we
19693 shouldn't be asked to pass or return them. */
19694 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19696 /* TYPE passed in fp/simd registers. */
19697 if (!TARGET_FLOAT)
19698 aarch64_err_no_fpadvsimd (mode);
19700 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19701 unshare_expr (valist), f_vrtop, NULL_TREE);
19702 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19703 unshare_expr (valist), f_vroff, NULL_TREE);
19705 rsize = nregs * UNITS_PER_VREG;
19707 if (is_ha)
19709 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19710 adjust = UNITS_PER_VREG - ag_size;
19712 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19713 && size < UNITS_PER_VREG)
19715 adjust = UNITS_PER_VREG - size;
19718 else
19720 /* TYPE passed in general registers. */
19721 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19722 unshare_expr (valist), f_grtop, NULL_TREE);
19723 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19724 unshare_expr (valist), f_groff, NULL_TREE);
19725 rsize = ROUND_UP (size, UNITS_PER_WORD);
19726 nregs = rsize / UNITS_PER_WORD;
19728 if (align > 8)
19730 if (abi_break && warn_psabi)
19731 inform (input_location, "parameter passing for argument of type "
19732 "%qT changed in GCC 9.1", type);
19733 dw_align = true;
19736 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19737 && size < UNITS_PER_WORD)
19739 adjust = UNITS_PER_WORD - size;
19743 /* Get a local temporary for the field value. */
19744 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19746 /* Emit code to branch if off >= 0. */
19747 t = build2 (GE_EXPR, boolean_type_node, off,
19748 build_int_cst (TREE_TYPE (off), 0));
19749 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19751 if (dw_align)
19753 /* Emit: offs = (offs + 15) & -16. */
19754 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19755 build_int_cst (TREE_TYPE (off), 15));
19756 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19757 build_int_cst (TREE_TYPE (off), -16));
19758 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19760 else
19761 roundup = NULL;
19763 /* Update ap.__[g|v]r_offs */
19764 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19765 build_int_cst (TREE_TYPE (off), rsize));
19766 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19768 /* String up. */
19769 if (roundup)
19770 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19772 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19773 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19774 build_int_cst (TREE_TYPE (f_off), 0));
19775 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19777 /* String up: make sure the assignment happens before the use. */
19778 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19779 COND_EXPR_ELSE (cond1) = t;
19781 /* Prepare the trees handling the argument that is passed on the stack;
19782 the top level node will store in ON_STACK. */
19783 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19784 if (align > 8)
19786 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19787 t = fold_build_pointer_plus_hwi (arg, 15);
19788 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19789 build_int_cst (TREE_TYPE (t), -16));
19790 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19792 else
19793 roundup = NULL;
19794 /* Advance ap.__stack */
19795 t = fold_build_pointer_plus_hwi (arg, size + 7);
19796 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19797 build_int_cst (TREE_TYPE (t), -8));
19798 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19799 /* String up roundup and advance. */
19800 if (roundup)
19801 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19802 /* String up with arg */
19803 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19804 /* Big-endianness related address adjustment. */
19805 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19806 && size < UNITS_PER_WORD)
19808 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19809 size_int (UNITS_PER_WORD - size));
19810 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19813 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19814 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19816 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19817 t = off;
19818 if (adjust)
19819 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19820 build_int_cst (TREE_TYPE (off), adjust));
19822 t = fold_convert (sizetype, t);
19823 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19825 if (is_ha)
19827 /* type ha; // treat as "struct {ftype field[n];}"
19828 ... [computing offs]
19829 for (i = 0; i <nregs; ++i, offs += 16)
19830 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19831 return ha; */
19832 int i;
19833 tree tmp_ha, field_t, field_ptr_t;
19835 /* Declare a local variable. */
19836 tmp_ha = create_tmp_var_raw (type, "ha");
19837 gimple_add_tmp_var (tmp_ha);
19839 /* Establish the base type. */
19840 switch (ag_mode)
19842 case E_SFmode:
19843 field_t = float_type_node;
19844 field_ptr_t = float_ptr_type_node;
19845 break;
19846 case E_DFmode:
19847 field_t = double_type_node;
19848 field_ptr_t = double_ptr_type_node;
19849 break;
19850 case E_TFmode:
19851 field_t = long_double_type_node;
19852 field_ptr_t = long_double_ptr_type_node;
19853 break;
19854 case E_SDmode:
19855 field_t = dfloat32_type_node;
19856 field_ptr_t = build_pointer_type (dfloat32_type_node);
19857 break;
19858 case E_DDmode:
19859 field_t = dfloat64_type_node;
19860 field_ptr_t = build_pointer_type (dfloat64_type_node);
19861 break;
19862 case E_TDmode:
19863 field_t = dfloat128_type_node;
19864 field_ptr_t = build_pointer_type (dfloat128_type_node);
19865 break;
19866 case E_HFmode:
19867 field_t = aarch64_fp16_type_node;
19868 field_ptr_t = aarch64_fp16_ptr_type_node;
19869 break;
19870 case E_BFmode:
19871 field_t = aarch64_bf16_type_node;
19872 field_ptr_t = aarch64_bf16_ptr_type_node;
19873 break;
19874 case E_V2SImode:
19875 case E_V4SImode:
19877 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
19878 field_t = build_vector_type_for_mode (innertype, ag_mode);
19879 field_ptr_t = build_pointer_type (field_t);
19881 break;
19882 default:
19883 gcc_assert (0);
19886 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
19887 TREE_ADDRESSABLE (tmp_ha) = 1;
19888 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
19889 addr = t;
19890 t = fold_convert (field_ptr_t, addr);
19891 t = build2 (MODIFY_EXPR, field_t,
19892 build1 (INDIRECT_REF, field_t, tmp_ha),
19893 build1 (INDIRECT_REF, field_t, t));
19895 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
19896 for (i = 1; i < nregs; ++i)
19898 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
19899 u = fold_convert (field_ptr_t, addr);
19900 u = build2 (MODIFY_EXPR, field_t,
19901 build2 (MEM_REF, field_t, tmp_ha,
19902 build_int_cst (field_ptr_t,
19903 (i *
19904 int_size_in_bytes (field_t)))),
19905 build1 (INDIRECT_REF, field_t, u));
19906 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
19909 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
19910 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
19913 COND_EXPR_ELSE (cond2) = t;
19914 addr = fold_convert (build_pointer_type (type), cond1);
19915 addr = build_va_arg_indirect_ref (addr);
19917 if (indirect_p)
19918 addr = build_va_arg_indirect_ref (addr);
19920 return addr;
19923 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
19925 static void
19926 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
19927 const function_arg_info &arg,
19928 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
19930 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
19931 CUMULATIVE_ARGS local_cum;
19932 int gr_saved = cfun->va_list_gpr_size;
19933 int vr_saved = cfun->va_list_fpr_size;
19935 /* The caller has advanced CUM up to, but not beyond, the last named
19936 argument. Advance a local copy of CUM past the last "real" named
19937 argument, to find out how many registers are left over. */
19938 local_cum = *cum;
19939 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
19940 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
19942 /* Find out how many registers we need to save.
19943 Honor tree-stdarg analysis results. */
19944 if (cfun->va_list_gpr_size)
19945 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
19946 cfun->va_list_gpr_size / UNITS_PER_WORD);
19947 if (cfun->va_list_fpr_size)
19948 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
19949 cfun->va_list_fpr_size / UNITS_PER_VREG);
19951 if (!TARGET_FLOAT)
19953 gcc_assert (local_cum.aapcs_nvrn == 0);
19954 vr_saved = 0;
19957 if (!no_rtl)
19959 if (gr_saved > 0)
19961 rtx ptr, mem;
19963 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
19964 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
19965 - gr_saved * UNITS_PER_WORD);
19966 mem = gen_frame_mem (BLKmode, ptr);
19967 set_mem_alias_set (mem, get_varargs_alias_set ());
19969 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
19970 mem, gr_saved);
19972 if (vr_saved > 0)
19974 /* We can't use move_block_from_reg, because it will use
19975 the wrong mode, storing D regs only. */
19976 machine_mode mode = TImode;
19977 int off, i, vr_start;
19979 /* Set OFF to the offset from virtual_incoming_args_rtx of
19980 the first vector register. The VR save area lies below
19981 the GR one, and is aligned to 16 bytes. */
19982 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
19983 STACK_BOUNDARY / BITS_PER_UNIT);
19984 off -= vr_saved * UNITS_PER_VREG;
19986 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
19987 for (i = 0; i < vr_saved; ++i)
19989 rtx ptr, mem;
19991 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
19992 mem = gen_frame_mem (mode, ptr);
19993 set_mem_alias_set (mem, get_varargs_alias_set ());
19994 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
19995 off += UNITS_PER_VREG;
20000 /* We don't save the size into *PRETEND_SIZE because we want to avoid
20001 any complication of having crtl->args.pretend_args_size changed. */
20002 cfun->machine->frame.saved_varargs_size
20003 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
20004 STACK_BOUNDARY / BITS_PER_UNIT)
20005 + vr_saved * UNITS_PER_VREG);
20008 static void
20009 aarch64_conditional_register_usage (void)
20011 int i;
20012 if (!TARGET_FLOAT)
20014 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
20016 fixed_regs[i] = 1;
20017 call_used_regs[i] = 1;
20018 CLEAR_HARD_REG_BIT (operand_reg_set, i);
20021 if (!TARGET_SVE)
20022 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
20024 fixed_regs[i] = 1;
20025 call_used_regs[i] = 1;
20028 /* Only allow the FFR and FFRT to be accessed via special patterns. */
20029 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
20030 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
20032 /* When tracking speculation, we need a couple of call-clobbered registers
20033 to track the speculation state. It would be nice to just use
20034 IP0 and IP1, but currently there are numerous places that just
20035 assume these registers are free for other uses (eg pointer
20036 authentication). */
20037 if (aarch64_track_speculation)
20039 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
20040 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
20041 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20042 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20046 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
20048 bool
20049 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
20051 /* For records we're passed a FIELD_DECL, for arrays we're passed
20052 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
20053 const_tree type = TREE_TYPE (field_or_array);
20055 /* Assign BLKmode to anything that contains multiple SVE predicates.
20056 For structures, the "multiple" case is indicated by MODE being
20057 VOIDmode. */
20058 unsigned int num_zr, num_pr;
20059 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
20061 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
20062 return !simple_cst_equal (TYPE_SIZE (field_or_array),
20063 TYPE_SIZE (type));
20064 return mode == VOIDmode;
20067 return default_member_type_forces_blk (field_or_array, mode);
20070 /* Bitmasks that indicate whether earlier versions of GCC would have
20071 taken a different path through the ABI logic. This should result in
20072 a -Wpsabi warning if the earlier path led to a different ABI decision.
20074 WARN_PSABI_EMPTY_CXX17_BASE
20075 Indicates that the type includes an artificial empty C++17 base field
20076 that, prior to GCC 10.1, would prevent the type from being treated as
20077 an HFA or HVA. See PR94383 for details.
20079 WARN_PSABI_NO_UNIQUE_ADDRESS
20080 Indicates that the type includes an empty [[no_unique_address]] field
20081 that, prior to GCC 10.1, would prevent the type from being treated as
20082 an HFA or HVA. */
20083 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
20084 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
20085 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
20087 /* Walk down the type tree of TYPE counting consecutive base elements.
20088 If *MODEP is VOIDmode, then set it to the first valid floating point
20089 type. If a non-floating point type is found, or if a floating point
20090 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
20091 otherwise return the count in the sub-tree.
20093 The WARN_PSABI_FLAGS argument allows the caller to check whether this
20094 function has changed its behavior relative to earlier versions of GCC.
20095 Normally the argument should be nonnull and point to a zero-initialized
20096 variable. The function then records whether the ABI decision might
20097 be affected by a known fix to the ABI logic, setting the associated
20098 WARN_PSABI_* bits if so.
20100 When the argument is instead a null pointer, the function tries to
20101 simulate the behavior of GCC before all such ABI fixes were made.
20102 This is useful to check whether the function returns something
20103 different after the ABI fixes. */
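/* For example (illustrative, derived from the code below): for
   struct { float x, y, z; } this returns 3 with *MODEP set to SFmode,
   whereas struct { float f; double d; } returns -1 because the element
   modes do not match.  */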
20104 static int
20105 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
20106 unsigned int *warn_psabi_flags)
20108 machine_mode mode;
20109 HOST_WIDE_INT size;
20111 if (aarch64_sve::builtin_type_p (type))
20112 return -1;
20114 switch (TREE_CODE (type))
20116 case REAL_TYPE:
20117 mode = TYPE_MODE (type);
20118 if (mode != DFmode && mode != SFmode
20119 && mode != TFmode && mode != HFmode
20120 && mode != SDmode && mode != DDmode && mode != TDmode)
20121 return -1;
20123 if (*modep == VOIDmode)
20124 *modep = mode;
20126 if (*modep == mode)
20127 return 1;
20129 break;
20131 case COMPLEX_TYPE:
20132 mode = TYPE_MODE (TREE_TYPE (type));
20133 if (mode != DFmode && mode != SFmode
20134 && mode != TFmode && mode != HFmode)
20135 return -1;
20137 if (*modep == VOIDmode)
20138 *modep = mode;
20140 if (*modep == mode)
20141 return 2;
20143 break;
20145 case VECTOR_TYPE:
20146 /* Use V2SImode and V4SImode as representatives of all 64-bit
20147 and 128-bit vector types. */
20148 size = int_size_in_bytes (type);
20149 switch (size)
20151 case 8:
20152 mode = V2SImode;
20153 break;
20154 case 16:
20155 mode = V4SImode;
20156 break;
20157 default:
20158 return -1;
20161 if (*modep == VOIDmode)
20162 *modep = mode;
20164 /* Vector modes are considered to be opaque: two vectors are
20165 equivalent for the purposes of being homogeneous aggregates
20166 if they are the same size. */
20167 if (*modep == mode)
20168 return 1;
20170 break;
20172 case ARRAY_TYPE:
20174 int count;
20175 tree index = TYPE_DOMAIN (type);
20177 /* Can't handle incomplete types nor sizes that are not
20178 fixed. */
20179 if (!COMPLETE_TYPE_P (type)
20180 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20181 return -1;
20183 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20184 warn_psabi_flags);
20185 if (count == -1
20186 || !index
20187 || !TYPE_MAX_VALUE (index)
20188 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20189 || !TYPE_MIN_VALUE (index)
20190 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20191 || count < 0)
20192 return -1;
20194 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20195 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20197 /* There must be no padding. */
20198 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20199 count * GET_MODE_BITSIZE (*modep)))
20200 return -1;
20202 return count;
20205 case RECORD_TYPE:
20207 int count = 0;
20208 int sub_count;
20209 tree field;
20211 /* Can't handle incomplete types nor sizes that are not
20212 fixed. */
20213 if (!COMPLETE_TYPE_P (type)
20214 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20215 return -1;
20217 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20219 if (TREE_CODE (field) != FIELD_DECL)
20220 continue;
20222 if (DECL_FIELD_ABI_IGNORED (field))
20224 /* See whether this is something that earlier versions of
20225 GCC failed to ignore. */
20226 unsigned int flag;
20227 if (lookup_attribute ("no_unique_address",
20228 DECL_ATTRIBUTES (field)))
20229 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20230 else if (cxx17_empty_base_field_p (field))
20231 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20232 else
20233 /* No compatibility problem. */
20234 continue;
20236 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20237 if (warn_psabi_flags)
20239 *warn_psabi_flags |= flag;
20240 continue;
20243 /* A zero-width bitfield may affect layout in some
20244 circumstances, but adds no members. The determination
20245 of whether or not a type is an HFA is performed after
20246 layout is complete, so if the type still looks like an
20247 HFA afterwards, it is still classed as one. This is
20248 potentially an ABI break for the hard-float ABI. */
20249 else if (DECL_BIT_FIELD (field)
20250 && integer_zerop (DECL_SIZE (field)))
20252 /* Prior to GCC-12 these fields were stripped early,
20253 hiding them from the back-end entirely and
20254 resulting in the correct behaviour for argument
20255 passing. Simulate that old behaviour without
20256 generating a warning. */
20257 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20258 continue;
20259 if (warn_psabi_flags)
20261 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20262 continue;
20266 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20267 warn_psabi_flags);
20268 if (sub_count < 0)
20269 return -1;
20270 count += sub_count;
20273 /* There must be no padding. */
20274 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20275 count * GET_MODE_BITSIZE (*modep)))
20276 return -1;
20278 return count;
20281 case UNION_TYPE:
20282 case QUAL_UNION_TYPE:
20284 /* These aren't very interesting except in a degenerate case. */
20285 int count = 0;
20286 int sub_count;
20287 tree field;
20289 /* Can't handle incomplete types nor sizes that are not
20290 fixed. */
20291 if (!COMPLETE_TYPE_P (type)
20292 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20293 return -1;
20295 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20297 if (TREE_CODE (field) != FIELD_DECL)
20298 continue;
20300 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20301 warn_psabi_flags);
20302 if (sub_count < 0)
20303 return -1;
20304 count = count > sub_count ? count : sub_count;
20307 /* There must be no padding. */
20308 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20309 count * GET_MODE_BITSIZE (*modep)))
20310 return -1;
20312 return count;
20315 default:
20316 break;
20319 return -1;
20322 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20323 type as described in AAPCS64 \S 4.1.2.
20325 See the comment above aarch64_composite_type_p for the notes on MODE. */
20327 static bool
20328 aarch64_short_vector_p (const_tree type,
20329 machine_mode mode)
20331 poly_int64 size = -1;
20333 if (type && TREE_CODE (type) == VECTOR_TYPE)
20335 if (aarch64_sve::builtin_type_p (type))
20336 return false;
20337 size = int_size_in_bytes (type);
20339 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20340 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20342 /* The containing "else if" is too loose: it means that we look at TYPE
20343 if the type is a vector type (good), but that we otherwise ignore TYPE
20344 and look only at the mode. This is wrong because the type describes
20345 the language-level information whereas the mode is purely an internal
20346 GCC concept. We can therefore reach here for types that are not
20347 vectors in the AAPCS64 sense.
20349 We can't "fix" that for the traditional Advanced SIMD vector modes
20350 without breaking backwards compatibility. However, there's no such
20351 baggage for the structure modes, which were introduced in GCC 12. */
20352 if (aarch64_advsimd_struct_mode_p (mode))
20353 return false;
20355 /* For similar reasons, rely only on the type, not the mode, when
20356 processing SVE types. */
20357 if (type && aarch64_some_values_include_pst_objects_p (type))
20358 /* Leave later code to report an error if SVE is disabled. */
20359 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20360 else
20361 size = GET_MODE_SIZE (mode);
20363 if (known_eq (size, 8) || known_eq (size, 16))
20365 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20366 they are being treated as scalable AAPCS64 types. */
20367 gcc_assert (!aarch64_sve_mode_p (mode)
20368 && !aarch64_advsimd_struct_mode_p (mode));
20369 return true;
20371 return false;
20374 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20375 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20376 array types. The C99 floating-point complex types are also considered
20377 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20378 types, which are GCC extensions and out of the scope of AAPCS64, are
20379 treated as composite types here as well.
20381 Note that MODE itself is not sufficient in determining whether a type
20382 is such a composite type or not. This is because
20383 stor-layout.cc:compute_record_mode may have already changed the MODE
20384 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20385 structure with only one field may have its MODE set to the mode of the
20386 field. Also an integer mode whose size matches the size of the
20387 RECORD_TYPE type may be used to substitute the original mode
20388 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20389 solely relied on. */
20391 static bool
20392 aarch64_composite_type_p (const_tree type,
20393 machine_mode mode)
20395 if (aarch64_short_vector_p (type, mode))
20396 return false;
20398 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20399 return true;
20401 if (mode == BLKmode
20402 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20403 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20404 return true;
20406 return false;
20409 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20410 shall be passed or returned in simd/fp register(s) (providing these
20411 parameter passing registers are available).
20413 Upon successful return, *COUNT returns the number of needed registers,
20414 *BASE_MODE returns the mode of the individual register and when IS_HA
20415 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20416 floating-point aggregate or a homogeneous short-vector aggregate.
20418 SILENT_P is true if the function should refrain from reporting any
20419 diagnostics. This should only be used if the caller is certain that
20420 any ABI decisions would eventually come through this function with
20421 SILENT_P set to false. */
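/* Illustrative examples: a structure of four floats is accepted with
   *BASE_MODE == SFmode, *COUNT == 4 and *IS_HA set, while a
   double _Complex argument is accepted with *BASE_MODE == DFmode
   and *COUNT == 2.  */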
20423 static bool
20424 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20425 const_tree type,
20426 machine_mode *base_mode,
20427 int *count,
20428 bool *is_ha,
20429 bool silent_p)
20431 if (is_ha != NULL) *is_ha = false;
20433 machine_mode new_mode = VOIDmode;
20434 bool composite_p = aarch64_composite_type_p (type, mode);
20436 if ((!composite_p
20437 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20438 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
20439 || aarch64_short_vector_p (type, mode))
20441 *count = 1;
20442 new_mode = mode;
20444 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20446 if (is_ha != NULL) *is_ha = true;
20447 *count = 2;
20448 new_mode = GET_MODE_INNER (mode);
20450 else if (type && composite_p)
20452 unsigned int warn_psabi_flags = 0;
20453 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20454 &warn_psabi_flags);
20455 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20457 static unsigned last_reported_type_uid;
20458 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20459 int alt;
20460 if (!silent_p
20461 && warn_psabi
20462 && warn_psabi_flags
20463 && uid != last_reported_type_uid
20464 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20465 != ag_count))
20467 const char *url10
20468 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20469 const char *url12
20470 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20471 gcc_assert (alt == -1);
20472 last_reported_type_uid = uid;
20473 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20474 qualification. */
20475 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20476 inform (input_location, "parameter passing for argument of "
20477 "type %qT with %<[[no_unique_address]]%> members "
20478 "changed %{in GCC 10.1%}",
20479 TYPE_MAIN_VARIANT (type), url10);
20480 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20481 inform (input_location, "parameter passing for argument of "
20482 "type %qT when C++17 is enabled changed to match "
20483 "C++14 %{in GCC 10.1%}",
20484 TYPE_MAIN_VARIANT (type), url10);
20485 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20486 inform (input_location, "parameter passing for argument of "
20487 "type %qT changed %{in GCC 12.1%}",
20488 TYPE_MAIN_VARIANT (type), url12);
20491 if (is_ha != NULL) *is_ha = true;
20492 *count = ag_count;
20494 else
20495 return false;
20497 else
20498 return false;
20500 gcc_assert (!aarch64_sve_mode_p (new_mode));
20501 *base_mode = new_mode;
20502 return true;
20505 /* Implement TARGET_STRUCT_VALUE_RTX. */
20507 static rtx
20508 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20509 int incoming ATTRIBUTE_UNUSED)
20511 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20514 /* Implements target hook vector_mode_supported_p. */
20515 static bool
20516 aarch64_vector_mode_supported_p (machine_mode mode)
20518 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20519 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20522 /* Return the full-width SVE vector mode for element mode MODE, if one
20523 exists. */
20524 opt_machine_mode
20525 aarch64_full_sve_mode (scalar_mode mode)
20527 switch (mode)
20529 case E_DFmode:
20530 return VNx2DFmode;
20531 case E_SFmode:
20532 return VNx4SFmode;
20533 case E_HFmode:
20534 return VNx8HFmode;
20535 case E_BFmode:
20536 return VNx8BFmode;
20537 case E_DImode:
20538 return VNx2DImode;
20539 case E_SImode:
20540 return VNx4SImode;
20541 case E_HImode:
20542 return VNx8HImode;
20543 case E_QImode:
20544 return VNx16QImode;
20545 default:
20546 return opt_machine_mode ();
20550 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20551 if it exists. */
20552 opt_machine_mode
20553 aarch64_vq_mode (scalar_mode mode)
20555 switch (mode)
20557 case E_DFmode:
20558 return V2DFmode;
20559 case E_SFmode:
20560 return V4SFmode;
20561 case E_HFmode:
20562 return V8HFmode;
20563 case E_BFmode:
20564 return V8BFmode;
20565 case E_SImode:
20566 return V4SImode;
20567 case E_HImode:
20568 return V8HImode;
20569 case E_QImode:
20570 return V16QImode;
20571 case E_DImode:
20572 return V2DImode;
20573 default:
20574 return opt_machine_mode ();
20578 /* Return appropriate SIMD container
20579 for MODE within a vector of WIDTH bits. */
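/* For example, SImode elements give V2SImode for a 64-bit container and
   V4SImode for a 128-bit one, while the SVE path maps SImode to the
   corresponding VNx4SImode (illustrative).  */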
20580 static machine_mode
20581 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20583 if (TARGET_SVE
20584 && maybe_ne (width, 128)
20585 && known_eq (width, BITS_PER_SVE_VECTOR))
20586 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20588 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20589 if (TARGET_SIMD)
20591 if (known_eq (width, 128))
20592 return aarch64_vq_mode (mode).else_mode (word_mode);
20593 else
20594 switch (mode)
20596 case E_SFmode:
20597 return V2SFmode;
20598 case E_HFmode:
20599 return V4HFmode;
20600 case E_BFmode:
20601 return V4BFmode;
20602 case E_SImode:
20603 return V2SImode;
20604 case E_HImode:
20605 return V4HImode;
20606 case E_QImode:
20607 return V8QImode;
20608 default:
20609 break;
20612 return word_mode;
20615 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20616 and return whether the SVE mode should be preferred over the
20617 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20618 static bool
20619 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20621 /* Take into account the aarch64-autovec-preference param if non-zero. */
20622 bool only_asimd_p = aarch64_autovec_preference == 1;
20623 bool only_sve_p = aarch64_autovec_preference == 2;
20625 if (only_asimd_p)
20626 return false;
20627 if (only_sve_p)
20628 return true;
20630 /* The preference in case of a tie in costs. */
20631 bool prefer_asimd = aarch64_autovec_preference == 3;
20632 bool prefer_sve = aarch64_autovec_preference == 4;
20634 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20635 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20636 /* If the CPU information does not have an SVE width registered, use the
20637 generic poly_int comparison that prefers SVE. If a preference is
20638 explicitly requested avoid this path. */
20639 if (aarch64_tune_params.sve_width == SVE_SCALABLE
20640 && !prefer_asimd
20641 && !prefer_sve)
20642 return maybe_gt (nunits_sve, nunits_asimd);
20644 /* Otherwise estimate the runtime width of the modes involved. */
20645 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20646 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20648 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20649 is clearly wider. */
20650 if (prefer_sve)
20651 return est_sve >= est_asimd;
20652 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20653 is clearly wider. */
20654 if (prefer_asimd)
20655 return est_sve > est_asimd;
20657 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20658 return est_sve > est_asimd;
20661 /* Return 128-bit container as the preferred SIMD mode for MODE. */
20662 static machine_mode
20663 aarch64_preferred_simd_mode (scalar_mode mode)
20665 /* Take into account explicit auto-vectorization ISA preferences through
20666 aarch64_cmp_autovec_modes. */
20667 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20668 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20669 if (TARGET_SIMD)
20670 return aarch64_vq_mode (mode).else_mode (word_mode);
20671 return word_mode;
20674 /* Return a list of possible vector sizes for the vectorizer
20675 to iterate over. */
20676 static unsigned int
20677 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20679 static const machine_mode sve_modes[] = {
20680 /* Try using full vectors for all element types. */
20681 VNx16QImode,
20683 /* Try using 16-bit containers for 8-bit elements and full vectors
20684 for wider elements. */
20685 VNx8QImode,
20687 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20688 full vectors for wider elements. */
20689 VNx4QImode,
20691 /* Try using 64-bit containers for all element types. */
20692 VNx2QImode
20695 static const machine_mode advsimd_modes[] = {
20696 /* Try using 128-bit vectors for all element types. */
20697 V16QImode,
20699 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20700 for wider elements. */
20701 V8QImode,
20703 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20704 for wider elements.
20706 TODO: We could support a limited form of V4QImode too, so that
20707 we use 32-bit vectors for 8-bit elements. */
20708 V4HImode,
20710 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20711 for 64-bit elements.
20713 TODO: We could similarly support limited forms of V2QImode and V2HImode
20714 for this case. */
20715 V2SImode
20718 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20719 This is because:
20721 - If we can't use N-byte Advanced SIMD vectors then the placement
20722 doesn't matter; we'll just continue as though the Advanced SIMD
20723 entry didn't exist.
20725 - If an SVE main loop with N bytes ends up being cheaper than an
20726 Advanced SIMD main loop with N bytes then by default we'll replace
20727 the Advanced SIMD version with the SVE one.
20729 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20730 than an SVE main loop with N bytes then by default we'll try to
20731 use the SVE loop to vectorize the epilogue instead. */
20733 bool only_asimd_p = aarch64_autovec_preference == 1;
20734 bool only_sve_p = aarch64_autovec_preference == 2;
20736 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20737 unsigned int advsimd_i = 0;
20739 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20741 if (sve_i < ARRAY_SIZE (sve_modes)
20742 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20743 advsimd_modes[advsimd_i]))
20744 modes->safe_push (sve_modes[sve_i++]);
20745 else
20746 modes->safe_push (advsimd_modes[advsimd_i++]);
20748 while (sve_i < ARRAY_SIZE (sve_modes))
20749 modes->safe_push (sve_modes[sve_i++]);
20751 unsigned int flags = 0;
20752 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20753 can compare SVE against Advanced SIMD and so that we can compare
20754 multiple SVE vectorization approaches against each other. There's
20755 not really any point doing this for Advanced SIMD only, since the
20756 first mode that works should always be the best. */
20757 if (TARGET_SVE && aarch64_sve_compare_costs)
20758 flags |= VECT_COMPARE_COSTS;
20759 return flags;
20762 /* Implement TARGET_MANGLE_TYPE. */
20764 static const char *
20765 aarch64_mangle_type (const_tree type)
20767 /* The AArch64 ABI documents say that "__va_list" has to be
20768 mangled as if it is in the "std" namespace. */
20769 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20770 return "St9__va_list";
20772 /* Half-precision floating point types. */
20773 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20775 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
20776 return NULL;
20777 if (TYPE_MODE (type) == BFmode)
20778 return "u6__bf16";
20779 else
20780 return "Dh";
20783 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20784 builtin types. */
20785 if (TYPE_NAME (type) != NULL)
20787 const char *res;
20788 if ((res = aarch64_general_mangle_builtin_type (type))
20789 || (res = aarch64_sve::mangle_builtin_type (type)))
20790 return res;
20793 /* Use the default mangling. */
20794 return NULL;
20797 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20799 static bool
20800 aarch64_verify_type_context (location_t loc, type_context_kind context,
20801 const_tree type, bool silent_p)
20803 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20806 /* Find the first rtx_insn before insn that will generate an assembly
20807 instruction. */
20809 static rtx_insn *
20810 aarch64_prev_real_insn (rtx_insn *insn)
20812 if (!insn)
20813 return NULL;
20815 do
20817 insn = prev_real_insn (insn);
20819 while (insn && recog_memoized (insn) < 0);
20821 return insn;
20824 static bool
20825 is_madd_op (enum attr_type t1)
20827 unsigned int i;
20828 /* A number of these may be AArch32 only. */
20829 enum attr_type mlatypes[] = {
20830 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20831 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20832 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20835 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
20837 if (t1 == mlatypes[i])
20838 return true;
20841 return false;
20844 /* Check if there is a register dependency between a load and the insn
20845 for which we hold recog_data. */
20847 static bool
20848 dep_between_memop_and_curr (rtx memop)
20850 rtx load_reg;
20851 int opno;
20853 gcc_assert (GET_CODE (memop) == SET);
20855 if (!REG_P (SET_DEST (memop)))
20856 return false;
20858 load_reg = SET_DEST (memop);
20859 for (opno = 1; opno < recog_data.n_operands; opno++)
20861 rtx operand = recog_data.operand[opno];
20862 if (REG_P (operand)
20863 && reg_overlap_mentioned_p (load_reg, operand))
20864 return true;
20867 return false;
20871 /* When working around the Cortex-A53 erratum 835769,
20872 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
20873 instruction and has a preceding memory instruction such that a NOP
20874 should be inserted between them. */
20876 bool
20877 aarch64_madd_needs_nop (rtx_insn* insn)
20879 enum attr_type attr_type;
20880 rtx_insn *prev;
20881 rtx body;
20883 if (!TARGET_FIX_ERR_A53_835769)
20884 return false;
20886 if (!INSN_P (insn) || recog_memoized (insn) < 0)
20887 return false;
20889 attr_type = get_attr_type (insn);
20890 if (!is_madd_op (attr_type))
20891 return false;
20893 prev = aarch64_prev_real_insn (insn);
20894 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
20895 Restore recog state to INSN to avoid state corruption. */
20896 extract_constrain_insn_cached (insn);
20898 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
20899 return false;
20901 body = single_set (prev);
20903 /* If the previous insn is a memory op and there is no dependency between
20904 it and the DImode madd, emit a NOP between them. If body is NULL then we
20905 have a complex memory operation, probably a load/store pair.
20906 Be conservative for now and emit a NOP. */
20907 if (GET_MODE (recog_data.operand[0]) == DImode
20908 && (!body || !dep_between_memop_and_curr (body)))
20909 return true;
20911 return false;
20916 /* Implement FINAL_PRESCAN_INSN. */
20918 void
20919 aarch64_final_prescan_insn (rtx_insn *insn)
20921 if (aarch64_madd_needs_nop (insn))
20922 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
20926 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
20927 instruction. */
20929 bool
20930 aarch64_sve_index_immediate_p (rtx base_or_step)
20932 return (CONST_INT_P (base_or_step)
20933 && IN_RANGE (INTVAL (base_or_step), -16, 15));
20936 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
20937 when applied to mode MODE. Negate X first if NEGATE_P is true. */
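/* Illustrative values for 16-bit or wider elements: 5 and 0xff are
   accepted directly, 0x300 is accepted as 3 << 8, but 0x101 is rejected
   because it would need both the low byte and the shifted byte at once.  */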
20939 bool
20940 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
20942 rtx elt = unwrap_const_vec_duplicate (x);
20943 if (!CONST_INT_P (elt))
20944 return false;
20946 HOST_WIDE_INT val = INTVAL (elt);
20947 if (negate_p)
20948 val = -val;
20949 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
20951 if (val & 0xff)
20952 return IN_RANGE (val, 0, 0xff);
20953 return IN_RANGE (val, 0, 0xff00);
20956 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
20957 instructions when applied to mode MODE. Negate X first if NEGATE_P
20958 is true. */
20960 bool
20961 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
20963 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
20964 return false;
20966 /* After the optional negation, the immediate must be nonnegative.
20967 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
20968 instead of SQADD Zn.B, Zn.B, #129. */
20969 rtx elt = unwrap_const_vec_duplicate (x);
20970 return negate_p == (INTVAL (elt) < 0);
20973 /* Return true if X is a valid immediate operand for an SVE logical
20974 instruction such as AND. */
20976 bool
20977 aarch64_sve_bitmask_immediate_p (rtx x)
20979 rtx elt;
20981 return (const_vec_duplicate_p (x, &elt)
20982 && CONST_INT_P (elt)
20983 && aarch64_bitmask_imm (INTVAL (elt),
20984 GET_MODE_INNER (GET_MODE (x))));
20987 /* Return true if X is a valid immediate for the SVE DUP and CPY
20988 instructions. */
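/* Illustrative values: -0x80 and 0x7f use the unshifted form, 0x7f00
   uses the LSL #8 form, and 0x180 is rejected because it has nonzero
   bits in both bytes.  */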
20990 bool
20991 aarch64_sve_dup_immediate_p (rtx x)
20993 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
20994 if (!CONST_INT_P (x))
20995 return false;
20997 HOST_WIDE_INT val = INTVAL (x);
20998 if (val & 0xff)
20999 return IN_RANGE (val, -0x80, 0x7f);
21000 return IN_RANGE (val, -0x8000, 0x7f00);
21003 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
21004 SIGNED_P says whether the operand is signed rather than unsigned. */
21006 bool
21007 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
21009 x = unwrap_const_vec_duplicate (x);
21010 return (CONST_INT_P (x)
21011 && (signed_p
21012 ? IN_RANGE (INTVAL (x), -16, 15)
21013 : IN_RANGE (INTVAL (x), 0, 127)));
21016 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21017 instruction. Negate X first if NEGATE_P is true. */
21019 bool
21020 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
21022 rtx elt;
21023 REAL_VALUE_TYPE r;
21025 if (!const_vec_duplicate_p (x, &elt)
21026 || !CONST_DOUBLE_P (elt))
21027 return false;
21029 r = *CONST_DOUBLE_REAL_VALUE (elt);
21031 if (negate_p)
21032 r = real_value_negate (&r);
21034 if (real_equal (&r, &dconst1))
21035 return true;
21036 if (real_equal (&r, &dconsthalf))
21037 return true;
21038 return false;
21041 /* Return true if X is a valid immediate operand for an SVE FMUL
21042 instruction. */
21044 bool
21045 aarch64_sve_float_mul_immediate_p (rtx x)
21047 rtx elt;
21049 return (const_vec_duplicate_p (x, &elt)
21050 && CONST_DOUBLE_P (elt)
21051 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
21052 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
21055 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
21056 for the Advanced SIMD operation described by WHICH and INSN. If INFO
21057 is nonnull, use it to describe valid immediates. */
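/* For example, replicating 0x00ab0000 matches the 4-byte form as 0xab
   with LSL #16, and replicating 0xab00ab00 matches the 2-byte form as
   0xab with LSL #8 (illustrative values).  */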
21058 static bool
21059 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
21060 simd_immediate_info *info,
21061 enum simd_immediate_check which,
21062 simd_immediate_info::insn_type insn)
21064 /* Try a 4-byte immediate with LSL. */
21065 for (unsigned int shift = 0; shift < 32; shift += 8)
21066 if ((val32 & (0xff << shift)) == val32)
21068 if (info)
21069 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21070 simd_immediate_info::LSL, shift);
21071 return true;
21074 /* Try a 2-byte immediate with LSL. */
21075 unsigned int imm16 = val32 & 0xffff;
21076 if (imm16 == (val32 >> 16))
21077 for (unsigned int shift = 0; shift < 16; shift += 8)
21078 if ((imm16 & (0xff << shift)) == imm16)
21080 if (info)
21081 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
21082 simd_immediate_info::LSL, shift);
21083 return true;
21086 /* Try a 4-byte immediate with MSL, except for cases that MVN
21087 can handle. */
21088 if (which == AARCH64_CHECK_MOV)
21089 for (unsigned int shift = 8; shift < 24; shift += 8)
21091 unsigned int low = (1 << shift) - 1;
21092 if (((val32 & (0xff << shift)) | low) == val32)
21094 if (info)
21095 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21096 simd_immediate_info::MSL, shift);
21097 return true;
21101 return false;
21104 /* Return true if replicating VAL64 is a valid immediate for the
21105 Advanced SIMD operation described by WHICH. If INFO is nonnull,
21106 use it to describe valid immediates. */
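/* Illustrative values for the AARCH64_CHECK_MOV case: a VAL64 with every
   byte equal to 0x2a is accepted as a replicated QImode byte, and
   0x00ffff00ff0000ff (each byte either 0x00 or 0xff) is accepted via the
   bit-to-bytemask test below.  */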
21107 static bool
21108 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
21109 simd_immediate_info *info,
21110 enum simd_immediate_check which)
21112 unsigned int val32 = val64 & 0xffffffff;
21113 unsigned int val16 = val64 & 0xffff;
21114 unsigned int val8 = val64 & 0xff;
21116 if (val32 == (val64 >> 32))
21118 if ((which & AARCH64_CHECK_ORR) != 0
21119 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
21120 simd_immediate_info::MOV))
21121 return true;
21123 if ((which & AARCH64_CHECK_BIC) != 0
21124 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
21125 simd_immediate_info::MVN))
21126 return true;
21128 /* Try using a replicated byte. */
21129 if (which == AARCH64_CHECK_MOV
21130 && val16 == (val32 >> 16)
21131 && val8 == (val16 >> 8))
21133 if (info)
21134 *info = simd_immediate_info (QImode, val8);
21135 return true;
21139 /* Try using a bit-to-bytemask. */
21140 if (which == AARCH64_CHECK_MOV)
21142 unsigned int i;
21143 for (i = 0; i < 64; i += 8)
21145 unsigned char byte = (val64 >> i) & 0xff;
21146 if (byte != 0 && byte != 0xff)
21147 break;
21149 if (i == 64)
21151 if (info)
21152 *info = simd_immediate_info (DImode, val64);
21153 return true;
21156 return false;
21159 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21160 instruction. If INFO is nonnull, use it to describe valid immediates. */
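/* Illustrative values: replicating the byte 0x01 gives a QImode DUP of
   #1, a repeating halfword of 0x7f00 corresponds to the DUP ... LSL #8
   form, and a repeating halfword of 0x0ff0 is only representable as a
   DUPM bitmask immediate.  */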
21162 static bool
21163 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21164 simd_immediate_info *info)
21166 scalar_int_mode mode = DImode;
21167 unsigned int val32 = val64 & 0xffffffff;
21168 if (val32 == (val64 >> 32))
21170 mode = SImode;
21171 unsigned int val16 = val32 & 0xffff;
21172 if (val16 == (val32 >> 16))
21174 mode = HImode;
21175 unsigned int val8 = val16 & 0xff;
21176 if (val8 == (val16 >> 8))
21177 mode = QImode;
21180 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21181 if (IN_RANGE (val, -0x80, 0x7f))
21183 /* DUP with no shift. */
21184 if (info)
21185 *info = simd_immediate_info (mode, val);
21186 return true;
21188 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21190 /* DUP with LSL #8. */
21191 if (info)
21192 *info = simd_immediate_info (mode, val);
21193 return true;
21195 if (aarch64_bitmask_imm (val64, mode))
21197 /* DUPM. */
21198 if (info)
21199 *info = simd_immediate_info (mode, val);
21200 return true;
21202 return false;
21205 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21207 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21209 where PATTERN is the svpattern as a CONST_INT and where ZERO
21210 is a zero constant of the required PTRUE mode (which can have
21211 fewer elements than X's mode, if zero bits are significant).
21213 If so, and if INFO is nonnull, describe the immediate in INFO. */
21214 bool
21215 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21217 if (GET_CODE (x) != CONST)
21218 return false;
21220 x = XEXP (x, 0);
21221 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21222 return false;
21224 if (info)
21226 aarch64_svpattern pattern
21227 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21228 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21229 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21230 *info = simd_immediate_info (int_mode, pattern);
21232 return true;
21235 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21236 it to describe valid immediates. */
21238 static bool
21239 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21241 if (aarch64_sve_ptrue_svpattern_p (x, info))
21242 return true;
21244 if (x == CONST0_RTX (GET_MODE (x)))
21246 if (info)
21247 *info = simd_immediate_info (DImode, 0);
21248 return true;
21251 /* Analyze the value as a VNx16BImode. This should be relatively
21252 efficient, since rtx_vector_builder has enough built-in capacity
21253 to store all VLA predicate constants without needing the heap. */
21254 rtx_vector_builder builder;
21255 if (!aarch64_get_sve_pred_bits (builder, x))
21256 return false;
21258 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21259 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21261 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21262 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21263 if (pattern != AARCH64_NUM_SVPATTERNS)
21265 if (info)
21267 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21268 *info = simd_immediate_info (int_mode, pattern);
21270 return true;
21273 return false;
21276 /* Return true if OP is a valid SIMD immediate for the operation
21277 described by WHICH. If INFO is nonnull, use it to describe valid
21278 immediates. */
21279 bool
21280 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21281 enum simd_immediate_check which)
21283 machine_mode mode = GET_MODE (op);
21284 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21285 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21286 return false;
21288 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
21289 return false;
21291 if (vec_flags & VEC_SVE_PRED)
21292 return aarch64_sve_pred_valid_immediate (op, info);
21294 scalar_mode elt_mode = GET_MODE_INNER (mode);
21295 rtx base, step;
21296 unsigned int n_elts;
21297 if (CONST_VECTOR_P (op)
21298 && CONST_VECTOR_DUPLICATE_P (op))
21299 n_elts = CONST_VECTOR_NPATTERNS (op);
21300 else if ((vec_flags & VEC_SVE_DATA)
21301 && const_vec_series_p (op, &base, &step))
21303 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21304 if (!aarch64_sve_index_immediate_p (base)
21305 || !aarch64_sve_index_immediate_p (step))
21306 return false;
21308 if (info)
21310 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21311 should yield two integer values per 128-bit block, meaning
21312 that we need to treat it in the same way as V2DI and then
21313 ignore the upper 32 bits of each element. */
21314 elt_mode = aarch64_sve_container_int_mode (mode);
21315 *info = simd_immediate_info (elt_mode, base, step);
21317 return true;
21319 else if (CONST_VECTOR_P (op)
21320 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21321 /* N_ELTS set above. */;
21322 else
21323 return false;
21325 scalar_float_mode elt_float_mode;
21326 if (n_elts == 1
21327 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21329 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21330 if (aarch64_float_const_zero_rtx_p (elt)
21331 || aarch64_float_const_representable_p (elt))
21333 if (info)
21334 *info = simd_immediate_info (elt_float_mode, elt);
21335 return true;
21339 /* If all elements in an SVE vector have the same value, we have a free
21340 choice between using the element mode and using the container mode.
21341 Using the element mode means that unused parts of the vector are
21342 duplicates of the used elements, while using the container mode means
21343 that the unused parts are an extension of the used elements. Using the
21344 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21345 for its container mode VNx4SI while 0x00000101 isn't.
21347 If not all elements in an SVE vector have the same value, we need the
21348 transition from one element to the next to occur at container boundaries.
21349 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21350 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21351 scalar_int_mode elt_int_mode;
21352 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21353 elt_int_mode = aarch64_sve_container_int_mode (mode);
21354 else
21355 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21357 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21358 if (elt_size > 8)
21359 return false;
21361 /* Expand the vector constant out into a byte vector, with the least
21362 significant byte of the register first. */
21363 auto_vec<unsigned char, 16> bytes;
21364 bytes.reserve (n_elts * elt_size);
21365 for (unsigned int i = 0; i < n_elts; i++)
21367 /* The vector is provided in gcc endian-neutral fashion.
21368 For aarch64_be Advanced SIMD, it must be laid out in the vector
21369 register in reverse order. */
21370 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21371 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21373 if (elt_mode != elt_int_mode)
21374 elt = gen_lowpart (elt_int_mode, elt);
21376 if (!CONST_INT_P (elt))
21377 return false;
21379 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21380 for (unsigned int byte = 0; byte < elt_size; byte++)
21382 bytes.quick_push (elt_val & 0xff);
21383 elt_val >>= BITS_PER_UNIT;
21387 /* The immediate must repeat every eight bytes. */
21388 unsigned int nbytes = bytes.length ();
21389 for (unsigned i = 8; i < nbytes; ++i)
21390 if (bytes[i] != bytes[i - 8])
21391 return false;
21393 /* Get the repeating 8-byte value as an integer. No endian correction
21394 is needed here because bytes is already in lsb-first order. */
21395 unsigned HOST_WIDE_INT val64 = 0;
21396 for (unsigned int i = 0; i < 8; i++)
21397 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21398 << (i * BITS_PER_UNIT));
21400 if (vec_flags & VEC_SVE_DATA)
21401 return aarch64_sve_valid_immediate (val64, info);
21402 else
21403 return aarch64_advsimd_valid_immediate (val64, info, which);
21406 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21407 has a step in the range of INDEX. Return the index expression if so,
21408 otherwise return null. */
21409 rtx
21410 aarch64_check_zero_based_sve_index_immediate (rtx x)
21412 rtx base, step;
21413 if (const_vec_series_p (x, &base, &step)
21414 && base == const0_rtx
21415 && aarch64_sve_index_immediate_p (step))
21416 return step;
21417 return NULL_RTX;
21420 /* Check if immediate shift constants are within range. */
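/* For example, for V4SImode a left shift accepts 0..31 while a right
   shift accepts 1..32 (illustrative).  */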
21421 bool
21422 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21424 x = unwrap_const_vec_duplicate (x);
21425 if (!CONST_INT_P (x))
21426 return false;
21427 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21428 if (left)
21429 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21430 else
21431 return IN_RANGE (INTVAL (x), 1, bit_width);
21434 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21435 operation of width WIDTH at bit position POS. */
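/* For example, WIDTH = 8 and POS = 16 give the mask 0xff0000
   (illustrative values).  */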
21437 rtx
21438 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21440 gcc_assert (CONST_INT_P (width));
21441 gcc_assert (CONST_INT_P (pos));
21443 unsigned HOST_WIDE_INT mask
21444 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21445 return GEN_INT (mask << UINTVAL (pos));
21448 bool
21449 aarch64_mov_operand_p (rtx x, machine_mode mode)
21451 if (GET_CODE (x) == HIGH
21452 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21453 return true;
21455 if (CONST_INT_P (x))
21456 return true;
21458 if (VECTOR_MODE_P (GET_MODE (x)))
21460 /* Require predicate constants to be VNx16BI before RA, so that we
21461 force everything to have a canonical form. */
21462 if (!lra_in_progress
21463 && !reload_completed
21464 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21465 && GET_MODE (x) != VNx16BImode)
21466 return false;
21468 return aarch64_simd_valid_immediate (x, NULL);
21471 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21472 x = strip_salt (x);
21474 /* GOT accesses are valid moves. */
21475 if (SYMBOL_REF_P (x)
21476 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21477 return true;
21479 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21480 return true;
21482 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21483 return true;
21485 return aarch64_classify_symbolic_expression (x)
21486 == SYMBOL_TINY_ABSOLUTE;
21489 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21490 the constant creation. */
21492 rtx
21493 aarch64_gen_shareable_zero (machine_mode mode)
21495 machine_mode zmode = V4SImode;
21496 rtx tmp = gen_reg_rtx (zmode);
21497 emit_move_insn (tmp, CONST0_RTX (zmode));
21498 return lowpart_subreg (mode, tmp, zmode);
21501 /* Return a const_int vector of VAL. */
21502 rtx
21503 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21505 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21506 return gen_const_vec_duplicate (mode, c);
21509 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21511 bool
21512 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21514 machine_mode vmode;
21516 vmode = aarch64_simd_container_mode (mode, 64);
21517 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21518 return aarch64_simd_valid_immediate (op_v, NULL);
21521 /* Construct and return a PARALLEL RTX vector with elements numbering the
21522 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21523 the vector - from the perspective of the architecture. This does not
21524 line up with GCC's perspective on lane numbers, so we end up with
21525 different masks depending on our target endian-ness. The diagram
21526 below may help. We must draw the distinction when building masks
21527 which select one half of the vector. An instruction selecting
21528 architectural low-lanes for a big-endian target must be described using
21529 a mask selecting GCC high-lanes.
21531 Big-Endian Little-Endian
21533 GCC 0 1 2 3 3 2 1 0
21534 | x | x | x | x | | x | x | x | x |
21535 Architecture 3 2 1 0 3 2 1 0
21537 Low Mask: { 2, 3 } { 0, 1 }
21538 High Mask: { 0, 1 } { 2, 3 }
21540 MODE Is the mode of the vector and NUNITS is the number of units in it. */
21542 rtx
21543 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21545 rtvec v = rtvec_alloc (nunits / 2);
21546 int high_base = nunits / 2;
21547 int low_base = 0;
21548 int base;
21549 rtx t1;
21550 int i;
21552 if (BYTES_BIG_ENDIAN)
21553 base = high ? low_base : high_base;
21554 else
21555 base = high ? high_base : low_base;
21557 for (i = 0; i < nunits / 2; i++)
21558 RTVEC_ELT (v, i) = GEN_INT (base + i);
21560 t1 = gen_rtx_PARALLEL (mode, v);
21561 return t1;
21564 /* Check OP for validity as a PARALLEL RTX vector with elements
21565 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21566 from the perspective of the architecture. See the diagram above
21567 aarch64_simd_vect_par_cnst_half for more details. */
21569 bool
21570 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21571 bool high)
21573 int nelts;
21574 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21575 return false;
21577 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21578 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21579 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21580 int i = 0;
21582 if (count_op != count_ideal)
21583 return false;
21585 for (i = 0; i < count_ideal; i++)
21587 rtx elt_op = XVECEXP (op, 0, i);
21588 rtx elt_ideal = XVECEXP (ideal, 0, i);
21590 if (!CONST_INT_P (elt_op)
21591 || INTVAL (elt_ideal) != INTVAL (elt_op))
21592 return false;
21594 return true;
21597 /* Return a PARALLEL containing NELTS elements, with element I equal
21598 to BASE + I * STEP. */
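/* For example, NELTS = 4, BASE = 1 and STEP = 2 give a PARALLEL of the
   constants 1, 3, 5 and 7 (illustrative values).  */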
21600 rtx
21601 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21603 rtvec vec = rtvec_alloc (nelts);
21604 for (unsigned int i = 0; i < nelts; ++i)
21605 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21606 return gen_rtx_PARALLEL (VOIDmode, vec);
21609 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21610 series with step STEP. */
21612 bool
21613 aarch64_stepped_int_parallel_p (rtx op, int step)
21615 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21616 return false;
21618 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21619 for (int i = 1; i < XVECLEN (op, 0); ++i)
21620 if (!CONST_INT_P (XVECEXP (op, 0, i))
21621 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21622 return false;
21624 return true;
21627 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21628 HIGH (exclusive). */
21629 void
21630 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21631 const_tree exp)
21633 HOST_WIDE_INT lane;
21634 gcc_assert (CONST_INT_P (operand));
21635 lane = INTVAL (operand);
21637 if (lane < low || lane >= high)
21639 if (exp)
21640 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21641 lane, low, high - 1);
21642 else
21643 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21647 /* Perform endian correction on lane number N, which indexes a vector
21648 of mode MODE, and return the result as an SImode rtx. */
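/* For example, lane 0 of a 4-element vector is expected to map to lane 3
   when compiling for big-endian and to stay 0 for little-endian
   (illustrative, based on ENDIAN_LANE_N).  */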
21651 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21653 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
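/* Example, assuming the usual ENDIAN_LANE_N mapping of
BYTES_BIG_ENDIAN ? nunits - 1 - n : n: lane 1 of a V4SImode vector
becomes (const_int 1) on little-endian and (const_int 2) on big-endian. */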
21656 /* Return TRUE if OP is a valid vector addressing mode. */
21658 bool
21659 aarch64_simd_mem_operand_p (rtx op)
21661 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21662 || REG_P (XEXP (op, 0)));
21665 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21667 bool
21668 aarch64_sve_ld1r_operand_p (rtx op)
21670 struct aarch64_address_info addr;
21671 scalar_mode mode;
21673 return (MEM_P (op)
21674 && is_a <scalar_mode> (GET_MODE (op), &mode)
21675 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21676 && addr.type == ADDRESS_REG_IMM
21677 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21680 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21681 where the size of the read data is specified by `mode` and the size of the
21682 vector elements is specified by `elem_mode`. */
21683 bool
21684 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21685 scalar_mode elem_mode)
21687 struct aarch64_address_info addr;
21688 if (!MEM_P (op)
21689 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21690 return false;
21692 if (addr.type == ADDRESS_REG_IMM)
21693 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21695 if (addr.type == ADDRESS_REG_REG)
21696 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21698 return false;
21701 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21702 bool
21703 aarch64_sve_ld1rq_operand_p (rtx op)
21705 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21706 GET_MODE_INNER (GET_MODE (op)));
21709 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21710 accessing a vector where the element size is specified by `elem_mode`. */
21711 bool
21712 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21714 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21717 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21718 bool
21719 aarch64_sve_ldff1_operand_p (rtx op)
21721 if (!MEM_P (op))
21722 return false;
21724 struct aarch64_address_info addr;
21725 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21726 return false;
21728 if (addr.type == ADDRESS_REG_IMM)
21729 return known_eq (addr.const_offset, 0);
21731 return addr.type == ADDRESS_REG_REG;
21734 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21735 bool
21736 aarch64_sve_ldnf1_operand_p (rtx op)
21738 struct aarch64_address_info addr;
21740 return (MEM_P (op)
21741 && aarch64_classify_address (&addr, XEXP (op, 0),
21742 GET_MODE (op), false)
21743 && addr.type == ADDRESS_REG_IMM);
21746 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21747 The conditions for STR are the same. */
21748 bool
21749 aarch64_sve_ldr_operand_p (rtx op)
21751 struct aarch64_address_info addr;
21753 return (MEM_P (op)
21754 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21755 false, ADDR_QUERY_ANY)
21756 && addr.type == ADDRESS_REG_IMM);
21759 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21760 addressing memory of mode MODE. */
21761 bool
21762 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21764 struct aarch64_address_info addr;
21765 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21766 return false;
21768 if (addr.type == ADDRESS_REG_IMM)
21769 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21771 return addr.type == ADDRESS_REG_REG;
21774 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21775 We need to be able to access the individual pieces, so the range
21776 is different from LD[234] and ST[234]. */
21777 bool
21778 aarch64_sve_struct_memory_operand_p (rtx op)
21780 if (!MEM_P (op))
21781 return false;
21783 machine_mode mode = GET_MODE (op);
21784 struct aarch64_address_info addr;
21785 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21786 ADDR_QUERY_ANY)
21787 || addr.type != ADDRESS_REG_IMM)
21788 return false;
21790 poly_int64 first = addr.const_offset;
21791 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21792 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21793 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21796 /* Emit a register copy from operand to operand, taking care not to
21797 early-clobber source registers in the process.
21799 COUNT is the number of components into which the copy needs to be
21800 decomposed. */
21801 void
21802 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21803 unsigned int count)
21805 unsigned int i;
21806 int rdest = REGNO (operands[0]);
21807 int rsrc = REGNO (operands[1]);
21809 if (!reg_overlap_mentioned_p (operands[0], operands[1])
21810 || rdest < rsrc)
21811 for (i = 0; i < count; i++)
21812 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21813 gen_rtx_REG (mode, rsrc + i));
21814 else
21815 for (i = 0; i < count; i++)
21816 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21817 gen_rtx_REG (mode, rsrc + count - i - 1));
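/* Clarifying example, not part of the original source: with rdest == 1,
rsrc == 2 and COUNT == 2 the ranges overlap and rdest < rsrc, so the
forward loop copies reg 2 into reg 1 before reg 2 is overwritten by
reg 3; the reverse loop handles the symmetric rdest > rsrc case. */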
21820 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21821 one of the VSTRUCT modes: OI, CI, or XI. */
21823 aarch64_simd_attr_length_rglist (machine_mode mode)
21825 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21826 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21829 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21830 alignment of a vector to 128 bits. SVE predicates have an alignment of
21831 16 bits. */
21832 static HOST_WIDE_INT
21833 aarch64_simd_vector_alignment (const_tree type)
21835 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21836 be set for non-predicate vectors of booleans. Modes are the most
21837 direct way we have of identifying real SVE predicate types. */
21838 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21839 return 16;
21840 widest_int min_size
21841 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21842 return wi::umin (min_size, 128).to_uhwi ();
21845 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21846 static poly_uint64
21847 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21849 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21851 /* If the length of the vector is a fixed power of 2, try to align
21852 to that length, otherwise don't try to align at all. */
21853 HOST_WIDE_INT result;
21854 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21855 || !pow2p_hwi (result))
21856 result = TYPE_ALIGN (TREE_TYPE (type));
21857 return result;
21859 return TYPE_ALIGN (type);
21862 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21863 static bool
21864 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21866 if (is_packed)
21867 return false;
21869 /* For fixed-length vectors, check that the vectorizer will aim for
21870 full-vector alignment. This isn't true for generic GCC vectors
21871 that are wider than the ABI maximum of 128 bits. */
21872 poly_uint64 preferred_alignment =
21873 aarch64_vectorize_preferred_vector_alignment (type);
21874 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
21875 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
21876 preferred_alignment))
21877 return false;
21879 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
21880 return true;
21883 /* Return true if the vector misalignment factor is supported by the
21884 target. */
21885 static bool
21886 aarch64_builtin_support_vector_misalignment (machine_mode mode,
21887 const_tree type, int misalignment,
21888 bool is_packed)
21890 if (TARGET_SIMD && STRICT_ALIGNMENT)
21892 /* Return false if the movmisalign pattern is not supported for this mode. */
21893 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
21894 return false;
21896 /* Misalignment factor is unknown at compile time. */
21897 if (misalignment == -1)
21898 return false;
21900 return default_builtin_support_vector_misalignment (mode, type, misalignment,
21901 is_packed);
21904 /* If VALS is a vector constant that can be loaded into a register
21905 using DUP, generate instructions to do so and return an RTX to
21906 assign to the register. Otherwise return NULL_RTX. */
21907 static rtx
21908 aarch64_simd_dup_constant (rtx vals)
21910 machine_mode mode = GET_MODE (vals);
21911 machine_mode inner_mode = GET_MODE_INNER (mode);
21912 rtx x;
21914 if (!const_vec_duplicate_p (vals, &x))
21915 return NULL_RTX;
21917 /* We can load this constant by using DUP and a constant in a
21918 single ARM register. This will be cheaper than a vector
21919 load. */
21920 x = copy_to_mode_reg (inner_mode, x);
21921 return gen_vec_duplicate (mode, x);
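/* Rough sketch of the expected expansion, for illustration only: a
constant such as (const_vector:V4SI [5 5 5 5]) becomes a move of 5 into
a general register followed by (vec_duplicate:V4SI ...) of that
register, i.e. approximately "mov w0, 5; dup v0.4s, w0". */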
21925 /* Generate code to load VALS, which is a PARALLEL containing only
21926 constants (for vec_init) or CONST_VECTOR, efficiently into a
21927 register. Returns an RTX to copy into the register, or NULL_RTX
21928 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
21929 static rtx
21930 aarch64_simd_make_constant (rtx vals)
21932 machine_mode mode = GET_MODE (vals);
21933 rtx const_dup;
21934 rtx const_vec = NULL_RTX;
21935 int n_const = 0;
21936 int i;
21938 if (CONST_VECTOR_P (vals))
21939 const_vec = vals;
21940 else if (GET_CODE (vals) == PARALLEL)
21942 /* A CONST_VECTOR must contain only CONST_INTs and
21943 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
21944 Only store valid constants in a CONST_VECTOR. */
21945 int n_elts = XVECLEN (vals, 0);
21946 for (i = 0; i < n_elts; ++i)
21948 rtx x = XVECEXP (vals, 0, i);
21949 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
21950 n_const++;
21952 if (n_const == n_elts)
21953 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
21955 else
21956 gcc_unreachable ();
21958 if (const_vec != NULL_RTX
21959 && aarch64_simd_valid_immediate (const_vec, NULL))
21960 /* Load using MOVI/MVNI. */
21961 return const_vec;
21962 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
21963 /* Loaded using DUP. */
21964 return const_dup;
21965 else if (const_vec != NULL_RTX)
21966 /* Load from constant pool. We cannot take advantage of single-cycle
21967 LD1 because we need a PC-relative addressing mode. */
21968 return const_vec;
21969 else
21970 /* A PARALLEL containing something not valid inside CONST_VECTOR.
21971 We cannot construct an initializer. */
21972 return NULL_RTX;
21975 /* Expand a vector initialisation sequence, such that TARGET is
21976 initialised to contain VALS. */
21978 void
21979 aarch64_expand_vector_init (rtx target, rtx vals)
21981 machine_mode mode = GET_MODE (target);
21982 scalar_mode inner_mode = GET_MODE_INNER (mode);
21983 /* The number of vector elements. */
21984 int n_elts = XVECLEN (vals, 0);
21985 /* The number of vector elements which are not constant. */
21986 int n_var = 0;
21987 rtx any_const = NULL_RTX;
21988 /* The first element of vals. */
21989 rtx v0 = XVECEXP (vals, 0, 0);
21990 bool all_same = true;
21992 /* This is a special vec_init<M><N> where N is not an element mode but a
21993 vector mode with half the elements of M. We expect to find two entries
21994 of mode N in VALS and we must put their concatenation into TARGET. */
21995 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
21997 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
21998 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
21999 && known_eq (GET_MODE_SIZE (mode),
22000 2 * GET_MODE_SIZE (narrow_mode)));
22001 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
22002 XVECEXP (vals, 0, 0),
22003 XVECEXP (vals, 0, 1)));
22004 return;
22007 /* Count the number of variable elements to initialise. */
22008 for (int i = 0; i < n_elts; ++i)
22010 rtx x = XVECEXP (vals, 0, i);
22011 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
22012 ++n_var;
22013 else
22014 any_const = x;
22016 all_same &= rtx_equal_p (x, v0);
22019 /* No variable elements, hand off to aarch64_simd_make_constant which knows
22020 how best to handle this. */
22021 if (n_var == 0)
22023 rtx constant = aarch64_simd_make_constant (vals);
22024 if (constant != NULL_RTX)
22026 emit_move_insn (target, constant);
22027 return;
22031 /* Splat a single non-constant element if we can. */
22032 if (all_same)
22034 rtx x = copy_to_mode_reg (inner_mode, v0);
22035 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22036 return;
22039 /* Check for the interleaving case.
22040 E.g. if the initializer is (int16x8_t) {x, y, x, y, x, y, x, y},
22041 generate the following code:
22042 dup v0.h, x
22043 dup v1.h, y
22044 zip1 v0.h, v0.h, v1.h
22045 for "large enough" initializer. */
22047 if (n_elts >= 8)
22049 int i;
22050 for (i = 2; i < n_elts; i++)
22051 if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
22052 break;
22054 if (i == n_elts)
22056 machine_mode mode = GET_MODE (target);
22057 rtx dest[2];
22059 for (int i = 0; i < 2; i++)
22061 rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
22062 dest[i] = force_reg (mode, x);
22065 rtvec v = gen_rtvec (2, dest[0], dest[1]);
22066 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22067 return;
22071 enum insn_code icode = optab_handler (vec_set_optab, mode);
22072 gcc_assert (icode != CODE_FOR_nothing);
22074 /* If there are only variable elements, try to optimize
22075 the insertion using dup for the most common element
22076 followed by insertions. */
22078 /* The algorithm will fill matches[*][0] with the earliest matching element,
22079 and matches[X][1] with the count of duplicate elements (if X is the
22080 earliest element which has duplicates). */
22082 if (n_var == n_elts && n_elts <= 16)
22084 int matches[16][2] = {0};
22085 for (int i = 0; i < n_elts; i++)
22087 for (int j = 0; j <= i; j++)
22089 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
22091 matches[i][0] = j;
22092 matches[j][1]++;
22093 break;
22097 int maxelement = 0;
22098 int maxv = 0;
22099 for (int i = 0; i < n_elts; i++)
22100 if (matches[i][1] > maxv)
22102 maxelement = i;
22103 maxv = matches[i][1];
22106 /* Create a duplicate of the most common element, unless all elements
22107 are equally useless to us, in which case just immediately set the
22108 vector register using the first element. */
22110 if (maxv == 1)
22112 /* For vectors of two 64-bit elements, we can do even better. */
22113 if (n_elts == 2
22114 && (inner_mode == E_DImode
22115 || inner_mode == E_DFmode))
22118 rtx x0 = XVECEXP (vals, 0, 0);
22119 rtx x1 = XVECEXP (vals, 0, 1);
22120 /* Combine can pick up this case, but handling it directly
22121 here leaves clearer RTL.
22123 This is load_pair_lanes<mode>, and also gives us a clean-up
22124 for store_pair_lanes<mode>. */
22125 if (memory_operand (x0, inner_mode)
22126 && memory_operand (x1, inner_mode)
22127 && aarch64_mergeable_load_pair_p (mode, x0, x1))
22129 rtx t;
22130 if (inner_mode == DFmode)
22131 t = gen_load_pair_lanesdf (target, x0, x1);
22132 else
22133 t = gen_load_pair_lanesdi (target, x0, x1);
22134 emit_insn (t);
22135 return;
22138 /* The subreg-move sequence below will move into lane zero of the
22139 vector register. For big-endian we want that position to hold
22140 the last element of VALS. */
22141 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
22142 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22143 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
22145 else
22147 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22148 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22151 /* Insert the rest. */
22152 for (int i = 0; i < n_elts; i++)
22154 rtx x = XVECEXP (vals, 0, i);
22155 if (matches[i][0] == maxelement)
22156 continue;
22157 x = copy_to_mode_reg (inner_mode, x);
22158 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22160 return;
22163 /* Initialise a vector which is part-variable. We want to first try
22164 to build those lanes which are constant in the most efficient way we
22165 can. */
22166 if (n_var != n_elts)
22168 rtx copy = copy_rtx (vals);
22170 /* Load constant part of vector. We really don't care what goes into the
22171 parts we will overwrite, but we're more likely to be able to load the
22172 constant efficiently if it has fewer, larger, repeating parts
22173 (see aarch64_simd_valid_immediate). */
22174 for (int i = 0; i < n_elts; i++)
22176 rtx x = XVECEXP (vals, 0, i);
22177 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22178 continue;
22179 rtx subst = any_const;
22180 for (int bit = n_elts / 2; bit > 0; bit /= 2)
22182 /* Look in the copied vector, as more elements are const. */
22183 rtx test = XVECEXP (copy, 0, i ^ bit);
22184 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
22186 subst = test;
22187 break;
22190 XVECEXP (copy, 0, i) = subst;
22192 aarch64_expand_vector_init (target, copy);
22195 /* Insert the variable lanes directly. */
22196 for (int i = 0; i < n_elts; i++)
22198 rtx x = XVECEXP (vals, 0, i);
22199 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22200 continue;
22201 x = copy_to_mode_reg (inner_mode, x);
22202 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22206 /* Emit RTL corresponding to:
22207 insr TARGET, ELEM. */
22209 static void
22210 emit_insr (rtx target, rtx elem)
22212 machine_mode mode = GET_MODE (target);
22213 scalar_mode elem_mode = GET_MODE_INNER (mode);
22214 elem = force_reg (elem_mode, elem);
22216 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22217 gcc_assert (icode != CODE_FOR_nothing);
22218 emit_insn (GEN_FCN (icode) (target, target, elem));
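/* Note added for clarity, based on the vec_shl_insert optab semantics:
each INSR shifts the existing lanes of TARGET up by one element and
writes ELEM into lane 0, which is why the callers below insert
elements in reverse order. */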
22221 /* Subroutine of aarch64_sve_expand_vector_init for handling
22222 trailing constants.
22223 This function works as follows:
22224 (a) Create a new vector consisting of trailing constants.
22225 (b) Initialize TARGET with the constant vector using emit_move_insn.
22226 (c) Insert remaining elements in TARGET using insr.
22227 NELTS is the total number of elements in the original vector, while
22228 NELTS_REQD is the number of elements that are actually
22229 significant.
22231 ??? The heuristic used is to do the above only if the number of constants
22232 is at least half the total number of elements. May need fine-tuning. */
22234 static bool
22235 aarch64_sve_expand_vector_init_handle_trailing_constants
22236 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22238 machine_mode mode = GET_MODE (target);
22239 scalar_mode elem_mode = GET_MODE_INNER (mode);
22240 int n_trailing_constants = 0;
22242 for (int i = nelts_reqd - 1;
22243 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22244 i--)
22245 n_trailing_constants++;
22247 if (n_trailing_constants >= nelts_reqd / 2)
22249 /* Try to use the natural pattern of BUILDER to extend the trailing
22250 constant elements to a full vector. Replace any variables in the
22251 extra elements with zeros.
22253 ??? It would be better if the builders supported "don't care"
22254 elements, with the builder filling in whichever elements
22255 give the most compact encoding. */
22256 rtx_vector_builder v (mode, nelts, 1);
22257 for (int i = 0; i < nelts; i++)
22259 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22260 if (!valid_for_const_vector_p (elem_mode, x))
22261 x = CONST0_RTX (elem_mode);
22262 v.quick_push (x);
22264 rtx const_vec = v.build ();
22265 emit_move_insn (target, const_vec);
22267 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22268 emit_insr (target, builder.elt (i));
22270 return true;
22273 return false;
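/* Hypothetical example, not from the original source: for a builder
holding {x, y, 1, 2} with NELTS == NELTS_REQD == 4, the two trailing
constants meet the NELTS_REQD / 2 threshold, so TARGET is first loaded
from a constant vector derived from the trailing pattern (variable
lanes replaced by zeros) and then y and x are inserted with INSR. */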
22276 /* Subroutine of aarch64_sve_expand_vector_init.
22277 Works as follows:
22278 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22279 (b) Skip trailing elements from BUILDER, which are the same as
22280 element NELTS_REQD - 1.
22281 (c) Insert earlier elements in reverse order in TARGET using insr. */
22283 static void
22284 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22285 const rtx_vector_builder &builder,
22286 int nelts_reqd)
22288 machine_mode mode = GET_MODE (target);
22289 scalar_mode elem_mode = GET_MODE_INNER (mode);
22291 struct expand_operand ops[2];
22292 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22293 gcc_assert (icode != CODE_FOR_nothing);
22295 create_output_operand (&ops[0], target, mode);
22296 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22297 expand_insn (icode, 2, ops);
22299 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22300 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22301 emit_insr (target, builder.elt (i));
22304 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22305 when all trailing elements of builder are same.
22306 This works as follows:
22307 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22308 (b) Insert remaining elements in TARGET using insr.
22310 ??? The heuristic used is to do the above if the number of identical
22311 trailing elements is at least 3/4 of the total number of elements,
22312 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
22314 static bool
22315 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22316 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22318 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22319 if (ndups >= (3 * nelts_reqd) / 4)
22321 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22322 nelts_reqd - ndups + 1);
22323 return true;
22326 return false;
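/* Hypothetical example: for {a, b, c, c, c, c, c, c} with
NELTS_REQD == 8, NDUPS == 6 >= 3 * 8 / 4, so the expansion above
reduces to a broadcast of c followed by "insr b" and "insr a". */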
22329 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22330 of elements in BUILDER.
22332 The function tries to initialize TARGET from BUILDER if it fits one
22333 of the special cases outlined below.
22335 Failing that, the function divides BUILDER into two sub-vectors:
22336 v_even = even elements of BUILDER;
22337 v_odd = odd elements of BUILDER;
22339 and recursively calls itself with v_even and v_odd.
22341 if (recursive call succeeded for v_even or v_odd)
22342 TARGET = zip (v_even, v_odd)
22344 The function returns true if it managed to build TARGET from BUILDER
22345 with one of the special cases, false otherwise.
22347 Example: {a, 1, b, 2, c, 3, d, 4}
22349 The vector gets divided into:
22350 v_even = {a, b, c, d}
22351 v_odd = {1, 2, 3, 4}
22353 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22354 initializes tmp2 from the constant vector v_odd using emit_move_insn.
22356 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22357 no constants and only 4 elements, so we construct tmp1 from v_even using insr:
22358 tmp1 = dup(d)
22359 insr tmp1, c
22360 insr tmp1, b
22361 insr tmp1, a
22363 And finally:
22364 TARGET = zip (tmp1, tmp2)
22365 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22367 static bool
22368 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22369 int nelts, int nelts_reqd)
22371 machine_mode mode = GET_MODE (target);
22373 /* Case 1: Vector contains trailing constants. */
22375 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22376 (target, builder, nelts, nelts_reqd))
22377 return true;
22379 /* Case 2: Vector contains leading constants. */
22381 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22382 for (int i = 0; i < nelts_reqd; i++)
22383 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22384 rev_builder.finalize ();
22386 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22387 (target, rev_builder, nelts, nelts_reqd))
22389 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22390 return true;
22393 /* Case 3: Vector contains trailing same element. */
22395 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22396 (target, builder, nelts_reqd))
22397 return true;
22399 /* Case 4: Vector contains leading same element. */
22401 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22402 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22404 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22405 return true;
22408 /* Avoid recursing below 4-elements.
22409 ??? The threshold 4 may need fine-tuning. */
22411 if (nelts_reqd <= 4)
22412 return false;
22414 rtx_vector_builder v_even (mode, nelts, 1);
22415 rtx_vector_builder v_odd (mode, nelts, 1);
22417 for (int i = 0; i < nelts * 2; i += 2)
22419 v_even.quick_push (builder.elt (i));
22420 v_odd.quick_push (builder.elt (i + 1));
22423 v_even.finalize ();
22424 v_odd.finalize ();
22426 rtx tmp1 = gen_reg_rtx (mode);
22427 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22428 nelts, nelts_reqd / 2);
22430 rtx tmp2 = gen_reg_rtx (mode);
22431 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22432 nelts, nelts_reqd / 2);
22434 if (!did_even_p && !did_odd_p)
22435 return false;
22437 /* Initialize v_even and v_odd using INSR if they didn't match any of the
22438 special cases, and zip v_even and v_odd. */
22440 if (!did_even_p)
22441 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22443 if (!did_odd_p)
22444 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22446 rtvec v = gen_rtvec (2, tmp1, tmp2);
22447 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22448 return true;
22451 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22453 void
22454 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22456 machine_mode mode = GET_MODE (target);
22457 int nelts = XVECLEN (vals, 0);
22459 rtx_vector_builder v (mode, nelts, 1);
22460 for (int i = 0; i < nelts; i++)
22461 v.quick_push (XVECEXP (vals, 0, i));
22462 v.finalize ();
22464 /* If neither sub-vector of v could be initialized specially,
22465 then use INSR to insert all elements from v into TARGET.
22466 ??? This might not be optimal for vectors with large
22467 initializers like 16-element or above.
22468 For nelts < 4, it probably isn't useful to handle specially. */
22470 if (nelts < 4
22471 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22472 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22475 /* Check whether VALUE is a vector constant in which every element
22476 is either a power of 2 or a negated power of 2. If so, return
22477 a constant vector of log2s, and flip CODE between PLUS and MINUS
22478 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
22480 static rtx
22481 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22483 if (!CONST_VECTOR_P (value))
22484 return NULL_RTX;
22486 rtx_vector_builder builder;
22487 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22488 return NULL_RTX;
22490 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22491 /* 1 if the result of the multiplication must be negated,
22492 0 if it mustn't, or -1 if we don't yet care. */
22493 int negate = -1;
22494 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22495 for (unsigned int i = 0; i < encoded_nelts; ++i)
22497 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22498 if (!CONST_SCALAR_INT_P (elt))
22499 return NULL_RTX;
22500 rtx_mode_t val (elt, int_mode);
22501 wide_int pow2 = wi::neg (val);
22502 if (val != pow2)
22504 /* It matters whether we negate or not. Make that choice,
22505 and make sure that it's consistent with previous elements. */
22506 if (negate == !wi::neg_p (val))
22507 return NULL_RTX;
22508 negate = wi::neg_p (val);
22509 if (!negate)
22510 pow2 = val;
22512 /* POW2 is now the value that we want to be a power of 2. */
22513 int shift = wi::exact_log2 (pow2);
22514 if (shift < 0)
22515 return NULL_RTX;
22516 builder.quick_push (gen_int_mode (shift, int_mode));
22518 if (negate == -1)
22519 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22520 code = PLUS;
22521 else if (negate == 1)
22522 code = code == PLUS ? MINUS : PLUS;
22523 return builder.build ();
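/* Worked example (added for illustration): a CONST_VECTOR whose elements
are all 4 yields a vector of shift counts of 2 with CODE unchanged,
while a CONST_VECTOR whose elements are all -8 yields shift counts of 3
and flips CODE between PLUS and MINUS. */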
22526 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22527 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22528 operands array, in the same order as for fma_optab. Return true if
22529 the function emitted all the necessary instructions, false if the caller
22530 should generate the pattern normally with the new OPERANDS array. */
22532 bool
22533 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22535 machine_mode mode = GET_MODE (operands[0]);
22536 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22538 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22539 NULL_RTX, true, OPTAB_DIRECT);
22540 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22541 operands[3], product, operands[0], true,
22542 OPTAB_DIRECT);
22543 return true;
22545 operands[2] = force_reg (mode, operands[2]);
22546 return false;
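/* Illustrative sketch only: when the multiplier operand is the constant
vector {4, 4, ...}, the path above emits a vector shift left by 2
followed by an add (or subtract) of operands[3], rather than forcing
the constant into a register for the generic pattern. */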
22549 /* Likewise, but for a conditional pattern. */
22551 bool
22552 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22554 machine_mode mode = GET_MODE (operands[0]);
22555 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22557 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22558 NULL_RTX, true, OPTAB_DIRECT);
22559 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22560 operands[4], product, operands[5]));
22561 return true;
22563 operands[3] = force_reg (mode, operands[3]);
22564 return false;
22567 static unsigned HOST_WIDE_INT
22568 aarch64_shift_truncation_mask (machine_mode mode)
22570 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22571 return 0;
22572 return GET_MODE_UNIT_BITSIZE (mode) - 1;
22575 /* Select a format to encode pointers in exception handling data. */
22577 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22579 int type;
22580 switch (aarch64_cmodel)
22582 case AARCH64_CMODEL_TINY:
22583 case AARCH64_CMODEL_TINY_PIC:
22584 case AARCH64_CMODEL_SMALL:
22585 case AARCH64_CMODEL_SMALL_PIC:
22586 case AARCH64_CMODEL_SMALL_SPIC:
22587 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22588 for everything. */
22589 type = DW_EH_PE_sdata4;
22590 break;
22591 default:
22592 /* No assumptions here. 8-byte relocs required. */
22593 type = DW_EH_PE_sdata8;
22594 break;
22596 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22599 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22601 static void
22602 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22604 if (TREE_CODE (decl) == FUNCTION_DECL)
22606 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22607 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22609 fprintf (stream, "\t.variant_pcs\t");
22610 assemble_name (stream, name);
22611 fprintf (stream, "\n");
22616 /* The last .arch and .tune assembly strings that we printed. */
22617 static std::string aarch64_last_printed_arch_string;
22618 static std::string aarch64_last_printed_tune_string;
22620 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22621 by the function fndecl. */
22623 void
22624 aarch64_declare_function_name (FILE *stream, const char* name,
22625 tree fndecl)
22627 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22629 struct cl_target_option *targ_options;
22630 if (target_parts)
22631 targ_options = TREE_TARGET_OPTION (target_parts);
22632 else
22633 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22634 gcc_assert (targ_options);
22636 const struct processor *this_arch
22637 = aarch64_get_arch (targ_options->x_selected_arch);
22639 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
22640 std::string extension
22641 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22642 this_arch->flags);
22643 /* Only update the assembler .arch string if it is distinct from the last
22644 such string we printed. */
22645 std::string to_print = this_arch->name + extension;
22646 if (to_print != aarch64_last_printed_arch_string)
22648 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22649 aarch64_last_printed_arch_string = to_print;
22652 /* Print the cpu name we're tuning for in the comments; this might be
22653 useful to readers of the generated asm. Do it only when it changes
22654 from function to function and verbose assembly is requested. */
22655 const struct processor *this_tune
22656 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
22658 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22660 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22661 this_tune->name);
22662 aarch64_last_printed_tune_string = this_tune->name;
22665 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22667 /* Don't forget the type directive for ELF. */
22668 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22669 ASM_OUTPUT_LABEL (stream, name);
22671 cfun->machine->label_is_assembled = true;
22674 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
22675 the function label and emit a BTI if necessary. */
22677 void
22678 aarch64_print_patchable_function_entry (FILE *file,
22679 unsigned HOST_WIDE_INT patch_area_size,
22680 bool record_p)
22682 if (cfun->machine->label_is_assembled
22683 && aarch64_bti_enabled ()
22684 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
22686 /* Remove the BTI that follows the patch area and insert a new BTI
22687 before the patch area right after the function label. */
22688 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22689 if (insn
22690 && INSN_P (insn)
22691 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
22692 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
22693 delete_insn (insn);
22694 asm_fprintf (file, "\thint\t34 // bti c\n");
22697 default_print_patchable_function_entry (file, patch_area_size, record_p);
22700 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22702 void
22703 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22705 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22706 const char *value = IDENTIFIER_POINTER (target);
22707 aarch64_asm_output_variant_pcs (stream, decl, name);
22708 ASM_OUTPUT_DEF (stream, name, value);
22711 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22712 function symbol references. */
22714 void
22715 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22717 default_elf_asm_output_external (stream, decl, name);
22718 aarch64_asm_output_variant_pcs (stream, decl, name);
22721 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22722 Used to output the .cfi_b_key_frame directive when signing the current
22723 function with the B key. */
22725 void
22726 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22728 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22729 && aarch64_ra_sign_key == AARCH64_KEY_B)
22730 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22733 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22735 static void
22736 aarch64_start_file (void)
22738 struct cl_target_option *default_options
22739 = TREE_TARGET_OPTION (target_option_default_node);
22741 const struct processor *default_arch
22742 = aarch64_get_arch (default_options->x_selected_arch);
22743 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
22744 std::string extension
22745 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22746 default_arch->flags);
22748 aarch64_last_printed_arch_string = default_arch->name + extension;
22749 aarch64_last_printed_tune_string = "";
22750 asm_fprintf (asm_out_file, "\t.arch %s\n",
22751 aarch64_last_printed_arch_string.c_str ());
22753 default_file_start ();
22756 /* Emit load exclusive. */
22758 static void
22759 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22760 rtx mem, rtx model_rtx)
22762 if (mode == TImode)
22763 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22764 gen_highpart (DImode, rval),
22765 mem, model_rtx));
22766 else
22767 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22770 /* Emit store exclusive. */
22772 static void
22773 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22774 rtx mem, rtx rval, rtx model_rtx)
22776 if (mode == TImode)
22777 emit_insn (gen_aarch64_store_exclusive_pair
22778 (bval, mem, operand_subword (rval, 0, 0, TImode),
22779 operand_subword (rval, 1, 0, TImode), model_rtx));
22780 else
22781 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22784 /* Mark the previous jump instruction as unlikely. */
22786 static void
22787 aarch64_emit_unlikely_jump (rtx insn)
22789 rtx_insn *jump = emit_jump_insn (insn);
22790 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22793 /* We store the names of the various atomic helpers in a 5x5 array.
22794 Return the libcall function given MODE, MODEL and NAMES. */
22797 aarch64_atomic_ool_func (machine_mode mode, rtx model_rtx,
22798 const atomic_ool_names *names)
22800 memmodel model = memmodel_from_int (INTVAL (model_rtx));
22801 int mode_idx, model_idx;
22803 switch (mode)
22805 case E_QImode:
22806 mode_idx = 0;
22807 break;
22808 case E_HImode:
22809 mode_idx = 1;
22810 break;
22811 case E_SImode:
22812 mode_idx = 2;
22813 break;
22814 case E_DImode:
22815 mode_idx = 3;
22816 break;
22817 case E_TImode:
22818 mode_idx = 4;
22819 break;
22820 default:
22821 gcc_unreachable ();
22824 switch (model)
22826 case MEMMODEL_RELAXED:
22827 model_idx = 0;
22828 break;
22829 case MEMMODEL_CONSUME:
22830 case MEMMODEL_ACQUIRE:
22831 model_idx = 1;
22832 break;
22833 case MEMMODEL_RELEASE:
22834 model_idx = 2;
22835 break;
22836 case MEMMODEL_ACQ_REL:
22837 case MEMMODEL_SEQ_CST:
22838 model_idx = 3;
22839 break;
22840 case MEMMODEL_SYNC_ACQUIRE:
22841 case MEMMODEL_SYNC_RELEASE:
22842 case MEMMODEL_SYNC_SEQ_CST:
22843 model_idx = 4;
22844 break;
22845 default:
22846 gcc_unreachable ();
22849 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
22850 VISIBILITY_HIDDEN);
22853 #define DEF0(B, N) \
22854 { "__aarch64_" #B #N "_relax", \
22855 "__aarch64_" #B #N "_acq", \
22856 "__aarch64_" #B #N "_rel", \
22857 "__aarch64_" #B #N "_acq_rel", \
22858 "__aarch64_" #B #N "_sync" }
22860 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
22861 { NULL, NULL, NULL, NULL }
22862 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
22864 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
22865 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
22866 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
22867 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
22868 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
22869 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
22871 #undef DEF0
22872 #undef DEF4
22873 #undef DEF5
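/* Example of how the tables above are indexed, for illustration: an SImode
LDADD with MEMMODEL_ACQUIRE selects mode index 2 and model index 1,
i.e. the out-of-line helper "__aarch64_ldadd4_acq". */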
22875 /* Expand a compare and swap pattern. */
22877 void
22878 aarch64_expand_compare_and_swap (rtx operands[])
22880 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
22881 machine_mode mode, r_mode;
22883 bval = operands[0];
22884 rval = operands[1];
22885 mem = operands[2];
22886 oldval = operands[3];
22887 newval = operands[4];
22888 is_weak = operands[5];
22889 mod_s = operands[6];
22890 mod_f = operands[7];
22891 mode = GET_MODE (mem);
22893 /* Normally the succ memory model must be stronger than fail, but in the
22894 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
22895 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
22896 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
22897 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
22898 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
22900 r_mode = mode;
22901 if (mode == QImode || mode == HImode)
22903 r_mode = SImode;
22904 rval = gen_reg_rtx (r_mode);
22907 if (TARGET_LSE)
22909 /* The CAS insn requires oldval and rval overlap, but we need to
22910 have a copy of oldval saved across the operation to tell if
22911 the operation is successful. */
22912 if (reg_overlap_mentioned_p (rval, oldval))
22913 rval = copy_to_mode_reg (r_mode, oldval);
22914 else
22915 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
22917 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
22918 newval, mod_s));
22919 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22921 else if (TARGET_OUTLINE_ATOMICS)
22923 /* Oldval must satisfy compare afterward. */
22924 if (!aarch64_plus_operand (oldval, mode))
22925 oldval = force_reg (mode, oldval);
22926 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
22927 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
22928 oldval, mode, newval, mode,
22929 XEXP (mem, 0), Pmode);
22930 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22932 else
22934 /* The oldval predicate varies by mode. Test it and force to reg. */
22935 insn_code code = code_for_aarch64_compare_and_swap (mode);
22936 if (!insn_data[code].operand[2].predicate (oldval, mode))
22937 oldval = force_reg (mode, oldval);
22939 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
22940 is_weak, mod_s, mod_f));
22941 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
22944 if (r_mode != mode)
22945 rval = gen_lowpart (mode, rval);
22946 emit_move_insn (operands[1], rval);
22948 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
22949 emit_insn (gen_rtx_SET (bval, x));
22952 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
22953 sequence implementing an atomic operation. */
22955 static void
22956 aarch64_emit_post_barrier (enum memmodel model)
22958 const enum memmodel base_model = memmodel_base (model);
22960 if (is_mm_sync (model)
22961 && (base_model == MEMMODEL_ACQUIRE
22962 || base_model == MEMMODEL_ACQ_REL
22963 || base_model == MEMMODEL_SEQ_CST))
22965 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
22969 /* Split a compare and swap pattern. */
22971 void
22972 aarch64_split_compare_and_swap (rtx operands[])
22974 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
22975 gcc_assert (epilogue_completed);
22977 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
22978 machine_mode mode;
22979 bool is_weak;
22980 rtx_code_label *label1, *label2;
22981 enum memmodel model;
22983 rval = operands[0];
22984 mem = operands[1];
22985 oldval = operands[2];
22986 newval = operands[3];
22987 is_weak = (operands[4] != const0_rtx);
22988 model_rtx = operands[5];
22989 scratch = operands[7];
22990 mode = GET_MODE (mem);
22991 model = memmodel_from_int (INTVAL (model_rtx));
22993 /* When OLDVAL is zero and we want the strong version we can emit a tighter
22994 loop:
22995 .label1:
22996 LD[A]XR rval, [mem]
22997 CBNZ rval, .label2
22998 ST[L]XR scratch, newval, [mem]
22999 CBNZ scratch, .label1
23000 .label2:
23001 CMP rval, 0. */
23002 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
23003 oldval == const0_rtx && mode != TImode);
23005 label1 = NULL;
23006 if (!is_weak)
23008 label1 = gen_label_rtx ();
23009 emit_label (label1);
23011 label2 = gen_label_rtx ();
23013 /* The initial load can be relaxed for a __sync operation since a final
23014 barrier will be emitted to stop code hoisting. */
23015 if (is_mm_sync (model))
23016 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
23017 else
23018 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
23020 if (strong_zero_p)
23021 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
23022 else
23024 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23025 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
23027 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23028 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
23029 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23031 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
23033 if (!is_weak)
23035 if (aarch64_track_speculation)
23037 /* Emit an explicit compare instruction, so that we can correctly
23038 track the condition codes. */
23039 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23040 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23042 else
23043 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
23045 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23046 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
23047 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23049 else
23050 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23052 emit_label (label2);
23054 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
23055 to set the condition flags. If this is not used it will be removed by
23056 later passes. */
23057 if (strong_zero_p)
23058 aarch64_gen_compare_reg (NE, rval, const0_rtx);
23060 /* Emit any final barrier needed for a __sync operation. */
23061 if (is_mm_sync (model))
23062 aarch64_emit_post_barrier (model);
23065 /* Split an atomic operation. */
23067 void
23068 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
23069 rtx value, rtx model_rtx, rtx cond)
23071 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23072 gcc_assert (epilogue_completed);
23074 machine_mode mode = GET_MODE (mem);
23075 machine_mode wmode = (mode == DImode ? DImode : SImode);
23076 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
23077 const bool is_sync = is_mm_sync (model);
23078 rtx_code_label *label;
23079 rtx x;
23081 /* Split the atomic operation into a sequence. */
23082 label = gen_label_rtx ();
23083 emit_label (label);
23085 if (new_out)
23086 new_out = gen_lowpart (wmode, new_out);
23087 if (old_out)
23088 old_out = gen_lowpart (wmode, old_out);
23089 else
23090 old_out = new_out;
23091 value = simplify_gen_subreg (wmode, value, mode, 0);
23093 /* The initial load can be relaxed for a __sync operation since a final
23094 barrier will be emitted to stop code hoisting. */
23095 if (is_sync)
23096 aarch64_emit_load_exclusive (mode, old_out, mem,
23097 GEN_INT (MEMMODEL_RELAXED));
23098 else
23099 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
23101 switch (code)
23103 case SET:
23104 new_out = value;
23105 break;
23107 case NOT:
23108 x = gen_rtx_AND (wmode, old_out, value);
23109 emit_insn (gen_rtx_SET (new_out, x));
23110 x = gen_rtx_NOT (wmode, new_out);
23111 emit_insn (gen_rtx_SET (new_out, x));
23112 break;
23114 case MINUS:
23115 if (CONST_INT_P (value))
23117 value = GEN_INT (-UINTVAL (value));
23118 code = PLUS;
23120 /* Fall through. */
23122 default:
23123 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
23124 emit_insn (gen_rtx_SET (new_out, x));
23125 break;
23128 aarch64_emit_store_exclusive (mode, cond, mem,
23129 gen_lowpart (mode, new_out), model_rtx);
23131 if (aarch64_track_speculation)
23133 /* Emit an explicit compare instruction, so that we can correctly
23134 track the condition codes. */
23135 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
23136 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23138 else
23139 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
23141 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23142 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
23143 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23145 /* Emit any final barrier needed for a __sync operation. */
23146 if (is_sync)
23147 aarch64_emit_post_barrier (model);
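/* Rough shape of the emitted sequence, shown only as a sketch (label name
is made up): for an atomic add the split above produces
.Lretry:
ldxr old, [mem]
add new, old, value
stxr tmp, new, [mem]
cbnz tmp, .Lretry
with acquire/release variants of the exclusives chosen from MODEL_RTX. */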
23150 static void
23151 aarch64_init_libfuncs (void)
23153 /* Half-precision float operations. The compiler handles all operations
23154 with NULL libfuncs by converting to SFmode. */
23156 /* Conversions. */
23157 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
23158 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
23160 /* Arithmetic. */
23161 set_optab_libfunc (add_optab, HFmode, NULL);
23162 set_optab_libfunc (sdiv_optab, HFmode, NULL);
23163 set_optab_libfunc (smul_optab, HFmode, NULL);
23164 set_optab_libfunc (neg_optab, HFmode, NULL);
23165 set_optab_libfunc (sub_optab, HFmode, NULL);
23167 /* Comparisons. */
23168 set_optab_libfunc (eq_optab, HFmode, NULL);
23169 set_optab_libfunc (ne_optab, HFmode, NULL);
23170 set_optab_libfunc (lt_optab, HFmode, NULL);
23171 set_optab_libfunc (le_optab, HFmode, NULL);
23172 set_optab_libfunc (ge_optab, HFmode, NULL);
23173 set_optab_libfunc (gt_optab, HFmode, NULL);
23174 set_optab_libfunc (unord_optab, HFmode, NULL);
23177 /* Target hook for c_mode_for_suffix. */
23178 static machine_mode
23179 aarch64_c_mode_for_suffix (char suffix)
23181 if (suffix == 'q')
23182 return TFmode;
23184 return VOIDmode;
23187 /* We can only represent floating point constants which will fit in
23188 "quarter-precision" values. These values are characterised by
23189 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
23192 (-1)^s * (n/16) * 2^r
23194 Where:
23195 's' is the sign bit.
23196 'n' is an integer in the range 16 <= n <= 31.
23197 'r' is an integer in the range -3 <= r <= 4. */
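/* Worked example (added for illustration): 0.25 is representable as
(-1)^0 * (16/16) * 2^-2 and 1.5 as (-1)^0 * (24/16) * 2^0, whereas 0.0
and 2^-6 (which would need r == -6) are not. */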
23199 /* Return true iff X can be represented by a quarter-precision
23200 floating point immediate operand. Note, we cannot represent 0.0. */
23201 bool
23202 aarch64_float_const_representable_p (rtx x)
23204 /* This represents our current view of how many bits
23205 make up the mantissa. */
23206 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23207 int exponent;
23208 unsigned HOST_WIDE_INT mantissa, mask;
23209 REAL_VALUE_TYPE r, m;
23210 bool fail;
23212 x = unwrap_const_vec_duplicate (x);
23213 if (!CONST_DOUBLE_P (x))
23214 return false;
23216 if (GET_MODE (x) == VOIDmode
23217 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23218 return false;
23220 r = *CONST_DOUBLE_REAL_VALUE (x);
23222 /* We cannot represent infinities, NaNs or +/-zero. We won't
23223 know if we have +zero until we analyse the mantissa, but we
23224 can reject the other invalid values. */
23225 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23226 || REAL_VALUE_MINUS_ZERO (r))
23227 return false;
23229 /* Extract exponent. */
23230 r = real_value_abs (&r);
23231 exponent = REAL_EXP (&r);
23233 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23234 highest (sign) bit, with a fixed binary point at bit point_pos.
23235 m1 holds the low part of the mantissa, m2 the high part.
23236 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23237 bits for the mantissa, this can fail (low bits will be lost). */
23238 real_ldexp (&m, &r, point_pos - exponent);
23239 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23241 /* If the low part of the mantissa has bits set we cannot represent
23242 the value. */
23243 if (w.ulow () != 0)
23244 return false;
23245 /* We have rejected the lower HOST_WIDE_INT, so update our
23246 understanding of how many bits lie in the mantissa and
23247 look only at the high HOST_WIDE_INT. */
23248 mantissa = w.elt (1);
23249 point_pos -= HOST_BITS_PER_WIDE_INT;
23251 /* We can only represent values with a mantissa of the form 1.xxxx. */
23252 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23253 if ((mantissa & mask) != 0)
23254 return false;
23256 /* Having filtered unrepresentable values, we may now remove all
23257 but the highest 5 bits. */
23258 mantissa >>= point_pos - 5;
23260 /* We cannot represent the value 0.0, so reject it. This is handled
23261 elsewhere. */
23262 if (mantissa == 0)
23263 return false;
23265 /* Then, as bit 4 is always set, we can mask it off, leaving
23266 the mantissa in the range [0, 15]. */
23267 mantissa &= ~(1 << 4);
23268 gcc_assert (mantissa <= 15);
23270 /* GCC internally does not use IEEE754-like encoding (where normalized
23271 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23272 Our mantissa values are shifted 4 places to the left relative to
23273 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23274 by 5 places to correct for GCC's representation. */
23275 exponent = 5 - exponent;
23277 return (exponent >= 0 && exponent <= 7);
23280 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23281 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23282 output MOVI/MVNI, ORR or BIC immediate. */
23283 char*
23284 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23285 enum simd_immediate_check which)
23287 bool is_valid;
23288 static char templ[40];
23289 const char *mnemonic;
23290 const char *shift_op;
23291 unsigned int lane_count = 0;
23292 char element_char;
23294 struct simd_immediate_info info;
23296 /* This will return true to show const_vector is legal for use as either
23297 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23298 It will also update INFO to show how the immediate should be generated.
23299 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23300 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23301 gcc_assert (is_valid);
23303 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23304 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23306 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23308 gcc_assert (info.insn == simd_immediate_info::MOV
23309 && info.u.mov.shift == 0);
23310 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23311 move immediate path. */
23312 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23313 info.u.mov.value = GEN_INT (0);
23314 else
23316 const unsigned int buf_size = 20;
23317 char float_buf[buf_size] = {'\0'};
23318 real_to_decimal_for_mode (float_buf,
23319 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23320 buf_size, buf_size, 1, info.elt_mode);
23322 if (lane_count == 1)
23323 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23324 else
23325 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23326 lane_count, element_char, float_buf);
23327 return templ;
23331 gcc_assert (CONST_INT_P (info.u.mov.value));
23333 if (which == AARCH64_CHECK_MOV)
23335 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23336 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23337 ? "msl" : "lsl");
23338 if (lane_count == 1)
23339 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23340 mnemonic, UINTVAL (info.u.mov.value));
23341 else if (info.u.mov.shift)
23342 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23343 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23344 element_char, UINTVAL (info.u.mov.value), shift_op,
23345 info.u.mov.shift);
23346 else
23347 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23348 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23349 element_char, UINTVAL (info.u.mov.value));
23351 else
23353 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23354 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23355 if (info.u.mov.shift)
23356 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23357 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23358 element_char, UINTVAL (info.u.mov.value), "lsl",
23359 info.u.mov.shift);
23360 else
23361 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23362 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23363 element_char, UINTVAL (info.u.mov.value));
23365 return templ;
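/* Illustrative output templates (assumed, not exhaustive): a V4SImode
constant with 0x20000 in every lane gives "movi\t%0.4s, 0x2, lsl 16",
while the ORR/BIC path produces strings such as
"orr\t%0.4s, #2, lsl #16". */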
23368 char*
23369 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23372 /* If a floating point number was passed and we desire to use it in an
23373 integer mode, do the conversion to integer. */
23374 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23376 unsigned HOST_WIDE_INT ival;
23377 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23378 gcc_unreachable ();
23379 immediate = gen_int_mode (ival, mode);
23382 machine_mode vmode;
23383 /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
23384 a 128-bit vector mode. */
23385 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23387 vmode = aarch64_simd_container_mode (mode, width);
23388 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23389 return aarch64_output_simd_mov_immediate (v_op, width);
23392 /* Return the output string to use for moving immediate CONST_VECTOR
23393 into an SVE register. */
23395 char *
23396 aarch64_output_sve_mov_immediate (rtx const_vector)
23398 static char templ[40];
23399 struct simd_immediate_info info;
23400 char element_char;
23402 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23403 gcc_assert (is_valid);
23405 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23407 machine_mode vec_mode = GET_MODE (const_vector);
23408 if (aarch64_sve_pred_mode_p (vec_mode))
23410 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23411 if (info.insn == simd_immediate_info::MOV)
23413 gcc_assert (info.u.mov.value == const0_rtx);
23414 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23416 else
23418 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23419 unsigned int total_bytes;
23420 if (info.u.pattern == AARCH64_SV_ALL
23421 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23422 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23423 total_bytes / GET_MODE_SIZE (info.elt_mode));
23424 else
23425 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23426 svpattern_token (info.u.pattern));
23428 return buf;
23431 if (info.insn == simd_immediate_info::INDEX)
23433 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23434 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23435 element_char, INTVAL (info.u.index.base),
23436 INTVAL (info.u.index.step));
23437 return templ;
23440 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23442 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23443 info.u.mov.value = GEN_INT (0);
23444 else
23446 const int buf_size = 20;
23447 char float_buf[buf_size] = {};
23448 real_to_decimal_for_mode (float_buf,
23449 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23450 buf_size, buf_size, 1, info.elt_mode);
23452 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23453 element_char, float_buf);
23454 return templ;
23458 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23459 element_char, INTVAL (info.u.mov.value));
23460 return templ;
23463 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
23464 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23465 pattern. */
23467 char *
23468 aarch64_output_sve_ptrues (rtx const_unspec)
23470 static char templ[40];
23472 struct simd_immediate_info info;
23473 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23474 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23476 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23477 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23478 svpattern_token (info.u.pattern));
23479 return templ;
23482 /* Split a combine of op[1] and op[2] into op[0] into individual moves of the two halves. */
23484 void
23485 aarch64_split_combinev16qi (rtx operands[3])
23487 unsigned int dest = REGNO (operands[0]);
23488 unsigned int src1 = REGNO (operands[1]);
23489 unsigned int src2 = REGNO (operands[2]);
23490 machine_mode halfmode = GET_MODE (operands[1]);
23491 unsigned int halfregs = REG_NREGS (operands[1]);
23492 rtx destlo, desthi;
23494 gcc_assert (halfmode == V16QImode);
23496 if (src1 == dest && src2 == dest + halfregs)
23498 /* No-op move. Can't split to nothing; emit something. */
23499 emit_note (NOTE_INSN_DELETED);
23500 return;
23503 /* Preserve register attributes for variable tracking. */
23504 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23505 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23506 GET_MODE_SIZE (halfmode));
23508 /* Special case of reversed high/low parts. */
23509 if (reg_overlap_mentioned_p (operands[2], destlo)
23510 && reg_overlap_mentioned_p (operands[1], desthi))
23512 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23513 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23514 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23516 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23518 /* Try to avoid unnecessary moves if part of the result
23519 is in the right place already. */
23520 if (src1 != dest)
23521 emit_move_insn (destlo, operands[1]);
23522 if (src2 != dest + halfregs)
23523 emit_move_insn (desthi, operands[2]);
23525 else
23527 if (src2 != dest + halfregs)
23528 emit_move_insn (desthi, operands[2]);
23529 if (src1 != dest)
23530 emit_move_insn (destlo, operands[1]);
23534 /* vec_perm support. */
23536 struct expand_vec_perm_d
23538 rtx target, op0, op1;
23539 vec_perm_indices perm;
23540 machine_mode vmode;
23541 machine_mode op_mode;
23542 unsigned int vec_flags;
23543 unsigned int op_vec_flags;
23544 bool one_vector_p;
23545 bool testing_p;
23548 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23550 /* Generate a variable permutation. */
23552 static void
23553 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23555 machine_mode vmode = GET_MODE (target);
23556 bool one_vector_p = rtx_equal_p (op0, op1);
23558 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23559 gcc_checking_assert (GET_MODE (op0) == vmode);
23560 gcc_checking_assert (GET_MODE (op1) == vmode);
23561 gcc_checking_assert (GET_MODE (sel) == vmode);
23562 gcc_checking_assert (TARGET_SIMD);
23564 if (one_vector_p)
23566 if (vmode == V8QImode)
23568 /* Expand the argument to a V16QI mode by duplicating it. */
23569 rtx pair = gen_reg_rtx (V16QImode);
23570 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23571 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23573 else
23575 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23578 else
23580 rtx pair;
23582 if (vmode == V8QImode)
23584 pair = gen_reg_rtx (V16QImode);
23585 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23586 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23588 else
23590 pair = gen_reg_rtx (V2x16QImode);
23591 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23592 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23597 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23598 NELT is the number of elements in the vector. */
23600 void
23601 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23602 unsigned int nelt)
23604 machine_mode vmode = GET_MODE (target);
23605 bool one_vector_p = rtx_equal_p (op0, op1);
23606 rtx mask;
23608 /* The TBL instruction does not use a modulo index, so we must take care
23609 of that ourselves. */
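/* For example, with two V16QImode inputs (nelt == 16) a selector value of 33
must wrap to element 1 of the concatenated 32-byte table; the AND with
2 * nelt - 1 == 31 below achieves that. (Illustrative values only.) */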
23610 mask = aarch64_simd_gen_const_vector_dup (vmode,
23611 one_vector_p ? nelt - 1 : 2 * nelt - 1);
23612 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23614 /* For big-endian, we also need to reverse the index within the vector
23615 (but not which vector). */
23616 if (BYTES_BIG_ENDIAN)
23618 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23619 if (!one_vector_p)
23620 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23621 sel = expand_simple_binop (vmode, XOR, sel, mask,
23622 NULL, 0, OPTAB_LIB_WIDEN);
23624 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
23627 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23629 static void
23630 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23632 emit_insn (gen_rtx_SET (target,
23633 gen_rtx_UNSPEC (GET_MODE (target),
23634 gen_rtvec (2, op0, op1), code)));
23637 /* Expand an SVE vec_perm with the given operands. */
23639 void
23640 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23642 machine_mode data_mode = GET_MODE (target);
23643 machine_mode sel_mode = GET_MODE (sel);
23644 /* Enforced by the pattern condition. */
23645 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23647 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23648 size of the two value vectors, i.e. the upper bits of the indices
23649 are effectively ignored. SVE TBL instead produces 0 for any
23650 out-of-range indices, so we need to modulo all the vec_perm indices
23651 to ensure they are all in range. */
23652 rtx sel_reg = force_reg (sel_mode, sel);
23654 /* Check if the sel only references the first values vector. */
23655 if (CONST_VECTOR_P (sel)
23656 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23658 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23659 return;
23662 /* Check if the two values vectors are the same. */
23663 if (rtx_equal_p (op0, op1))
23665 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23666 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23667 NULL, 0, OPTAB_DIRECT);
23668 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23669 return;
23672 /* Run a TBL on each value vector and combine the results. */
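/* Out-of-range selector values yield zero from each individual TBL (see the
comment above), so combining the two partial results with an OR produces
the full two-input permutation. */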
23674 rtx res0 = gen_reg_rtx (data_mode);
23675 rtx res1 = gen_reg_rtx (data_mode);
23676 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23677 if (!CONST_VECTOR_P (sel)
23678 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23680 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23681 2 * nunits - 1);
23682 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23683 NULL, 0, OPTAB_DIRECT);
23685 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23686 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23687 NULL, 0, OPTAB_DIRECT);
23688 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23689 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23690 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23691 else
23692 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
23695 /* Recognize patterns suitable for the TRN instructions. */
23696 static bool
23697 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23699 HOST_WIDE_INT odd;
23700 poly_uint64 nelt = d->perm.length ();
23701 rtx out, in0, in1;
23702 machine_mode vmode = d->vmode;
23704 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23705 return false;
23707 /* Note that these are little-endian tests.
23708 We correct for big-endian later. */
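/* For example, on V4SImode TRN1 matches the selector {0, 4, 2, 6} and TRN2
matches {1, 5, 3, 7}: TRN1 interleaves the even-numbered elements of the
two inputs and TRN2 the odd-numbered ones. */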
23709 if (!d->perm[0].is_constant (&odd)
23710 || (odd != 0 && odd != 1)
23711 || !d->perm.series_p (0, 2, odd, 2)
23712 || !d->perm.series_p (1, 2, nelt + odd, 2))
23713 return false;
23715 /* Success! */
23716 if (d->testing_p)
23717 return true;
23719 in0 = d->op0;
23720 in1 = d->op1;
23721 /* We don't need a big-endian lane correction for SVE; see the comment
23722 at the head of aarch64-sve.md for details. */
23723 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23725 std::swap (in0, in1);
23726 odd = !odd;
23728 out = d->target;
23730 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23731 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23732 return true;
23735 /* Try to re-encode the PERM constant so it combines odd and even elements.
23736 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23737 We then retry with this new constant against the full suite of patterns. */
23738 static bool
23739 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23741 expand_vec_perm_d newd;
23742 unsigned HOST_WIDE_INT nelt;
23744 if (d->vec_flags != VEC_ADVSIMD)
23745 return false;
23747 /* Get the new mode. Always twice the size of the inner
23748 and half the elements. */
23749 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23750 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23751 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23752 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23754 if (new_mode == word_mode)
23755 return false;
23757 /* to_constant is safe since this routine is specific to Advanced SIMD
23758 vectors. */
23759 nelt = d->perm.length ().to_constant ();
23761 vec_perm_builder newpermconst;
23762 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23764 /* Convert the perm constant if we can. Require even, odd as the pairs. */
23765 for (unsigned int i = 0; i < nelt; i += 2)
23767 poly_int64 elt0 = d->perm[i];
23768 poly_int64 elt1 = d->perm[i + 1];
23769 poly_int64 newelt;
23770 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23771 return false;
23772 newpermconst.quick_push (newelt.to_constant ());
23774 newpermconst.finalize ();
23776 newd.vmode = new_mode;
23777 newd.vec_flags = VEC_ADVSIMD;
23778 newd.op_mode = newd.vmode;
23779 newd.op_vec_flags = newd.vec_flags;
23780 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23781 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23782 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23783 newd.testing_p = d->testing_p;
23784 newd.one_vector_p = d->one_vector_p;
23786 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23787 return aarch64_expand_vec_perm_const_1 (&newd);
23790 /* Recognize patterns suitable for the UZP instructions. */
23791 static bool
23792 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23794 HOST_WIDE_INT odd;
23795 rtx out, in0, in1;
23796 machine_mode vmode = d->vmode;
23798 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23799 return false;
23801 /* Note that these are little-endian tests.
23802 We correct for big-endian later. */
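/* For example, on V4SImode UZP1 matches the selector {0, 2, 4, 6} and UZP2
matches {1, 3, 5, 7}: UZP1 concatenates the even-numbered elements of the
two inputs and UZP2 the odd-numbered ones. */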
23803 if (!d->perm[0].is_constant (&odd)
23804 || (odd != 0 && odd != 1)
23805 || !d->perm.series_p (0, 1, odd, 2))
23806 return false;
23808 /* Success! */
23809 if (d->testing_p)
23810 return true;
23812 in0 = d->op0;
23813 in1 = d->op1;
23814 /* We don't need a big-endian lane correction for SVE; see the comment
23815 at the head of aarch64-sve.md for details. */
23816 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23818 std::swap (in0, in1);
23819 odd = !odd;
23821 out = d->target;
23823 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23824 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23825 return true;
23828 /* Recognize patterns suitable for the ZIP instructions. */
23829 static bool
23830 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23832 unsigned int high;
23833 poly_uint64 nelt = d->perm.length ();
23834 rtx out, in0, in1;
23835 machine_mode vmode = d->vmode;
23837 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23838 return false;
23840 /* Note that these are little-endian tests.
23841 We correct for big-endian later. */
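/* For example, on V4SImode ZIP1 matches the selector {0, 4, 1, 5} and ZIP2
matches {2, 6, 3, 7}: ZIP1 interleaves the low halves of the two inputs
and ZIP2 the high halves. */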
23842 poly_uint64 first = d->perm[0];
23843 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
23844 || !d->perm.series_p (0, 2, first, 1)
23845 || !d->perm.series_p (1, 2, first + nelt, 1))
23846 return false;
23847 high = maybe_ne (first, 0U);
23849 /* Success! */
23850 if (d->testing_p)
23851 return true;
23853 in0 = d->op0;
23854 in1 = d->op1;
23855 /* We don't need a big-endian lane correction for SVE; see the comment
23856 at the head of aarch64-sve.md for details. */
23857 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23859 std::swap (in0, in1);
23860 high = !high;
23862 out = d->target;
23864 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23865 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
23866 return true;
23869 /* Recognize patterns for the EXT insn. */
23871 static bool
23872 aarch64_evpc_ext (struct expand_vec_perm_d *d)
23874 HOST_WIDE_INT location;
23875 rtx offset;
23877 /* The first element always refers to the first vector.
23878 Check if the extracted indices are increasing by one. */
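/* For example, on V4SImode the selector {1, 2, 3, 4} starts at element 1 of
the first vector and continues into the second, which is exactly what EXT
with an offset of one element produces. */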
23879 if (d->vec_flags == VEC_SVE_PRED
23880 || !d->perm[0].is_constant (&location)
23881 || !d->perm.series_p (0, 1, location, 1))
23882 return false;
23884 /* Success! */
23885 if (d->testing_p)
23886 return true;
23888 /* The case where (location == 0) is a no-op for both big- and little-endian,
23889 and is removed by the mid-end at optimization levels -O1 and higher.
23891 We don't need a big-endian lane correction for SVE; see the comment
23892 at the head of aarch64-sve.md for details. */
23893 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
23895 /* After setup, we want the high elements of the first vector (stored
23896 at the LSB end of the register), and the low elements of the second
23897 vector (stored at the MSB end of the register). So swap. */
23898 std::swap (d->op0, d->op1);
23899 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
23900 to_constant () is safe since this is restricted to Advanced SIMD
23901 vectors. */
23902 location = d->perm.length ().to_constant () - location;
23905 offset = GEN_INT (location);
23906 emit_set_insn (d->target,
23907 gen_rtx_UNSPEC (d->vmode,
23908 gen_rtvec (3, d->op0, d->op1, offset),
23909 UNSPEC_EXT));
23910 return true;
23913 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
23914 within each 64-bit, 32-bit or 16-bit granule. */
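/* For example, on V8HImode the selector {3, 2, 1, 0, 7, 6, 5, 4} reverses
the four 16-bit elements within each 64-bit granule and so maps to REV64. */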
23916 static bool
23917 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
23919 HOST_WIDE_INT diff;
23920 unsigned int i, size, unspec;
23921 machine_mode pred_mode;
23923 if (d->vec_flags == VEC_SVE_PRED
23924 || !d->one_vector_p
23925 || !d->perm[0].is_constant (&diff)
23926 || !diff)
23927 return false;
23929 if (d->vec_flags & VEC_SVE_DATA)
23930 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
23931 else
23932 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
23933 if (size == 64)
23935 unspec = UNSPEC_REV64;
23936 pred_mode = VNx2BImode;
23938 else if (size == 32)
23940 unspec = UNSPEC_REV32;
23941 pred_mode = VNx4BImode;
23943 else if (size == 16)
23945 unspec = UNSPEC_REV16;
23946 pred_mode = VNx8BImode;
23948 else
23949 return false;
23951 unsigned int step = diff + 1;
23952 for (i = 0; i < step; ++i)
23953 if (!d->perm.series_p (i, step, diff - i, step))
23954 return false;
23956 /* Success! */
23957 if (d->testing_p)
23958 return true;
23960 if (d->vec_flags & VEC_SVE_DATA)
23962 rtx pred = aarch64_ptrue_reg (pred_mode);
23963 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
23964 d->target, pred, d->op0));
23965 return true;
23967 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
23968 emit_set_insn (d->target, src);
23969 return true;
23972 /* Recognize patterns for the REV insn, which reverses elements within
23973 a full vector. */
23975 static bool
23976 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
23978 poly_uint64 nelt = d->perm.length ();
23980 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
23981 return false;
23983 if (!d->perm.series_p (0, 1, nelt - 1, -1))
23984 return false;
23986 /* Success! */
23987 if (d->testing_p)
23988 return true;
23990 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
23991 emit_set_insn (d->target, src);
23992 return true;
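/* Recognize permutes that broadcast a single element of the input vector,
which can be implemented with a DUP (element) instruction. */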
23995 static bool
23996 aarch64_evpc_dup (struct expand_vec_perm_d *d)
23998 rtx out = d->target;
23999 rtx in0;
24000 HOST_WIDE_INT elt;
24001 machine_mode vmode = d->vmode;
24002 rtx lane;
24004 if (d->vec_flags == VEC_SVE_PRED
24005 || d->perm.encoding ().encoded_nelts () != 1
24006 || !d->perm[0].is_constant (&elt))
24007 return false;
24009 if ((d->vec_flags & VEC_SVE_DATA)
24010 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
24011 return false;
24013 /* Success! */
24014 if (d->testing_p)
24015 return true;
24017 /* The generic preparation in aarch64_expand_vec_perm_const_1
24018 swaps the operand order and the permute indices if it finds
24019 d->perm[0] to be in the second operand. Thus, we can always
24020 use d->op0 and need not do any extra arithmetic to get the
24021 correct lane number. */
24022 in0 = d->op0;
24023 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
24025 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
24026 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
24027 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
24028 return true;
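/* Implement D using an Advanced SIMD TBL instruction with a constant
selector; this is the general fallback when no single-instruction permute
pattern matches. */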
24031 static bool
24032 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
24034 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
24035 machine_mode vmode = d->vmode;
24037 /* Make sure that the indices are constant. */
24038 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
24039 for (unsigned int i = 0; i < encoded_nelts; ++i)
24040 if (!d->perm[i].is_constant ())
24041 return false;
24043 if (d->testing_p)
24044 return true;
24046 /* Generic code will try constant permutation twice. Once with the
24047 original mode and again with the elements lowered to QImode.
24048 So wait and don't do the selector expansion ourselves. */
24049 if (vmode != V8QImode && vmode != V16QImode)
24050 return false;
24052 /* to_constant is safe since this routine is specific to Advanced SIMD
24053 vectors. */
24054 unsigned int nelt = d->perm.length ().to_constant ();
24055 for (unsigned int i = 0; i < nelt; ++i)
24056 /* If big-endian and two vectors we end up with a weird mixed-endian
24057 mode on NEON. Reverse the index within each word but not the word
24058 itself. to_constant is safe because we checked is_constant above. */
24059 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
24060 ? d->perm[i].to_constant () ^ (nelt - 1)
24061 : d->perm[i].to_constant ());
24063 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
24064 sel = force_reg (vmode, sel);
24066 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
24067 return true;
24070 /* Try to implement D using an SVE TBL instruction. */
24072 static bool
24073 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
24075 unsigned HOST_WIDE_INT nelt;
24077 /* Permuting two variable-length vectors could overflow the
24078 index range. */
24079 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
24080 return false;
24082 if (d->testing_p)
24083 return true;
24085 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
24086 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
24087 if (d->one_vector_p)
24088 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
24089 else
24090 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
24091 return true;
24094 /* Try to implement D using SVE dup instruction. */
24096 static bool
24097 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
24099 if (BYTES_BIG_ENDIAN
24100 || !d->one_vector_p
24101 || d->vec_flags != VEC_SVE_DATA
24102 || d->op_vec_flags != VEC_ADVSIMD
24103 || d->perm.encoding ().nelts_per_pattern () != 1
24104 || !known_eq (d->perm.encoding ().npatterns (),
24105 GET_MODE_NUNITS (d->op_mode))
24106 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
24107 return false;
24109 int npatterns = d->perm.encoding ().npatterns ();
24110 for (int i = 0; i < npatterns; i++)
24111 if (!known_eq (d->perm[i], i))
24112 return false;
24114 if (d->testing_p)
24115 return true;
24117 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
24118 return true;
24121 /* Try to implement D using SVE SEL instruction. */
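/* For example, a permute that takes the even-numbered lanes from the first
input and the odd-numbered lanes from the second (each lane keeping its
own index) is a SEL with an alternating predicate. */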
24123 static bool
24124 aarch64_evpc_sel (struct expand_vec_perm_d *d)
24126 machine_mode vmode = d->vmode;
24127 int unit_size = GET_MODE_UNIT_SIZE (vmode);
24129 if (d->vec_flags != VEC_SVE_DATA
24130 || unit_size > 8)
24131 return false;
24133 int n_patterns = d->perm.encoding ().npatterns ();
24134 poly_int64 vec_len = d->perm.length ();
24136 for (int i = 0; i < n_patterns; ++i)
24137 if (!known_eq (d->perm[i], i)
24138 && !known_eq (d->perm[i], vec_len + i))
24139 return false;
24141 for (int i = n_patterns; i < n_patterns * 2; i++)
24142 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
24143 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
24144 return false;
24146 if (d->testing_p)
24147 return true;
24149 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
24151 /* Build a predicate that is true when op0 elements should be used. */
24152 rtx_vector_builder builder (pred_mode, n_patterns, 2);
24153 for (int i = 0; i < n_patterns * 2; i++)
24155 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
24156 : CONST0_RTX (BImode);
24157 builder.quick_push (elem);
24160 rtx const_vec = builder.build ();
24161 rtx pred = force_reg (pred_mode, const_vec);
24162 /* TARGET = PRED ? OP0 : OP1. */
24163 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
24164 return true;
24167 /* Recognize patterns suitable for the INS instructions. */
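/* For example, on V4SImode the selector {0, 1, 6, 3} is the first input with
element 2 replaced by element 2 of the second input, which INS can do in a
single instruction. */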
24168 static bool
24169 aarch64_evpc_ins (struct expand_vec_perm_d *d)
24171 machine_mode mode = d->vmode;
24172 unsigned HOST_WIDE_INT nelt;
24174 if (d->vec_flags != VEC_ADVSIMD)
24175 return false;
24177 /* to_constant is safe since this routine is specific to Advanced SIMD
24178 vectors. */
24179 nelt = d->perm.length ().to_constant ();
24180 rtx insv = d->op0;
24182 HOST_WIDE_INT idx = -1;
24184 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24186 HOST_WIDE_INT elt;
24187 if (!d->perm[i].is_constant (&elt))
24188 return false;
24189 if (elt == (HOST_WIDE_INT) i)
24190 continue;
24191 if (idx != -1)
24193 idx = -1;
24194 break;
24196 idx = i;
24199 if (idx == -1)
24201 insv = d->op1;
24202 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24204 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24205 continue;
24206 if (idx != -1)
24207 return false;
24208 idx = i;
24211 if (idx == -1)
24212 return false;
24215 if (d->testing_p)
24216 return true;
24218 gcc_assert (idx != -1);
24220 unsigned extractindex = d->perm[idx].to_constant ();
24221 rtx extractv = d->op0;
24222 if (extractindex >= nelt)
24224 extractv = d->op1;
24225 extractindex -= nelt;
24227 gcc_assert (extractindex < nelt);
24229 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24230 expand_operand ops[5];
24231 create_output_operand (&ops[0], d->target, mode);
24232 create_input_operand (&ops[1], insv, mode);
24233 create_integer_operand (&ops[2], 1 << idx);
24234 create_input_operand (&ops[3], extractv, mode);
24235 create_integer_operand (&ops[4], extractindex);
24236 expand_insn (icode, 5, ops);
24238 return true;
24241 static bool
24242 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24244 gcc_assert (d->op_mode != E_VOIDmode);
24246 /* The pattern matching functions above are written to look for a small
24247 number to begin the sequence (0, 1, N/2). If we begin with an index
24248 from the second operand, we can swap the operands. */
24249 poly_int64 nelt = d->perm.length ();
24250 if (known_ge (d->perm[0], nelt))
24252 d->perm.rotate_inputs (1);
24253 std::swap (d->op0, d->op1);
24256 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
24257 || d->vec_flags == VEC_SVE_DATA
24258 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24259 || d->vec_flags == VEC_SVE_PRED)
24260 && known_gt (nelt, 1))
24262 if (d->vmode == d->op_mode)
24264 if (aarch64_evpc_rev_local (d))
24265 return true;
24266 else if (aarch64_evpc_rev_global (d))
24267 return true;
24268 else if (aarch64_evpc_ext (d))
24269 return true;
24270 else if (aarch64_evpc_dup (d))
24271 return true;
24272 else if (aarch64_evpc_zip (d))
24273 return true;
24274 else if (aarch64_evpc_uzp (d))
24275 return true;
24276 else if (aarch64_evpc_trn (d))
24277 return true;
24278 else if (aarch64_evpc_sel (d))
24279 return true;
24280 else if (aarch64_evpc_ins (d))
24281 return true;
24282 else if (aarch64_evpc_reencode (d))
24283 return true;
24285 if (d->vec_flags == VEC_SVE_DATA)
24286 return aarch64_evpc_sve_tbl (d);
24287 else if (d->vec_flags == VEC_ADVSIMD)
24288 return aarch64_evpc_tbl (d);
24290 else
24292 if (aarch64_evpc_sve_dup (d))
24293 return true;
24296 return false;
24299 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24301 static bool
24302 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24303 rtx target, rtx op0, rtx op1,
24304 const vec_perm_indices &sel)
24306 struct expand_vec_perm_d d;
24308 /* Check whether the mask can be applied to a single vector. */
24309 if (sel.ninputs () == 1
24310 || (op0 && rtx_equal_p (op0, op1)))
24311 d.one_vector_p = true;
24312 else if (sel.all_from_input_p (0))
24314 d.one_vector_p = true;
24315 op1 = op0;
24317 else if (sel.all_from_input_p (1))
24319 d.one_vector_p = true;
24320 op0 = op1;
24322 else
24323 d.one_vector_p = false;
24325 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24326 sel.nelts_per_input ());
24327 d.vmode = vmode;
24328 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24329 d.op_mode = op_mode;
24330 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
24331 d.target = target;
24332 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
24333 if (op0 == op1)
24334 d.op1 = d.op0;
24335 else
24336 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
24337 d.testing_p = !target;
24339 if (!d.testing_p)
24340 return aarch64_expand_vec_perm_const_1 (&d);
24342 rtx_insn *last = get_last_insn ();
24343 bool ret = aarch64_expand_vec_perm_const_1 (&d);
24344 gcc_assert (last == get_last_insn ());
24346 return ret;
24349 /* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */
24351 bool
24352 aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
24353 tree vectype, wide_int cst,
24354 rtx *output, rtx in0, rtx in1)
24356 if (code != TRUNC_DIV_EXPR
24357 || !TYPE_UNSIGNED (vectype))
24358 return false;
24360 unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
24361 if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
24362 return false;
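/* Only divisors of the form 2^(precision/2) - 1 are handled below, e.g.
dividing 16-bit elements by 0xff or 32-bit elements by 0xffff. */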
24364 int pow = wi::exact_log2 (cst + 1);
24365 auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
24366 /* SVE actually has a div operator, so we may have gotten here through
24367 that route. */
24368 if (pow != (int) (element_precision (vectype) / 2)
24369 || insn_code == CODE_FOR_nothing)
24370 return false;
24372 /* We can use the optimized pattern. */
24373 if (in0 == NULL_RTX && in1 == NULL_RTX)
24374 return true;
24376 if (!VECTOR_TYPE_P (vectype))
24377 return false;
24379 gcc_assert (output);
24381 if (!*output)
24382 *output = gen_reg_rtx (TYPE_MODE (vectype));
24384 emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
24385 return true;
24388 /* Generate a byte permute mask for a register of mode MODE,
24389 which has NUNITS units. */
24391 rtx
24392 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24394 /* We have to reverse each vector because we don't have
24395 a permuted load that can reverse-load according to ABI rules. */
24396 rtx mask;
24397 rtvec v = rtvec_alloc (16);
24398 unsigned int i, j;
24399 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24401 gcc_assert (BYTES_BIG_ENDIAN);
24402 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24404 for (i = 0; i < nunits; i++)
24405 for (j = 0; j < usize; j++)
24406 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24407 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24408 return force_reg (V16QImode, mask);
24411 /* Expand an SVE integer comparison using the SVE equivalent of:
24413 (set TARGET (CODE OP0 OP1)). */
24415 void
24416 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24418 machine_mode pred_mode = GET_MODE (target);
24419 machine_mode data_mode = GET_MODE (op0);
24420 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24421 op0, op1);
24422 if (!rtx_equal_p (target, res))
24423 emit_move_insn (target, res);
24426 /* Return the UNSPEC_COND_* code for comparison CODE. */
24428 static unsigned int
24429 aarch64_unspec_cond_code (rtx_code code)
24431 switch (code)
24433 case NE:
24434 return UNSPEC_COND_FCMNE;
24435 case EQ:
24436 return UNSPEC_COND_FCMEQ;
24437 case LT:
24438 return UNSPEC_COND_FCMLT;
24439 case GT:
24440 return UNSPEC_COND_FCMGT;
24441 case LE:
24442 return UNSPEC_COND_FCMLE;
24443 case GE:
24444 return UNSPEC_COND_FCMGE;
24445 case UNORDERED:
24446 return UNSPEC_COND_FCMUO;
24447 default:
24448 gcc_unreachable ();
24452 /* Emit:
24454 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24456 where <X> is the operation associated with comparison CODE.
24457 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24459 static void
24460 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24461 bool known_ptrue_p, rtx op0, rtx op1)
24463 rtx flag = gen_int_mode (known_ptrue_p, SImode);
24464 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24465 gen_rtvec (4, pred, flag, op0, op1),
24466 aarch64_unspec_cond_code (code));
24467 emit_set_insn (target, unspec);
24470 /* Emit the SVE equivalent of:
24472 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24473 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24474 (set TARGET (ior:PRED_MODE TMP1 TMP2))
24476 where <Xi> is the operation associated with comparison CODEi.
24477 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24479 static void
24480 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24481 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24483 machine_mode pred_mode = GET_MODE (pred);
24484 rtx tmp1 = gen_reg_rtx (pred_mode);
24485 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24486 rtx tmp2 = gen_reg_rtx (pred_mode);
24487 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24488 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24491 /* Emit the SVE equivalent of:
24493 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24494 (set TARGET (not TMP))
24496 where <X> is the operation associated with comparison CODE.
24497 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24499 static void
24500 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24501 bool known_ptrue_p, rtx op0, rtx op1)
24503 machine_mode pred_mode = GET_MODE (pred);
24504 rtx tmp = gen_reg_rtx (pred_mode);
24505 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24506 aarch64_emit_unop (target, one_cmpl_optab, tmp);
24509 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24511 (set TARGET (CODE OP0 OP1))
24513 If CAN_INVERT_P is true, the caller can also handle inverted results;
24514 return true if the result is in fact inverted. */
24516 bool
24517 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24518 rtx op0, rtx op1, bool can_invert_p)
24520 machine_mode pred_mode = GET_MODE (target);
24521 machine_mode data_mode = GET_MODE (op0);
24523 rtx ptrue = aarch64_ptrue_reg (pred_mode);
24524 switch (code)
24526 case UNORDERED:
24527 /* UNORDERED has no immediate form. */
24528 op1 = force_reg (data_mode, op1);
24529 /* fall through */
24530 case LT:
24531 case LE:
24532 case GT:
24533 case GE:
24534 case EQ:
24535 case NE:
24537 /* There is native support for the comparison. */
24538 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24539 return false;
24542 case LTGT:
24543 /* This is a trapping operation (LT or GT). */
24544 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24545 return false;
24547 case UNEQ:
24548 if (!flag_trapping_math)
24550 /* This would trap for signaling NaNs. */
24551 op1 = force_reg (data_mode, op1);
24552 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24553 ptrue, true, op0, op1);
24554 return false;
24556 /* fall through */
24557 case UNLT:
24558 case UNLE:
24559 case UNGT:
24560 case UNGE:
24561 if (flag_trapping_math)
24563 /* Work out which elements are ordered. */
24564 rtx ordered = gen_reg_rtx (pred_mode);
24565 op1 = force_reg (data_mode, op1);
24566 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24567 ptrue, true, op0, op1);
24569 /* Test the opposite condition for the ordered elements,
24570 then invert the result. */
24571 if (code == UNEQ)
24572 code = NE;
24573 else
24574 code = reverse_condition_maybe_unordered (code);
24575 if (can_invert_p)
24577 aarch64_emit_sve_fp_cond (target, code,
24578 ordered, false, op0, op1);
24579 return true;
24581 aarch64_emit_sve_invert_fp_cond (target, code,
24582 ordered, false, op0, op1);
24583 return false;
24585 break;
24587 case ORDERED:
24588 /* ORDERED has no immediate form. */
24589 op1 = force_reg (data_mode, op1);
24590 break;
24592 default:
24593 gcc_unreachable ();
24596 /* There is native support for the inverse comparison. */
24597 code = reverse_condition_maybe_unordered (code);
24598 if (can_invert_p)
24600 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24601 return true;
24603 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24604 return false;
24607 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24608 of the data being selected and CMP_MODE is the mode of the values being
24609 compared. */
24611 void
24612 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24613 rtx *ops)
24615 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24616 rtx pred = gen_reg_rtx (pred_mode);
24617 if (FLOAT_MODE_P (cmp_mode))
24619 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24620 ops[4], ops[5], true))
24621 std::swap (ops[1], ops[2]);
24623 else
24624 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24626 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24627 ops[1] = force_reg (data_mode, ops[1]);
24628 /* The "false" value can only be zero if the "true" value is a constant. */
24629 if (register_operand (ops[1], data_mode)
24630 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24631 ops[2] = force_reg (data_mode, ops[2]);
24633 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24634 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24637 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24638 true. However, due to issues with register allocation it is preferable
24639 to avoid tying integer scalar and FP scalar modes. Executing integer
24640 operations in general registers is better than treating them as scalar
24641 vector operations. This reduces latency and avoids redundant int<->FP
24642 moves. So tie modes if they are either the same class, or vector modes
24643 with other vector modes, vector structs or any scalar mode. */
24645 static bool
24646 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24648 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24649 != aarch64_advsimd_partial_struct_mode_p (mode2))
24650 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24651 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24652 return false;
24654 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24655 return true;
24657 /* We specifically want to allow elements of "structure" modes to
24658 be tieable to the structure. This more general condition allows
24659 other rarer situations too. The reason we don't extend this to
24660 predicate modes is that there are no predicate structure modes
24661 nor any specific instructions for extracting part of a predicate
24662 register. */
24663 if (aarch64_vector_data_mode_p (mode1)
24664 && aarch64_vector_data_mode_p (mode2))
24665 return true;
24667 /* Also allow any scalar modes with vectors. */
24668 if (aarch64_vector_mode_supported_p (mode1)
24669 || aarch64_vector_mode_supported_p (mode2))
24670 return true;
24672 return false;
24675 /* Return a new RTX holding the result of moving POINTER forward by
24676 AMOUNT bytes. */
24678 static rtx
24679 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24681 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24683 return adjust_automodify_address (pointer, GET_MODE (pointer),
24684 next, amount);
24687 /* Return a new RTX holding the result of moving POINTER forward by the
24688 size of the mode it points to. */
24690 static rtx
24691 aarch64_progress_pointer (rtx pointer)
24693 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24696 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24697 MODE bytes. */
24699 static void
24700 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24701 machine_mode mode)
24703 /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
24704 address copies using V4SImode so that we can use Q registers. */
24705 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24707 mode = V4SImode;
24708 rtx reg1 = gen_reg_rtx (mode);
24709 rtx reg2 = gen_reg_rtx (mode);
24710 /* "Cast" the pointers to the correct mode. */
24711 *src = adjust_address (*src, mode, 0);
24712 *dst = adjust_address (*dst, mode, 0);
24713 /* Emit the memcpy. */
24714 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24715 aarch64_progress_pointer (*src)));
24716 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24717 aarch64_progress_pointer (*dst), reg2));
24718 /* Move the pointers forward. */
24719 *src = aarch64_move_pointer (*src, 32);
24720 *dst = aarch64_move_pointer (*dst, 32);
24721 return;
24724 rtx reg = gen_reg_rtx (mode);
24726 /* "Cast" the pointers to the correct mode. */
24727 *src = adjust_address (*src, mode, 0);
24728 *dst = adjust_address (*dst, mode, 0);
24729 /* Emit the memcpy. */
24730 emit_move_insn (reg, *src);
24731 emit_move_insn (*dst, reg);
24732 /* Move the pointers forward. */
24733 *src = aarch64_progress_pointer (*src);
24734 *dst = aarch64_progress_pointer (*dst);
24737 /* Expand a cpymem using the MOPS extension. OPERANDS are taken
24738 from the cpymem pattern. Return true iff we succeeded. */
24739 static bool
24740 aarch64_expand_cpymem_mops (rtx *operands)
24742 if (!TARGET_MOPS)
24743 return false;
24745 /* All three registers are changed by the instruction, so each one
24746 must be a fresh pseudo. */
24747 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24748 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24749 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24750 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24751 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24752 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24754 return true;
24757 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24758 we succeed, otherwise return false, indicating that a libcall to
24759 memcpy should be emitted. */
24761 bool
24762 aarch64_expand_cpymem (rtx *operands)
24764 int mode_bits;
24765 rtx dst = operands[0];
24766 rtx src = operands[1];
24767 rtx base;
24768 machine_mode cur_mode = BLKmode;
24770 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24771 if (!CONST_INT_P (operands[2]))
24772 return aarch64_expand_cpymem_mops (operands);
24774 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24776 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24777 unsigned HOST_WIDE_INT max_copy_size
24778 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24780 bool size_p = optimize_function_for_size_p (cfun);
24782 /* Large constant-sized cpymem should go through MOPS when possible.
24783 It should be a win even for size optimization in the general case.
24784 For speed optimization the choice between MOPS and the SIMD sequence
24785 depends on the size of the copy, rather than number of instructions,
24786 alignment etc. */
24787 if (size > max_copy_size)
24788 return aarch64_expand_cpymem_mops (operands);
24790 int copy_bits = 256;
24792 /* Default to 256-bit LDP/STP on large copies; for small copies, no SIMD
24793 support or slow 256-bit LDP/STP, fall back to 128-bit chunks. */
24794 if (size <= 24
24795 || !TARGET_SIMD
24796 || (aarch64_tune_params.extra_tuning_flags
24797 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24798 copy_bits = 128;
24800 /* Emit an inline load+store sequence and count the number of operations
24801 involved. We use a simple count of just the loads and stores emitted
24802 rather than rtx_insn count as all the pointer adjustments and reg copying
24803 in this function will get optimized away later in the pipeline. */
24804 start_sequence ();
24805 unsigned nops = 0;
24807 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24808 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24810 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24811 src = adjust_automodify_address (src, VOIDmode, base, 0);
24813 /* Convert size to bits to make the rest of the code simpler. */
24814 int n = size * BITS_PER_UNIT;
24816 while (n > 0)
24818 /* Find the largest mode in which to do the copy without over-reading
24819 or over-writing. */
24820 opt_scalar_int_mode mode_iter;
24821 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24822 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24823 cur_mode = mode_iter.require ();
24825 gcc_assert (cur_mode != BLKmode);
24827 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24829 /* Prefer Q-register accesses for the last bytes. */
24830 if (mode_bits == 128 && copy_bits == 256)
24831 cur_mode = V4SImode;
24833 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24834 /* A single block copy is 1 load + 1 store. */
24835 nops += 2;
24836 n -= mode_bits;
24838 /* Emit trailing copies using overlapping unaligned accesses
24839 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
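/* For example, a 23-byte copy becomes a 16-byte copy followed by an 8-byte
copy whose first byte overlaps the last byte already written, rather than
separate 4-, 2- and 1-byte copies. (Illustrative size.) */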
24840 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24842 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
24843 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24844 gcc_assert (n_bits <= mode_bits);
24845 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
24846 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24847 n = n_bits;
24850 rtx_insn *seq = get_insns ();
24851 end_sequence ();
24852 /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
24853 the constant size into a register. */
24854 unsigned mops_cost = 3 + 1;
24856 /* If MOPS is available at this point we don't consider the libcall as it's
24857 not a win even on code size. At this point only consider MOPS if
24858 optimizing for size. For speed optimizations we will have chosen between
24859 the two based on copy size already. */
24860 if (TARGET_MOPS)
24862 if (size_p && mops_cost < nops)
24863 return aarch64_expand_cpymem_mops (operands);
24864 emit_insn (seq);
24865 return true;
24868 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
24869 arguments + 1 for the call. When MOPS is not available and we're
24870 optimizing for size, a libcall may be preferable. */
24871 unsigned libcall_cost = 4;
24872 if (size_p && libcall_cost < nops)
24873 return false;
24875 emit_insn (seq);
24876 return true;
24879 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
24880 SRC is a register we have created with the duplicated value to be set. */
24881 static void
24882 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
24883 machine_mode mode)
24885 /* If we are copying 128 bits or 256 bits, we can do that straight from
24886 the SIMD register we prepared. */
24887 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24889 mode = GET_MODE (src);
24890 /* "Cast" the *dst to the correct mode. */
24891 *dst = adjust_address (*dst, mode, 0);
24892 /* Emit the memset. */
24893 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
24894 aarch64_progress_pointer (*dst), src));
24896 /* Move the pointers forward. */
24897 *dst = aarch64_move_pointer (*dst, 32);
24898 return;
24900 if (known_eq (GET_MODE_BITSIZE (mode), 128))
24902 /* "Cast" the *dst to the correct mode. */
24903 *dst = adjust_address (*dst, GET_MODE (src), 0);
24904 /* Emit the memset. */
24905 emit_move_insn (*dst, src);
24906 /* Move the pointers forward. */
24907 *dst = aarch64_move_pointer (*dst, 16);
24908 return;
24910 /* For copying less than 128 bits, we have to extract the right amount from src. */
24911 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
24913 /* "Cast" the *dst to the correct mode. */
24914 *dst = adjust_address (*dst, mode, 0);
24915 /* Emit the memset. */
24916 emit_move_insn (*dst, reg);
24917 /* Move the pointer forward. */
24918 *dst = aarch64_progress_pointer (*dst);
24921 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
24922 as for the setmem pattern. Return true iff we succeed. */
24923 static bool
24924 aarch64_expand_setmem_mops (rtx *operands)
24926 if (!TARGET_MOPS)
24927 return false;
24929 /* The first two registers are changed by the instruction, so both
24930 of them must be a fresh pseudo. */
24931 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24932 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24933 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
24934 rtx val = operands[2];
24935 if (val != CONST0_RTX (QImode))
24936 val = force_reg (QImode, val);
24937 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
24938 return true;
24941 /* Expand setmem, as if from a __builtin_memset. Return true if
24942 we succeed, otherwise return false. */
24944 bool
24945 aarch64_expand_setmem (rtx *operands)
24947 int n, mode_bits;
24948 unsigned HOST_WIDE_INT len;
24949 rtx dst = operands[0];
24950 rtx val = operands[2], src;
24951 rtx base;
24952 machine_mode cur_mode = BLKmode, next_mode;
24954 /* If we don't have SIMD registers or the size is variable, use the MOPS
24955 inlined sequence if possible. */
24956 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
24957 return aarch64_expand_setmem_mops (operands);
24959 bool size_p = optimize_function_for_size_p (cfun);
24961 /* Default the maximum to 256 bytes when considering only a libcall vs
24962 the SIMD broadcast sequence. */
24963 unsigned max_set_size = 256;
24965 len = INTVAL (operands[1]);
24966 if (len > max_set_size && !TARGET_MOPS)
24967 return false;
24969 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
24970 /* The MOPS sequence takes:
24971 3 instructions for the memory storing
24972 + 1 to move the constant size into a reg
24973 + 1 if VAL is a non-zero constant to move into a reg
24974 (zero constants can use XZR directly). */
24975 unsigned mops_cost = 3 + 1 + cst_val;
24976 /* A libcall to memset in the worst case takes 3 instructions to prepare
24977 the arguments + 1 for the call. */
24978 unsigned libcall_cost = 4;
24980 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
24981 when available. */
24982 if (TARGET_MOPS
24983 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
24984 return aarch64_expand_setmem_mops (operands);
24986 /* Attempt a sequence with a vector broadcast followed by stores.
24987 Count the number of operations involved to see if it's worth it
24988 against the alternatives. A simple counter simd_ops on the
24989 algorithmically-relevant operations is used rather than an rtx_insn count
24990 as all the pointer adjustments and mode reinterprets will be optimized
24991 away later. */
24992 start_sequence ();
24993 unsigned simd_ops = 0;
24995 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24996 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24998 /* Prepare the val using a DUP/MOVI v0.16B, val. */
24999 src = expand_vector_broadcast (V16QImode, val);
25000 src = force_reg (V16QImode, src);
25001 simd_ops++;
25002 /* Convert len to bits to make the rest of the code simpler. */
25003 n = len * BITS_PER_UNIT;
25005 /* Maximum amount to copy in one go. We allow 256-bit chunks unless the
25006 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning flag is set. */
25007 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
25008 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
25009 ? GET_MODE_BITSIZE (TImode) : 256;
25011 while (n > 0)
25013 /* Find the largest mode in which to do the copy without
25014 over-writing. */
25015 opt_scalar_int_mode mode_iter;
25016 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
25017 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
25018 cur_mode = mode_iter.require ();
25020 gcc_assert (cur_mode != BLKmode);
25022 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
25023 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
25024 simd_ops++;
25025 n -= mode_bits;
25027 /* Do certain trailing copies as overlapping if it's going to be
25028 cheaper, i.e. fewer instructions to do so. For instance, for a 15
25029 byte copy it is more efficient to do two overlapping 8 byte copies than
25030 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
25031 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
25033 next_mode = smallest_mode_for_size (n, MODE_INT);
25034 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25035 gcc_assert (n_bits <= mode_bits);
25036 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25037 n = n_bits;
25040 rtx_insn *seq = get_insns ();
25041 end_sequence ();
25043 if (size_p)
25045 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
25046 a call to memset, or the MOPS expansion. */
25047 if (TARGET_MOPS
25048 && mops_cost <= libcall_cost
25049 && mops_cost <= simd_ops)
25050 return aarch64_expand_setmem_mops (operands);
25051 /* If MOPS is not available or not shorter pick a libcall if the SIMD
25052 sequence is too long. */
25053 else if (libcall_cost < simd_ops)
25054 return false;
25055 emit_insn (seq);
25056 return true;
25059 /* At this point the SIMD broadcast sequence is the best choice when
25060 optimizing for speed. */
25061 emit_insn (seq);
25062 return true;
25066 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
25067 SImode stores. Handle the case when the constant has identical
25068 bottom and top halves. This is beneficial when the two stores can be
25069 merged into an STP and we avoid synthesising potentially expensive
25070 immediates twice. Return true if such a split is possible. */
25072 bool
25073 aarch64_split_dimode_const_store (rtx dst, rtx src)
25075 rtx lo = gen_lowpart (SImode, src);
25076 rtx hi = gen_highpart_mode (SImode, DImode, src);
25078 bool size_p = optimize_function_for_size_p (cfun);
25080 if (!rtx_equal_p (lo, hi))
25081 return false;
25083 unsigned int orig_cost
25084 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
25085 unsigned int lo_cost
25086 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
25088 /* We want to transform:
25089 MOV x1, 49370
25090 MOVK x1, 0x140, lsl 16
25091 MOVK x1, 0xc0da, lsl 32
25092 MOVK x1, 0x140, lsl 48
25093 STR x1, [x0]
25094 into:
25095 MOV w1, 49370
25096 MOVK w1, 0x140, lsl 16
25097 STP w1, w1, [x0]
25098 So we want to perform this only when we save two instructions
25099 or more. When optimizing for size, however, accept any code size
25100 savings we can. */
25101 if (size_p && orig_cost <= lo_cost)
25102 return false;
25104 if (!size_p
25105 && (orig_cost <= lo_cost + 1))
25106 return false;
25108 rtx mem_lo = adjust_address (dst, SImode, 0);
25109 if (!aarch64_mem_pair_operand (mem_lo, SImode))
25110 return false;
25112 rtx tmp_reg = gen_reg_rtx (SImode);
25113 aarch64_expand_mov_immediate (tmp_reg, lo);
25114 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
25115 /* Don't emit an explicit store pair as this may not be always profitable.
25116 Let the sched-fusion logic decide whether to merge them. */
25117 emit_move_insn (mem_lo, tmp_reg);
25118 emit_move_insn (mem_hi, tmp_reg);
25120 return true;
25123 /* Generate RTL for a conditional branch with rtx comparison CODE in
25124 mode CC_MODE. The destination of the unlikely conditional branch
25125 is LABEL_REF. */
25127 void
25128 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
25129 rtx label_ref)
25131 rtx x;
25132 x = gen_rtx_fmt_ee (code, VOIDmode,
25133 gen_rtx_REG (cc_mode, CC_REGNUM),
25134 const0_rtx);
25136 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25137 gen_rtx_LABEL_REF (VOIDmode, label_ref),
25138 pc_rtx);
25139 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25142 /* Generate DImode scratch registers for 128-bit (TImode) addition.
25144 OP1 represents the TImode source operand 1
25145 OP2 represents the TImode source operand 2
25146 LOW_DEST represents the low half (DImode) of TImode operand 0
25147 LOW_IN1 represents the low half (DImode) of TImode operand 1
25148 LOW_IN2 represents the low half (DImode) of TImode operand 2
25149 HIGH_DEST represents the high half (DImode) of TImode operand 0
25150 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25151 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25153 void
25154 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25155 rtx *low_in1, rtx *low_in2,
25156 rtx *high_dest, rtx *high_in1,
25157 rtx *high_in2)
25159 *low_dest = gen_reg_rtx (DImode);
25160 *low_in1 = gen_lowpart (DImode, op1);
25161 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25162 subreg_lowpart_offset (DImode, TImode));
25163 *high_dest = gen_reg_rtx (DImode);
25164 *high_in1 = gen_highpart (DImode, op1);
25165 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25166 subreg_highpart_offset (DImode, TImode));
25169 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25171 This function differs from 'aarch64_addti_scratch_regs' in that
25172 OP1 can be an immediate constant (zero). We must call
25173 subreg_highpart_offset with DImode and TImode arguments, otherwise
25174 VOIDmode will be used for the const_int, which generates an internal
25175 error from subreg_size_highpart_offset, which does not expect a size of zero.
25177 OP1 represents the TImode source operand 1
25178 OP2 represents the TImode source operand 2
25179 LOW_DEST represents the low half (DImode) of TImode operand 0
25180 LOW_IN1 represents the low half (DImode) of TImode operand 1
25181 LOW_IN2 represents the low half (DImode) of TImode operand 2
25182 HIGH_DEST represents the high half (DImode) of TImode operand 0
25183 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25184 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25187 void
25188 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25189 rtx *low_in1, rtx *low_in2,
25190 rtx *high_dest, rtx *high_in1,
25191 rtx *high_in2)
25193 *low_dest = gen_reg_rtx (DImode);
25194 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
25195 subreg_lowpart_offset (DImode, TImode));
25197 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25198 subreg_lowpart_offset (DImode, TImode));
25199 *high_dest = gen_reg_rtx (DImode);
25201 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
25202 subreg_highpart_offset (DImode, TImode));
25203 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25204 subreg_highpart_offset (DImode, TImode));
25207 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
25209 OP0 represents the TImode destination operand 0
25210 LOW_DEST represents the low half (DImode) of TImode operand 0
25211 LOW_IN1 represents the low half (DImode) of TImode operand 1
25212 LOW_IN2 represents the low half (DImode) of TImode operand 2
25213 HIGH_DEST represents the high half (DImode) of TImode operand 0
25214 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25215 HIGH_IN2 represents the high half (DImode) of TImode operand 2
25216 UNSIGNED_P is true if the operation is being performed on unsigned
25217 values. */
25218 void
25219 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
25220 rtx low_in2, rtx high_dest, rtx high_in1,
25221 rtx high_in2, bool unsigned_p)
25223 if (low_in2 == const0_rtx)
25225 low_dest = low_in1;
25226 high_in2 = force_reg (DImode, high_in2);
25227 if (unsigned_p)
25228 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
25229 else
25230 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
25232 else
25234 if (aarch64_plus_immediate (low_in2, DImode))
25235 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25236 GEN_INT (-UINTVAL (low_in2))));
25237 else
25239 low_in2 = force_reg (DImode, low_in2);
25240 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25242 high_in2 = force_reg (DImode, high_in2);
25244 if (unsigned_p)
25245 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25246 else
25247 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25250 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25251 emit_move_insn (gen_highpart (DImode, op0), high_dest);
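
/* Illustrative only, not part of GCC: the double-word subtraction that the
   expander above emits as SUBS/SBCS, written out with plain 64-bit
   integers.  The borrow produced by the low halves feeds the high halves.
   The example_* name is ours.  */
static void
example_sub128 (unsigned long long lo1, unsigned long long hi1,
		unsigned long long lo2, unsigned long long hi2,
		unsigned long long *lo_res, unsigned long long *hi_res)
{
  *lo_res = lo1 - lo2;			/* SUBS: also computes the borrow.  */
  unsigned long long borrow = lo1 < lo2;
  *hi_res = hi1 - hi2 - borrow;		/* SBCS: consumes the borrow.  */
}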
25255 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
25257 static unsigned HOST_WIDE_INT
25258 aarch64_asan_shadow_offset (void)
25260 if (TARGET_ILP32)
25261 return (HOST_WIDE_INT_1 << 29);
25262 else
25263 return (HOST_WIDE_INT_1 << 36);
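
/* Illustrative only, not part of GCC: a hypothetical sketch of how a
   sanitizer runtime typically combines the offset returned above with an
   address, assuming the usual shadow scale of 3 (one shadow byte per 8
   application bytes).  The example_* name is ours.  */
static unsigned long long
example_shadow_address (unsigned long long addr,
			unsigned long long shadow_offset)
{
  return (addr >> 3) + shadow_offset;
}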
25266 static rtx
25267 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25268 int code, tree treeop0, tree treeop1)
25270 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25271 rtx op0, op1;
25272 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25273 insn_code icode;
25274 struct expand_operand ops[4];
25276 start_sequence ();
25277 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25279 op_mode = GET_MODE (op0);
25280 if (op_mode == VOIDmode)
25281 op_mode = GET_MODE (op1);
25283 switch (op_mode)
25285 case E_QImode:
25286 case E_HImode:
25287 case E_SImode:
25288 cmp_mode = SImode;
25289 icode = CODE_FOR_cmpsi;
25290 break;
25292 case E_DImode:
25293 cmp_mode = DImode;
25294 icode = CODE_FOR_cmpdi;
25295 break;
25297 case E_SFmode:
25298 cmp_mode = SFmode;
25299 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25300 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25301 break;
25303 case E_DFmode:
25304 cmp_mode = DFmode;
25305 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25306 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25307 break;
25309 default:
25310 end_sequence ();
25311 return NULL_RTX;
25314 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25315 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25316 if (!op0 || !op1)
25318 end_sequence ();
25319 return NULL_RTX;
25321 *prep_seq = get_insns ();
25322 end_sequence ();
25324 create_fixed_operand (&ops[0], op0);
25325 create_fixed_operand (&ops[1], op1);
25327 start_sequence ();
25328 if (!maybe_expand_insn (icode, 2, ops))
25330 end_sequence ();
25331 return NULL_RTX;
25333 *gen_seq = get_insns ();
25334 end_sequence ();
25336 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25337 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
25340 static rtx
25341 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25342 int cmp_code, tree treeop0, tree treeop1, int bit_code)
25344 rtx op0, op1, target;
25345 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25346 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25347 insn_code icode;
25348 struct expand_operand ops[6];
25349 int aarch64_cond;
25351 push_to_sequence (*prep_seq);
25352 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25354 op_mode = GET_MODE (op0);
25355 if (op_mode == VOIDmode)
25356 op_mode = GET_MODE (op1);
25358 switch (op_mode)
25360 case E_QImode:
25361 case E_HImode:
25362 case E_SImode:
25363 cmp_mode = SImode;
25364 break;
25366 case E_DImode:
25367 cmp_mode = DImode;
25368 break;
25370 case E_SFmode:
25371 cmp_mode = SFmode;
25372 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25373 break;
25375 case E_DFmode:
25376 cmp_mode = DFmode;
25377 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25378 break;
25380 default:
25381 end_sequence ();
25382 return NULL_RTX;
25385 icode = code_for_ccmp (cc_mode, cmp_mode);
25387 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25388 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25389 if (!op0 || !op1)
25391 end_sequence ();
25392 return NULL_RTX;
25394 *prep_seq = get_insns ();
25395 end_sequence ();
25397 target = gen_rtx_REG (cc_mode, CC_REGNUM);
25398 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25400 if (bit_code != AND)
25402 /* Treat the ccmp patterns as canonical and use them where possible,
25403 but fall back to ccmp_rev patterns if there's no other option. */
25404 rtx_code prev_code = GET_CODE (prev);
25405 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25406 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25407 && !(prev_code == EQ
25408 || prev_code == NE
25409 || prev_code == ORDERED
25410 || prev_code == UNORDERED))
25411 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25412 else
25414 rtx_code code = reverse_condition (prev_code);
25415 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25417 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25420 create_fixed_operand (&ops[0], XEXP (prev, 0));
25421 create_fixed_operand (&ops[1], target);
25422 create_fixed_operand (&ops[2], op0);
25423 create_fixed_operand (&ops[3], op1);
25424 create_fixed_operand (&ops[4], prev);
25425 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25427 push_to_sequence (*gen_seq);
25428 if (!maybe_expand_insn (icode, 6, ops))
25430 end_sequence ();
25431 return NULL_RTX;
25434 *gen_seq = get_insns ();
25435 end_sequence ();
25437 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
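
/* Illustrative only, not part of GCC: the effect of the CMP/CCMP chain
   built by the two hooks above, modelled with booleans for a condition such
   as "a == b && c < d".  The CCMP performs the second comparison only when
   the first condition held; otherwise it substitutes immediate flags chosen
   so the final test fails.  The example_* name is ours.  */
static bool
example_ccmp_and (long a, long b, long c, long d)
{
  bool first = (a == b);		/* CMP a, b  */
  bool result = first ? (c < d)		/* CCMP c, d, #nzcv, eq  */
		      : false;		/* #nzcv chosen so "lt" fails  */
  return result;			/* csel/branch on "lt"  */
}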
25440 #undef TARGET_GEN_CCMP_FIRST
25441 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25443 #undef TARGET_GEN_CCMP_NEXT
25444 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25446 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25447 instruction fusion of some sort. */
25449 static bool
25450 aarch64_macro_fusion_p (void)
25452 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25456 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25457 should be kept together during scheduling. */
25459 static bool
25460 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25462 rtx set_dest;
25463 rtx prev_set = single_set (prev);
25464 rtx curr_set = single_set (curr);
25465 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
25466 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25468 if (!aarch64_macro_fusion_p ())
25469 return false;
25471 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25473 /* We are trying to match:
25474 prev (mov) == (set (reg r0) (const_int imm16))
25475 curr (movk) == (set (zero_extract (reg r0)
25476 (const_int 16)
25477 (const_int 16))
25478 (const_int imm16_1)) */
25480 set_dest = SET_DEST (curr_set);
25482 if (GET_CODE (set_dest) == ZERO_EXTRACT
25483 && CONST_INT_P (SET_SRC (curr_set))
25484 && CONST_INT_P (SET_SRC (prev_set))
25485 && CONST_INT_P (XEXP (set_dest, 2))
25486 && INTVAL (XEXP (set_dest, 2)) == 16
25487 && REG_P (XEXP (set_dest, 0))
25488 && REG_P (SET_DEST (prev_set))
25489 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25491 return true;
25495 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25498 /* We're trying to match:
25499 prev (adrp) == (set (reg r1)
25500 (high (symbol_ref ("SYM"))))
25501 curr (add) == (set (reg r0)
25502 (lo_sum (reg r1)
25503 (symbol_ref ("SYM"))))
25504 Note that r0 need not necessarily be the same as r1, especially
25505 during pre-regalloc scheduling. */
25507 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25508 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25510 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25511 && REG_P (XEXP (SET_SRC (curr_set), 0))
25512 && REGNO (XEXP (SET_SRC (curr_set), 0))
25513 == REGNO (SET_DEST (prev_set))
25514 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25515 XEXP (SET_SRC (curr_set), 1)))
25516 return true;
25520 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25523 /* We're trying to match:
25524 prev (movk) == (set (zero_extract (reg r0)
25525 (const_int 16)
25526 (const_int 32))
25527 (const_int imm16_1))
25528 curr (movk) == (set (zero_extract (reg r0)
25529 (const_int 16)
25530 (const_int 48))
25531 (const_int imm16_2)) */
25533 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25534 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25535 && REG_P (XEXP (SET_DEST (prev_set), 0))
25536 && REG_P (XEXP (SET_DEST (curr_set), 0))
25537 && REGNO (XEXP (SET_DEST (prev_set), 0))
25538 == REGNO (XEXP (SET_DEST (curr_set), 0))
25539 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25540 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25541 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25542 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25543 && CONST_INT_P (SET_SRC (prev_set))
25544 && CONST_INT_P (SET_SRC (curr_set)))
25545 return true;
25548 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25550 /* We're trying to match:
25551 prev (adrp) == (set (reg r0)
25552 (high (symbol_ref ("SYM"))))
25553 curr (ldr) == (set (reg r1)
25554 (mem (lo_sum (reg r0)
25555 (symbol_ref ("SYM")))))
25557 curr (ldr) == (set (reg r1)
25558 (zero_extend (mem
25559 (lo_sum (reg r0)
25560 (symbol_ref ("SYM")))))) */
25561 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25562 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25564 rtx curr_src = SET_SRC (curr_set);
25566 if (GET_CODE (curr_src) == ZERO_EXTEND)
25567 curr_src = XEXP (curr_src, 0);
25569 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25570 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25571 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25572 == REGNO (SET_DEST (prev_set))
25573 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25574 XEXP (SET_SRC (prev_set), 0)))
25575 return true;
25579 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
25580 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25581 && prev_set && curr_set && any_condjump_p (curr)
25582 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25583 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25584 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25585 return true;
25587 /* Fuse flag-setting ALU instructions and conditional branch. */
25588 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25589 && any_condjump_p (curr))
25591 unsigned int condreg1, condreg2;
25592 rtx cc_reg_1;
25593 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25594 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25596 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25597 && prev
25598 && modified_in_p (cc_reg_1, prev))
25600 enum attr_type prev_type = get_attr_type (prev);
25602 /* FIXME: this misses some instructions that are considered simple
25603 arithmetic instructions for ThunderX. Simple shifts are missed here. */
25604 if (prev_type == TYPE_ALUS_SREG
25605 || prev_type == TYPE_ALUS_IMM
25606 || prev_type == TYPE_LOGICS_REG
25607 || prev_type == TYPE_LOGICS_IMM)
25608 return true;
25612 /* Fuse ALU instructions and CBZ/CBNZ. */
25613 if (prev_set
25614 && curr_set
25615 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25616 && any_condjump_p (curr))
25618 /* We're trying to match:
25619 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25620 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25621 (const_int 0))
25622 (label_ref ("SYM"))
25623 (pc)) */
25624 if (SET_DEST (curr_set) == (pc_rtx)
25625 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25626 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25627 && REG_P (SET_DEST (prev_set))
25628 && REGNO (SET_DEST (prev_set))
25629 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25631 /* Fuse ALU operations followed by conditional branch instruction. */
25632 switch (get_attr_type (prev))
25634 case TYPE_ALU_IMM:
25635 case TYPE_ALU_SREG:
25636 case TYPE_ADC_REG:
25637 case TYPE_ADC_IMM:
25638 case TYPE_ADCS_REG:
25639 case TYPE_ADCS_IMM:
25640 case TYPE_LOGIC_REG:
25641 case TYPE_LOGIC_IMM:
25642 case TYPE_CSEL:
25643 case TYPE_ADR:
25644 case TYPE_MOV_IMM:
25645 case TYPE_SHIFT_REG:
25646 case TYPE_SHIFT_IMM:
25647 case TYPE_BFM:
25648 case TYPE_RBIT:
25649 case TYPE_REV:
25650 case TYPE_EXTEND:
25651 return true;
25653 default:;
25658 /* Fuse A+B+1 and A-B-1 */
25659 if (simple_sets_p
25660 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
25662 /* We're trying to match:
25663 prev == (set (r0) (plus (r0) (r1)))
25664 curr == (set (r0) (plus (r0) (const_int 1)))
25666 prev == (set (r0) (minus (r0) (r1)))
25667 curr == (set (r0) (plus (r0) (const_int -1))) */
25669 rtx prev_src = SET_SRC (prev_set);
25670 rtx curr_src = SET_SRC (curr_set);
25672 int polarity = 1;
25673 if (GET_CODE (prev_src) == MINUS)
25674 polarity = -1;
25676 if (GET_CODE (curr_src) == PLUS
25677 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
25678 && CONST_INT_P (XEXP (curr_src, 1))
25679 && INTVAL (XEXP (curr_src, 1)) == polarity
25680 && REG_P (XEXP (curr_src, 0))
25681 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
25682 return true;
25685 return false;
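
/* Illustrative only, not part of GCC: the shape of the MOV/MOVK fusion test
   above, with each insn reduced to a destination register number and a flag
   saying whether it only writes a 16-bit field of that register.  The
   example_* names are ours.  */
struct example_insn { unsigned dest_regno; bool is_movk; };

static bool
example_mov_movk_fusible_p (const example_insn &prev,
			    const example_insn &curr)
{
  return !prev.is_movk
	 && curr.is_movk
	 && prev.dest_regno == curr.dest_regno;
}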
25688 /* Return true iff the instruction fusion described by OP is enabled. */
25690 bool
25691 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25693 return (aarch64_tune_params.fusible_ops & op) != 0;
25696 /* If MEM is in the form of [base+offset], extract the two parts
25697 of the address and store them in BASE and OFFSET; otherwise return false
25698 after clearing BASE and OFFSET. */
25700 bool
25701 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25703 rtx addr;
25705 gcc_assert (MEM_P (mem));
25707 addr = XEXP (mem, 0);
25709 if (REG_P (addr))
25711 *base = addr;
25712 *offset = const0_rtx;
25713 return true;
25716 if (GET_CODE (addr) == PLUS
25717 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25719 *base = XEXP (addr, 0);
25720 *offset = XEXP (addr, 1);
25721 return true;
25724 *base = NULL_RTX;
25725 *offset = NULL_RTX;
25727 return false;
25730 /* Types for scheduling fusion. */
25731 enum sched_fusion_type
25733 SCHED_FUSION_NONE = 0,
25734 SCHED_FUSION_LD_SIGN_EXTEND,
25735 SCHED_FUSION_LD_ZERO_EXTEND,
25736 SCHED_FUSION_LD,
25737 SCHED_FUSION_ST,
25738 SCHED_FUSION_NUM
25741 /* If INSN is a load or store of an address in the form of [base+offset],
25742 extract the two parts and store them in BASE and OFFSET. Return the
25743 scheduling fusion type of INSN. */
25745 static enum sched_fusion_type
25746 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25748 rtx x, dest, src;
25749 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25751 gcc_assert (INSN_P (insn));
25752 x = PATTERN (insn);
25753 if (GET_CODE (x) != SET)
25754 return SCHED_FUSION_NONE;
25756 src = SET_SRC (x);
25757 dest = SET_DEST (x);
25759 machine_mode dest_mode = GET_MODE (dest);
25761 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25762 return SCHED_FUSION_NONE;
25764 if (GET_CODE (src) == SIGN_EXTEND)
25766 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25767 src = XEXP (src, 0);
25768 if (!MEM_P (src) || GET_MODE (src) != SImode)
25769 return SCHED_FUSION_NONE;
25771 else if (GET_CODE (src) == ZERO_EXTEND)
25773 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25774 src = XEXP (src, 0);
25775 if (!MEM_P (src) || GET_MODE (src) != SImode)
25776 return SCHED_FUSION_NONE;
25779 if (MEM_P (src) && REG_P (dest))
25780 extract_base_offset_in_addr (src, base, offset);
25781 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25783 fusion = SCHED_FUSION_ST;
25784 extract_base_offset_in_addr (dest, base, offset);
25786 else
25787 return SCHED_FUSION_NONE;
25789 if (*base == NULL_RTX || *offset == NULL_RTX)
25790 fusion = SCHED_FUSION_NONE;
25792 return fusion;
25795 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25797 Currently we only support fusing ldr and str instructions, so FUSION_PRI
25798 and PRI are only calculated for these instructions. For other instructions,
25799 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
25800 types of instruction fusion can be added by returning different priorities.
25802 It's important that irrelevant instructions get the largest FUSION_PRI. */
25804 static void
25805 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25806 int *fusion_pri, int *pri)
25808 int tmp, off_val;
25809 rtx base, offset;
25810 enum sched_fusion_type fusion;
25812 gcc_assert (INSN_P (insn));
25814 tmp = max_pri - 1;
25815 fusion = fusion_load_store (insn, &base, &offset);
25816 if (fusion == SCHED_FUSION_NONE)
25818 *pri = tmp;
25819 *fusion_pri = tmp;
25820 return;
25823 /* Set FUSION_PRI according to fusion type and base register. */
25824 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25826 /* Calculate PRI. */
25827 tmp /= 2;
25829 /* INSN with smaller offset goes first. */
25830 off_val = (int)(INTVAL (offset));
25831 if (off_val >= 0)
25832 tmp -= (off_val & 0xfffff);
25833 else
25834 tmp += ((- off_val) & 0xfffff);
25836 *pri = tmp;
25837 return;
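
/* Illustrative only, not part of GCC: the priority arithmetic above on
   plain integers, with NUM_HARD_REGS standing in for
   FIRST_PSEUDO_REGISTER.  The example_* name is ours.  */
static void
example_fusion_priority (int max_pri, int fusion_type, int base_regno,
			 long long offset, int num_hard_regs,
			 int *fusion_pri, int *pri)
{
  int tmp = max_pri - 1;
  /* Accesses with the same fusion type and base register get adjacent
     FUSION_PRI values.  */
  *fusion_pri = tmp - fusion_type * num_hard_regs - base_regno;
  /* Within such a group, the access with the smaller offset goes first.  */
  tmp /= 2;
  if (offset >= 0)
    tmp -= (int) (offset & 0xfffff);
  else
    tmp += (int) ((-offset) & 0xfffff);
  *pri = tmp;
}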
25840 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25841 Adjust priority of sha1h instructions so they are scheduled before
25842 other SHA1 instructions. */
25844 static int
25845 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
25847 rtx x = PATTERN (insn);
25849 if (GET_CODE (x) == SET)
25851 x = SET_SRC (x);
25853 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
25854 return priority + 10;
25857 return priority;
25860 /* If REVERSED is null, return true if memory reference *MEM2 comes
25861 immediately after memory reference *MEM1. Do not change the references
25862 in this case.
25864 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
25865 if they are, try to make them use constant offsets from the same base
25866 register. Return true on success. When returning true, set *REVERSED
25867 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
25868 static bool
25869 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
25871 if (reversed)
25872 *reversed = false;
25874 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
25875 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
25876 return false;
25878 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
25879 return false;
25881 auto size1 = MEM_SIZE (*mem1);
25882 auto size2 = MEM_SIZE (*mem2);
25884 rtx base1, base2, offset1, offset2;
25885 extract_base_offset_in_addr (*mem1, &base1, &offset1);
25886 extract_base_offset_in_addr (*mem2, &base2, &offset2);
25888 /* Make sure at least one memory is in base+offset form. */
25889 if (!(base1 && offset1) && !(base2 && offset2))
25890 return false;
25892 /* If both mems already use the same base register, just check the
25893 offsets. */
25894 if (base1 && base2 && rtx_equal_p (base1, base2))
25896 if (!offset1 || !offset2)
25897 return false;
25899 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
25900 return true;
25902 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
25904 *reversed = true;
25905 return true;
25908 return false;
25911 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
25912 guarantee that the values are consecutive. */
25913 if (MEM_EXPR (*mem1)
25914 && MEM_EXPR (*mem2)
25915 && MEM_OFFSET_KNOWN_P (*mem1)
25916 && MEM_OFFSET_KNOWN_P (*mem2))
25918 poly_int64 expr_offset1;
25919 poly_int64 expr_offset2;
25920 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
25921 &expr_offset1);
25922 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
25923 &expr_offset2);
25924 if (!expr_base1
25925 || !expr_base2
25926 || !DECL_P (expr_base1)
25927 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
25928 return false;
25930 expr_offset1 += MEM_OFFSET (*mem1);
25931 expr_offset2 += MEM_OFFSET (*mem2);
25933 if (known_eq (expr_offset1 + size1, expr_offset2))
25935 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
25936 *reversed = true;
25937 else
25938 return false;
25940 if (reversed)
25942 if (base2)
25944 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
25945 expr_offset1 - expr_offset2);
25946 *mem1 = replace_equiv_address_nv (*mem1, addr1);
25948 else
25950 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
25951 expr_offset2 - expr_offset1);
25952 *mem2 = replace_equiv_address_nv (*mem2, addr2);
25955 return true;
25958 return false;
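
/* Illustrative only, not part of GCC: the consecutiveness test above
   reduced to known offsets and sizes measured from a common base.  The
   example_* name is ours.  */
static bool
example_consecutive_p (long long offset1, long long size1,
		       long long offset2, long long size2,
		       bool *reversed)
{
  if (offset1 + size1 == offset2)
    {
      *reversed = false;	/* MEM2 immediately follows MEM1.  */
      return true;
    }
  if (offset2 + size2 == offset1)
    {
      *reversed = true;		/* MEM1 immediately follows MEM2.  */
      return true;
    }
  return false;
}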
25961 /* Return true if MEM1 and MEM2 can be combined into a single access
25962 of mode MODE, with the combined access having the same address as MEM1. */
25964 bool
25965 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
25967 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
25968 return false;
25969 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
25972 /* Given OPERANDS of consecutive load/store, check if we can merge
25973 them into ldp/stp. LOAD is true if they are load instructions.
25974 MODE is the mode of memory operands. */
25976 bool
25977 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
25978 machine_mode mode)
25980 enum reg_class rclass_1, rclass_2;
25981 rtx mem_1, mem_2, reg_1, reg_2;
25983 if (load)
25985 mem_1 = operands[1];
25986 mem_2 = operands[3];
25987 reg_1 = operands[0];
25988 reg_2 = operands[2];
25989 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
25990 if (REGNO (reg_1) == REGNO (reg_2))
25991 return false;
25992 if (reg_overlap_mentioned_p (reg_1, mem_2))
25993 return false;
25995 else
25997 mem_1 = operands[0];
25998 mem_2 = operands[2];
25999 reg_1 = operands[1];
26000 reg_2 = operands[3];
26003 /* The mems cannot be volatile. */
26004 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
26005 return false;
26007 /* If we have SImode and slow unaligned ldp,
26008 check that the alignment is at least 8 bytes. */
26009 if (mode == SImode
26010 && (aarch64_tune_params.extra_tuning_flags
26011 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26012 && !optimize_size
26013 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
26014 return false;
26016 /* Check if the addresses are in the form of [base+offset]. */
26017 bool reversed = false;
26018 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
26019 return false;
26021 /* The operands must be of the same size. */
26022 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
26023 GET_MODE_SIZE (GET_MODE (mem_2))));
26025 /* One of the memory accesses must be a mempair operand.
26026 If it is not the first one, they need to be swapped by the
26027 peephole. */
26028 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
26029 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
26030 return false;
26032 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
26033 rclass_1 = FP_REGS;
26034 else
26035 rclass_1 = GENERAL_REGS;
26037 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
26038 rclass_2 = FP_REGS;
26039 else
26040 rclass_2 = GENERAL_REGS;
26042 /* Check if the registers are of same class. */
26043 if (rclass_1 != rclass_2)
26044 return false;
26046 return true;
26049 /* Given OPERANDS of consecutive load/store that can be merged,
26050 swap them if they are not in ascending order. */
26051 void
26052 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
26054 int mem_op = load ? 1 : 0;
26055 bool reversed = false;
26056 if (!aarch64_check_consecutive_mems (operands + mem_op,
26057 operands + mem_op + 2, &reversed))
26058 gcc_unreachable ();
26060 if (reversed)
26062 /* Irrespective of whether this is a load or a store,
26063 we do the same swap. */
26064 std::swap (operands[0], operands[2]);
26065 std::swap (operands[1], operands[3]);
26069 /* Taking X and Y to be pointers to HOST_WIDE_INT, return the result of
26070 comparing the two. */
26072 aarch64_host_wide_int_compare (const void *x, const void *y)
26074 return wi::cmps (* ((const HOST_WIDE_INT *) x),
26075 * ((const HOST_WIDE_INT *) y));
26078 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
26079 other pointing to a REG rtx containing an offset, compare the offsets
26080 of the two pairs.
26082 Return:
26084 1 iff offset (X) > offset (Y)
26085 0 iff offset (X) == offset (Y)
26086 -1 iff offset (X) < offset (Y) */
26088 aarch64_ldrstr_offset_compare (const void *x, const void *y)
26090 const rtx * operands_1 = (const rtx *) x;
26091 const rtx * operands_2 = (const rtx *) y;
26092 rtx mem_1, mem_2, base, offset_1, offset_2;
26094 if (MEM_P (operands_1[0]))
26095 mem_1 = operands_1[0];
26096 else
26097 mem_1 = operands_1[1];
26099 if (MEM_P (operands_2[0]))
26100 mem_2 = operands_2[0];
26101 else
26102 mem_2 = operands_2[1];
26104 /* Extract the offsets. */
26105 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26106 extract_base_offset_in_addr (mem_2, &base, &offset_2);
26108 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
26110 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
26113 /* Given OPERANDS of consecutive load/store, check if we can merge
26114 them into ldp/stp by adjusting the offset. LOAD is true if they
26115 are load instructions. MODE is the mode of memory operands.
26117 Given the following consecutive stores:
26119 str w1, [xb, 0x100]
26120 str w1, [xb, 0x104]
26121 str w1, [xb, 0x108]
26122 str w1, [xb, 0x10c]
26124 Though the offsets are out of the range supported by stp, we can
26125 still pair them after adjusting the offset, like:
26127 add scratch, xb, 0x100
26128 stp w1, w1, [scratch]
26129 stp w1, w1, [scratch, 0x8]
26131 The peephole patterns detecting this opportunity should guarantee
26132 the scratch register is available. */
26134 bool
26135 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
26136 machine_mode mode)
26138 const int num_insns = 4;
26139 enum reg_class rclass;
26140 HOST_WIDE_INT offvals[num_insns], msize;
26141 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
26143 if (load)
26145 for (int i = 0; i < num_insns; i++)
26147 reg[i] = operands[2 * i];
26148 mem[i] = operands[2 * i + 1];
26150 gcc_assert (REG_P (reg[i]));
26153 /* Do not attempt to merge the loads if the loads clobber each other. */
26154 for (int i = 0; i < 8; i += 2)
26155 for (int j = i + 2; j < 8; j += 2)
26156 if (reg_overlap_mentioned_p (operands[i], operands[j]))
26157 return false;
26159 else
26160 for (int i = 0; i < num_insns; i++)
26162 mem[i] = operands[2 * i];
26163 reg[i] = operands[2 * i + 1];
26166 /* Skip if memory operand is by itself valid for ldp/stp. */
26167 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
26168 return false;
26170 for (int i = 0; i < num_insns; i++)
26172 /* The mems cannot be volatile. */
26173 if (MEM_VOLATILE_P (mem[i]))
26174 return false;
26176 /* Check if the addresses are in the form of [base+offset]. */
26177 extract_base_offset_in_addr (mem[i], base + i, offset + i);
26178 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
26179 return false;
26182 /* Check if the registers are of same class. */
26183 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
26184 ? FP_REGS : GENERAL_REGS;
26186 for (int i = 1; i < num_insns; i++)
26187 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
26189 if (rclass != FP_REGS)
26190 return false;
26192 else
26194 if (rclass != GENERAL_REGS)
26195 return false;
26198 /* Only the last register in the order in which they occur
26199 may be clobbered by the load. */
26200 if (rclass == GENERAL_REGS && load)
26201 for (int i = 0; i < num_insns - 1; i++)
26202 if (reg_mentioned_p (reg[i], mem[i]))
26203 return false;
26205 /* Check if the bases are same. */
26206 for (int i = 0; i < num_insns - 1; i++)
26207 if (!rtx_equal_p (base[i], base[i + 1]))
26208 return false;
26210 for (int i = 0; i < num_insns; i++)
26211 offvals[i] = INTVAL (offset[i]);
26213 msize = GET_MODE_SIZE (mode).to_constant ();
26215 /* Check if the offsets can be put in the right order to do a ldp/stp. */
26216 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
26217 aarch64_host_wide_int_compare);
26219 if (!(offvals[1] == offvals[0] + msize
26220 && offvals[3] == offvals[2] + msize))
26221 return false;
26223 /* Check that offsets are within range of each other. The ldp/stp
26224 instructions have 7 bit immediate offsets, so use 0x80. */
26225 if (offvals[2] - offvals[0] >= msize * 0x80)
26226 return false;
26228 /* The offsets must be aligned with respect to each other. */
26229 if (offvals[0] % msize != offvals[2] % msize)
26230 return false;
26232 /* If we have SImode and slow unaligned ldp,
26233 check that the alignment is at least 8 bytes. */
26234 if (mode == SImode
26235 && (aarch64_tune_params.extra_tuning_flags
26236 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26237 && !optimize_size
26238 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
26239 return false;
26241 return true;
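
/* Illustrative only, not part of GCC: the offset checks above, applied to
   four already-sorted offsets OFF[0..3] and an access size MSIZE.  The
   example_* name is ours.  */
static bool
example_offsets_ok_for_two_pairs (const long long off[4], long long msize)
{
  /* Each ldp/stp covers two adjacent accesses.  */
  if (off[1] != off[0] + msize || off[3] != off[2] + msize)
    return false;
  /* ldp/stp immediates are 7 bits, scaled by the access size.  */
  if (off[2] - off[0] >= msize * 0x80)
    return false;
  /* Both pairs must share the same alignment within MSIZE.  */
  return off[0] % msize == off[2] % msize;
}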
26244 /* Given OPERANDS of consecutive load/store, this function pairs them
26245 into LDP/STP after adjusting the offset. It depends on the fact
26246 that the operands can be sorted so the offsets are correct for STP.
26247 MODE is the mode of memory operands. CODE is the rtl operator
26248 which should be applied to all memory operands, it's SIGN_EXTEND,
26249 ZERO_EXTEND or UNKNOWN. */
26251 bool
26252 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
26253 machine_mode mode, RTX_CODE code)
26255 rtx base, offset_1, offset_3, t1, t2;
26256 rtx mem_1, mem_2, mem_3, mem_4;
26257 rtx temp_operands[8];
26258 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
26259 stp_off_upper_limit, stp_off_lower_limit, msize;
26261 /* We make changes on a copy as we may still bail out. */
26262 for (int i = 0; i < 8; i ++)
26263 temp_operands[i] = operands[i];
26265 /* Sort the operands. Note for cases as below:
26266 [base + 0x310] = A
26267 [base + 0x320] = B
26268 [base + 0x330] = C
26269 [base + 0x320] = D
26270 We need a stable sort, otherwise wrong data may be stored to offset 0x320.
26271 Also note that the dead store in the above case should be optimized away,
26272 but there are no guarantees here. */
26273 gcc_stablesort(temp_operands, 4, 2 * sizeof (rtx *),
26274 aarch64_ldrstr_offset_compare);
26276 /* Copy the memory operands so that if we have to bail for some
26277 reason the original addresses are unchanged. */
26278 if (load)
26280 mem_1 = copy_rtx (temp_operands[1]);
26281 mem_2 = copy_rtx (temp_operands[3]);
26282 mem_3 = copy_rtx (temp_operands[5]);
26283 mem_4 = copy_rtx (temp_operands[7]);
26285 else
26287 mem_1 = copy_rtx (temp_operands[0]);
26288 mem_2 = copy_rtx (temp_operands[2]);
26289 mem_3 = copy_rtx (temp_operands[4]);
26290 mem_4 = copy_rtx (temp_operands[6]);
26291 gcc_assert (code == UNKNOWN);
26294 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26295 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26296 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26297 && offset_3 != NULL_RTX);
26299 /* Adjust offset so it can fit in LDP/STP instruction. */
26300 msize = GET_MODE_SIZE (mode).to_constant();
26301 stp_off_upper_limit = msize * (0x40 - 1);
26302 stp_off_lower_limit = - msize * 0x40;
26304 off_val_1 = INTVAL (offset_1);
26305 off_val_3 = INTVAL (offset_3);
26307 /* The base offset is optimally half way between the two STP/LDP offsets. */
26308 if (msize <= 4)
26309 base_off = (off_val_1 + off_val_3) / 2;
26310 else
26311 /* However, due to issues with negative LDP/STP offset generation for
26312 larger modes (DF, DD, DI and vector modes), we must not use negative
26313 addresses smaller than 9 signed unadjusted bits can store. This
26314 provides the most range in this case. */
26315 base_off = off_val_1;
26317 /* Adjust the base so that it is aligned with the addresses but still
26318 optimal. */
26319 if (base_off % msize != off_val_1 % msize)
26320 /* Fix the offset, bearing in mind we want to make it bigger not
26321 smaller. */
26322 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26323 else if (msize <= 4)
26324 /* The negative range of LDP/STP is one larger than the positive range. */
26325 base_off += msize;
26327 /* Check if base offset is too big or too small. We can attempt to resolve
26328 this issue by setting it to the maximum value and seeing if the offsets
26329 still fit. */
26330 if (base_off >= 0x1000)
26332 base_off = 0x1000 - 1;
26333 /* We must still make sure that the base offset is aligned with respect
26334 to the address. But it may not be made any bigger. */
26335 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26338 /* Likewise for the case where the base is too small. */
26339 if (base_off <= -0x1000)
26341 base_off = -0x1000 + 1;
26342 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26345 /* Offset of the first STP/LDP. */
26346 new_off_1 = off_val_1 - base_off;
26348 /* Offset of the second STP/LDP. */
26349 new_off_3 = off_val_3 - base_off;
26351 /* The offsets must be within the range of the LDP/STP instructions. */
26352 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26353 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26354 return false;
26356 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26357 new_off_1), true);
26358 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26359 new_off_1 + msize), true);
26360 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26361 new_off_3), true);
26362 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26363 new_off_3 + msize), true);
26365 if (!aarch64_mem_pair_operand (mem_1, mode)
26366 || !aarch64_mem_pair_operand (mem_3, mode))
26367 return false;
26369 if (code == ZERO_EXTEND)
26371 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26372 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26373 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26374 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26376 else if (code == SIGN_EXTEND)
26378 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26379 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26380 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26381 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26384 if (load)
26386 operands[0] = temp_operands[0];
26387 operands[1] = mem_1;
26388 operands[2] = temp_operands[2];
26389 operands[3] = mem_2;
26390 operands[4] = temp_operands[4];
26391 operands[5] = mem_3;
26392 operands[6] = temp_operands[6];
26393 operands[7] = mem_4;
26395 else
26397 operands[0] = mem_1;
26398 operands[1] = temp_operands[1];
26399 operands[2] = mem_2;
26400 operands[3] = temp_operands[3];
26401 operands[4] = mem_3;
26402 operands[5] = temp_operands[5];
26403 operands[6] = mem_4;
26404 operands[7] = temp_operands[7];
26407 /* Emit adjusting instruction. */
26408 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26409 /* Emit ldp/stp instructions. */
26410 t1 = gen_rtx_SET (operands[0], operands[1]);
26411 t2 = gen_rtx_SET (operands[2], operands[3]);
26412 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26413 t1 = gen_rtx_SET (operands[4], operands[5]);
26414 t2 = gen_rtx_SET (operands[6], operands[7]);
26415 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26416 return true;
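
/* Illustrative only, not part of GCC: the re-basing arithmetic above on
   plain integers.  Returns the chosen base offset, or -1 when the rebased
   offsets cannot fit the signed, size-scaled 7-bit ldp/stp range (the real
   code also re-aligns and clamps the base before giving up).  The
   example_* name is ours.  */
static long long
example_rebase_for_ldpstp (long long off1, long long off3, long long msize)
{
  long long upper = msize * (0x40 - 1);
  long long lower = -msize * 0x40;

  /* Half way between the two pairs for small elements, otherwise anchor
     on the first pair to keep the new offsets non-negative.  */
  long long base_off = msize <= 4 ? (off1 + off3) / 2 : off1;

  long long new_off1 = off1 - base_off;
  long long new_off3 = off3 - base_off;
  if (new_off1 > upper || new_off1 < lower
      || new_off3 > upper || new_off3 < lower)
    return -1;
  return base_off;
}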
26419 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26420 it isn't worth branching around empty masked ops (including masked
26421 stores). */
26423 static bool
26424 aarch64_empty_mask_is_expensive (unsigned)
26426 return false;
26429 /* Return 1 if pseudo register should be created and used to hold
26430 GOT address for PIC code. */
26432 bool
26433 aarch64_use_pseudo_pic_reg (void)
26435 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26438 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26440 static int
26441 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26443 switch (XINT (x, 1))
26445 case UNSPEC_GOTSMALLPIC:
26446 case UNSPEC_GOTSMALLPIC28K:
26447 case UNSPEC_GOTTINYPIC:
26448 return 0;
26449 default:
26450 break;
26453 return default_unspec_may_trap_p (x, flags);
26457 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
26458 return the log2 of that value. Otherwise return -1. */
26461 aarch64_fpconst_pow_of_2 (rtx x)
26463 const REAL_VALUE_TYPE *r;
26465 if (!CONST_DOUBLE_P (x))
26466 return -1;
26468 r = CONST_DOUBLE_REAL_VALUE (x);
26470 if (REAL_VALUE_NEGATIVE (*r)
26471 || REAL_VALUE_ISNAN (*r)
26472 || REAL_VALUE_ISINF (*r)
26473 || !real_isinteger (r, DFmode))
26474 return -1;
26476 return exact_log2 (real_to_integer (r));
26479 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26480 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x==(1/2^n)
26481 return n. Otherwise return -1. */
26484 aarch64_fpconst_pow2_recip (rtx x)
26486 REAL_VALUE_TYPE r0;
26488 if (!CONST_DOUBLE_P (x))
26489 return -1;
26491 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26492 if (exact_real_inverse (DFmode, &r0)
26493 && !REAL_VALUE_NEGATIVE (r0))
26495 int ret = exact_log2 (real_to_integer (&r0));
26496 if (ret >= 1 && ret <= 32)
26497 return ret;
26499 return -1;
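
/* Illustrative only, not part of GCC: the reciprocal-power-of-two test on a
   plain double.  Returns N when X == 1/2^N for 1 <= N <= 32, else -1.
   Multiplying a double by a power of two only changes its exponent, so the
   equality check is exact.  The example_* name is ours.  */
static int
example_recip_pow2 (double x)
{
  double pow2 = 2.0;
  for (int n = 1; n <= 32; n++, pow2 *= 2.0)
    if (x * pow2 == 1.0)
      return n;
  return -1;
}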
26502 /* If X is a vector of equal CONST_DOUBLE values and that value is
26503 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26506 aarch64_vec_fpconst_pow_of_2 (rtx x)
26508 int nelts;
26509 if (!CONST_VECTOR_P (x)
26510 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26511 return -1;
26513 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26514 return -1;
26516 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26517 if (firstval <= 0)
26518 return -1;
26520 for (int i = 1; i < nelts; i++)
26521 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26522 return -1;
26524 return firstval;
26527 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26528 to float.
26530 __fp16 always promotes through this hook.
26531 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26532 through the generic excess precision logic rather than here. */
26534 static tree
26535 aarch64_promoted_type (const_tree t)
26537 if (SCALAR_FLOAT_TYPE_P (t)
26538 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26539 return float_type_node;
26541 return NULL_TREE;
26544 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26546 static bool
26547 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26548 optimization_type opt_type)
26550 switch (op)
26552 case rsqrt_optab:
26553 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26555 default:
26556 return true;
26560 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26562 static unsigned int
26563 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26564 int *offset)
26566 /* Polynomial invariant 1 == (VG / 2) - 1. */
26567 gcc_assert (i == 1);
26568 *factor = 2;
26569 *offset = 1;
26570 return AARCH64_DWARF_VG;
26573 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
26574 if MODE is HFmode, and punt to the generic implementation otherwise. */
26576 static bool
26577 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26579 return (mode == HFmode
26580 ? true
26581 : default_libgcc_floating_mode_supported_p (mode));
26584 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26585 if MODE is HFmode, and punt to the generic implementation otherwise. */
26587 static bool
26588 aarch64_scalar_mode_supported_p (scalar_mode mode)
26590 if (DECIMAL_FLOAT_MODE_P (mode))
26591 return default_decimal_float_supported_p ();
26593 return (mode == HFmode
26594 ? true
26595 : default_scalar_mode_supported_p (mode));
26598 /* Set the value of FLT_EVAL_METHOD.
26599 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26601 0: evaluate all operations and constants, whose semantic type has at
26602 most the range and precision of type float, to the range and
26603 precision of float; evaluate all other operations and constants to
26604 the range and precision of the semantic type;
26606 N, where _FloatN is a supported interchange floating type
26607 evaluate all operations and constants, whose semantic type has at
26608 most the range and precision of _FloatN type, to the range and
26609 precision of the _FloatN type; evaluate all other operations and
26610 constants to the range and precision of the semantic type;
26612 If we have the ARMv8.2-A extensions then we support _Float16 in native
26613 precision, so we should set this to 16. Otherwise, we support the type,
26614 but want to evaluate expressions in float precision, so set this to
26615 0. */
26617 static enum flt_eval_method
26618 aarch64_excess_precision (enum excess_precision_type type)
26620 switch (type)
26622 case EXCESS_PRECISION_TYPE_FAST:
26623 case EXCESS_PRECISION_TYPE_STANDARD:
26624 /* We can calculate either in 16-bit range and precision or
26625 32-bit range and precision. Make that decision based on whether
26626 we have native support for the ARMv8.2-A 16-bit floating-point
26627 instructions or not. */
26628 return (TARGET_FP_F16INST
26629 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26630 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26631 case EXCESS_PRECISION_TYPE_IMPLICIT:
26632 case EXCESS_PRECISION_TYPE_FLOAT16:
26633 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26634 default:
26635 gcc_unreachable ();
26637 return FLT_EVAL_METHOD_UNPREDICTABLE;
26640 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26641 scheduled for speculative execution. Reject the long-running division
26642 and square-root instructions. */
26644 static bool
26645 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26647 switch (get_attr_type (insn))
26649 case TYPE_SDIV:
26650 case TYPE_UDIV:
26651 case TYPE_FDIVS:
26652 case TYPE_FDIVD:
26653 case TYPE_FSQRTS:
26654 case TYPE_FSQRTD:
26655 case TYPE_NEON_FP_SQRT_S:
26656 case TYPE_NEON_FP_SQRT_D:
26657 case TYPE_NEON_FP_SQRT_S_Q:
26658 case TYPE_NEON_FP_SQRT_D_Q:
26659 case TYPE_NEON_FP_DIV_S:
26660 case TYPE_NEON_FP_DIV_D:
26661 case TYPE_NEON_FP_DIV_S_Q:
26662 case TYPE_NEON_FP_DIV_D_Q:
26663 return false;
26664 default:
26665 return true;
26669 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26671 static int
26672 aarch64_compute_pressure_classes (reg_class *classes)
26674 int i = 0;
26675 classes[i++] = GENERAL_REGS;
26676 classes[i++] = FP_REGS;
26677 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26678 registers need to go in PR_LO_REGS at some point during their
26679 lifetime. Splitting it into two halves has the effect of making
26680 all predicates count against PR_LO_REGS, so that we try whenever
26681 possible to restrict the number of live predicates to 8. This
26682 greatly reduces the amount of spilling in certain loops. */
26683 classes[i++] = PR_LO_REGS;
26684 classes[i++] = PR_HI_REGS;
26685 return i;
26688 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26690 static bool
26691 aarch64_can_change_mode_class (machine_mode from,
26692 machine_mode to, reg_class_t)
26694 unsigned int from_flags = aarch64_classify_vector_mode (from);
26695 unsigned int to_flags = aarch64_classify_vector_mode (to);
26697 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26698 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26700 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26701 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26703 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26704 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26706 bool from_full_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT));
26707 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26708 | VEC_PARTIAL));
26710 /* Don't allow changes between predicate modes and other modes.
26711 Only predicate registers can hold predicate modes and only
26712 non-predicate registers can hold non-predicate modes, so any
26713 attempt to mix them would require a round trip through memory. */
26714 if (from_pred_p != to_pred_p)
26715 return false;
26717 /* Don't allow changes between partial SVE modes and other modes.
26718 The contents of partial SVE modes are distributed evenly across
26719 the register, whereas GCC expects them to be clustered together. */
26720 if (from_partial_sve_p != to_partial_sve_p)
26721 return false;
26723 /* Similarly reject changes between partial SVE modes that have
26724 different patterns of significant and insignificant bits. */
26725 if (from_partial_sve_p
26726 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26727 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26728 return false;
26730 /* Don't allow changes between partial and full Advanced SIMD structure
26731 modes. */
26732 if (from_full_advsimd_struct_p && to_partial_advsimd_struct_p)
26733 return false;
26735 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26737 /* Don't allow changes between SVE modes and other modes that might
26738 be bigger than 128 bits. In particular, OImode, CImode and XImode
26739 divide into 128-bit quantities while SVE modes divide into
26740 BITS_PER_SVE_VECTOR quantities. */
26741 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26742 return false;
26743 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26744 return false;
26747 if (BYTES_BIG_ENDIAN)
26749 /* Don't allow changes between SVE data modes and non-SVE modes.
26750 See the comment at the head of aarch64-sve.md for details. */
26751 if (from_sve_p != to_sve_p)
26752 return false;
26754 /* Don't allow changes in element size: lane 0 of the new vector
26755 would not then be lane 0 of the old vector. See the comment
26756 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26757 description.
26759 In the worst case, this forces a register to be spilled in
26760 one mode and reloaded in the other, which handles the
26761 endianness correctly. */
26762 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26763 return false;
26765 return true;
26768 /* Implement TARGET_EARLY_REMAT_MODES. */
26770 static void
26771 aarch64_select_early_remat_modes (sbitmap modes)
26773 /* SVE values are not normally live across a call, so it should be
26774 worth doing early rematerialization even in VL-specific mode. */
26775 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26776 if (aarch64_sve_mode_p ((machine_mode) i))
26777 bitmap_set_bit (modes, i);
26780 /* Override the default target speculation_safe_value. */
26781 static rtx
26782 aarch64_speculation_safe_value (machine_mode mode,
26783 rtx result, rtx val, rtx failval)
26785 /* Maybe we should warn if falling back to hard barriers. They are
26786 likely to be noticeably more expensive than the alternative below. */
26787 if (!aarch64_track_speculation)
26788 return default_speculation_safe_value (mode, result, val, failval);
26790 if (!REG_P (val))
26791 val = copy_to_mode_reg (mode, val);
26793 if (!aarch64_reg_or_zero (failval, mode))
26794 failval = copy_to_mode_reg (mode, failval);
26796 emit_insn (gen_despeculate_copy (mode, result, val, failval));
26797 return result;
26800 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26801 Look into the tuning structure for an estimate.
26802 KIND specifies the type of requested estimate: min, max or likely.
26803 For cores with a known SVE width all three estimates are the same.
26804 For generic SVE tuning we want to distinguish the maximum estimate from
26805 the minimum and likely ones.
26806 In that case the likely estimate is the same as the minimum, giving the
26807 conservative behavior of auto-vectorizing with SVE only when it is a win
26808 even for 128-bit SVE.
26809 When SVE width information is available VAL.coeffs[1] is multiplied by
26810 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
26812 static HOST_WIDE_INT
26813 aarch64_estimated_poly_value (poly_int64 val,
26814 poly_value_estimate_kind kind
26815 = POLY_VALUE_LIKELY)
26817 unsigned int width_source = aarch64_tune_params.sve_width;
26819 /* If there is no core-specific information then the minimum and likely
26820 values are based on 128-bit vectors and the maximum is based on
26821 the architectural maximum of 2048 bits. */
26822 if (width_source == SVE_SCALABLE)
26823 switch (kind)
26825 case POLY_VALUE_MIN:
26826 case POLY_VALUE_LIKELY:
26827 return val.coeffs[0];
26828 case POLY_VALUE_MAX:
26829 return val.coeffs[0] + val.coeffs[1] * 15;
26832 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26833 as likely. This could be made more general if future -mtune options
26834 need it to be. */
26835 if (kind == POLY_VALUE_MAX)
26836 width_source = 1 << floor_log2 (width_source);
26837 else
26838 width_source = least_bit_hwi (width_source);
26840 /* If the core provides width information, use that. */
26841 HOST_WIDE_INT over_128 = width_source - 128;
26842 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
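
/* Illustrative only, not part of GCC: the estimate above for a poly value
   C0 + C1 * x, where x counts the 128-bit quanta beyond the first and the
   SVE register width in bits is known.  The example_* name is ours.  */
static long long
example_estimate_poly (long long c0, long long c1, int sve_width_bits)
{
  return c0 + c1 * ((sve_width_bits - 128) / 128);
}

/* E.g. the number of bytes in an SVE vector is 16 + 16x, so a 256-bit
   implementation gives example_estimate_poly (16, 16, 256) == 32.  */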
26846 /* Return true for types that could be supported as SIMD return or
26847 argument types. */
26849 static bool
26850 supported_simd_type (tree t)
26852 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
26854 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
26855 return s == 1 || s == 2 || s == 4 || s == 8;
26857 return false;
26860 /* Return true for types that currently are supported as SIMD return
26861 or argument types. */
26863 static bool
26864 currently_supported_simd_type (tree t, tree b)
26866 if (COMPLEX_FLOAT_TYPE_P (t))
26867 return false;
26869 if (TYPE_SIZE (t) != TYPE_SIZE (b))
26870 return false;
26872 return supported_simd_type (t);
26875 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
26877 static int
26878 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
26879 struct cgraph_simd_clone *clonei,
26880 tree base_type, int num,
26881 bool explicit_p)
26883 tree t, ret_type;
26884 unsigned int elt_bits, count;
26885 unsigned HOST_WIDE_INT const_simdlen;
26886 poly_uint64 vec_bits;
26888 if (!TARGET_SIMD)
26889 return 0;
26891 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26892 constant simdlens here. */
26893 if (maybe_ne (clonei->simdlen, 0U)
26894 && clonei->simdlen.is_constant (&const_simdlen)
26895 && (const_simdlen < 2
26896 || const_simdlen > 1024
26897 || (const_simdlen & (const_simdlen - 1)) != 0))
26899 if (explicit_p)
26900 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26901 "unsupported simdlen %wd", const_simdlen);
26902 return 0;
26905 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
26906 if (TREE_CODE (ret_type) != VOID_TYPE
26907 && !currently_supported_simd_type (ret_type, base_type))
26909 if (!explicit_p)
26911 else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
26912 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26913 "GCC does not currently support mixed size types "
26914 "for %<simd%> functions");
26915 else if (supported_simd_type (ret_type))
26916 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26917 "GCC does not currently support return type %qT "
26918 "for %<simd%> functions", ret_type);
26919 else
26920 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26921 "unsupported return type %qT for %<simd%> functions",
26922 ret_type);
26923 return 0;
26926 int i;
26927 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
26928 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
26930 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
26931 t && t != void_list_node; t = TREE_CHAIN (t), i++)
26933 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
26935 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
26936 && !currently_supported_simd_type (arg_type, base_type))
26938 if (!explicit_p)
26940 else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
26941 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26942 "GCC does not currently support mixed size types "
26943 "for %<simd%> functions");
26944 else
26945 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26946 "GCC does not currently support argument type %qT "
26947 "for %<simd%> functions", arg_type);
26948 return 0;
26952 clonei->vecsize_mangle = 'n';
26953 clonei->mask_mode = VOIDmode;
26954 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
26955 if (known_eq (clonei->simdlen, 0U))
26957 count = 2;
26958 vec_bits = (num == 0 ? 64 : 128);
26959 clonei->simdlen = exact_div (vec_bits, elt_bits);
26961 else
26963 count = 1;
26964 vec_bits = clonei->simdlen * elt_bits;
26965 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26966 constant simdlens here. */
26967 if (clonei->simdlen.is_constant (&const_simdlen)
26968 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
26970 if (explicit_p)
26971 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26972 "GCC does not currently support simdlen %wd for "
26973 "type %qT",
26974 const_simdlen, base_type);
26975 return 0;
26978 clonei->vecsize_int = vec_bits;
26979 clonei->vecsize_float = vec_bits;
26980 return count;
26983 /* Implement TARGET_SIMD_CLONE_ADJUST. */
26985 static void
26986 aarch64_simd_clone_adjust (struct cgraph_node *node)
26988 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
26989 use the correct ABI. */
26991 tree t = TREE_TYPE (node->decl);
26992 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
26993 TYPE_ATTRIBUTES (t));
26996 /* Implement TARGET_SIMD_CLONE_USABLE. */
26998 static int
26999 aarch64_simd_clone_usable (struct cgraph_node *node)
27001 switch (node->simdclone->vecsize_mangle)
27003 case 'n':
27004 if (!TARGET_SIMD)
27005 return -1;
27006 return 0;
27007 default:
27008 gcc_unreachable ();
27012 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
27014 static int
27015 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
27017 auto check_attr = [&](const char *name) {
27018 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
27019 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
27020 if (!attr1 && !attr2)
27021 return true;
27023 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
27026 if (!check_attr ("aarch64_vector_pcs"))
27027 return 0;
27028 if (!check_attr ("Advanced SIMD type"))
27029 return 0;
27030 if (!check_attr ("SVE type"))
27031 return 0;
27032 if (!check_attr ("SVE sizeless type"))
27033 return 0;
27034 return 1;
27037 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
27039 static const char *
27040 aarch64_get_multilib_abi_name (void)
27042 if (TARGET_BIG_END)
27043 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
27044 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
27047 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
27048 global variable based guard use the default else
27049 return a null tree. */
27050 static tree
27051 aarch64_stack_protect_guard (void)
27053 if (aarch64_stack_protector_guard == SSP_GLOBAL)
27054 return default_stack_protect_guard ();
27056 return NULL_TREE;
27059 /* Return the diagnostic message string if conversion from FROMTYPE to
27060 TOTYPE is not allowed, NULL otherwise. */
27062 static const char *
27063 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
27065 if (element_mode (fromtype) != element_mode (totype))
27067 /* Do not allow conversions to/from BFmode scalar types. */
27068 if (TYPE_MODE (fromtype) == BFmode)
27069 return N_("invalid conversion from type %<bfloat16_t%>");
27070 if (TYPE_MODE (totype) == BFmode)
27071 return N_("invalid conversion to type %<bfloat16_t%>");
27074 /* Conversion allowed. */
27075 return NULL;
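/* For example, a conversion between __bf16 (bfloat16_t) and float in either
   direction is rejected with one of the messages above; conversions where
   neither side has BFmode, or where the element modes already match, are
   left alone.  */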
27078 /* Return the diagnostic message string if the unary operation OP is
27079 not permitted on TYPE, NULL otherwise. */
27081 static const char *
27082 aarch64_invalid_unary_op (int op, const_tree type)
27084 /* Reject all single-operand operations on BFmode except for &. */
27085 if (element_mode (type) == BFmode && op != ADDR_EXPR)
27086 return N_("operation not permitted on type %<bfloat16_t%>");
27088 /* Operation allowed. */
27089 return NULL;
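/* For example, taking the address of a bfloat16_t object is allowed
   (ADDR_EXPR), while unary minus on it is rejected.  */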
27092 /* Return the diagnostic message string if the binary operation OP is
27093 not permitted on TYPE1 and TYPE2, NULL otherwise. */
27095 static const char *
27096 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
27097 const_tree type2)
27099 /* Reject all 2-operand operations on BFmode. */
27100 if (element_mode (type1) == BFmode
27101 || element_mode (type2) == BFmode)
27102 return N_("operation not permitted on type %<bfloat16_t%>");
27104 if (VECTOR_TYPE_P (type1)
27105 && VECTOR_TYPE_P (type2)
27106 && !TYPE_INDIVISIBLE_P (type1)
27107 && !TYPE_INDIVISIBLE_P (type2)
27108 && (aarch64_sve::builtin_type_p (type1)
27109 != aarch64_sve::builtin_type_p (type2)))
27110 return N_("cannot combine GNU and SVE vectors in a binary operation");
27112 /* Operation allowed. */
27113 return NULL;
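/* For example, any binary arithmetic on bfloat16_t operands is rejected with
   the first message above, and an expression that mixes a GNU vector
   (declared with __attribute__((vector_size))) with an SVE ACLE vector such
   as svint32_t (e.g. under -msve-vector-bits=) is rejected with the
   second.  */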
27116 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
27117 compiler that we automatically ignore the top byte of our pointers, which
27118 allows using -fsanitize=hwaddress. */
27119 bool
27120 aarch64_can_tag_addresses ()
27122 return !TARGET_ILP32;
27125 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
27126 section at the end if needed. */
27127 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
27128 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
27129 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
27130 void
27131 aarch64_file_end_indicate_exec_stack ()
27133 file_end_indicate_exec_stack ();
27135 unsigned feature_1_and = 0;
27136 if (aarch64_bti_enabled ())
27137 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
27139 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
27140 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
27142 if (feature_1_and)
27144 /* Generate .note.gnu.property section. */
27145 switch_to_section (get_section (".note.gnu.property",
27146 SECTION_NOTYPE, NULL));
27148 /* PT_NOTE header: namesz, descsz, type.
27149 namesz = 4 ("GNU\0")
27150 descsz = 16 (Size of the program property array)
27151 [(12 + padding) * Number of array elements]
27152 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
27153 assemble_align (POINTER_SIZE);
27154 assemble_integer (GEN_INT (4), 4, 32, 1);
27155 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
27156 assemble_integer (GEN_INT (5), 4, 32, 1);
27158 /* PT_NOTE name. */
27159 assemble_string ("GNU", 4);
27161 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
27162 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
27163 datasz = 4
27164 data = feature_1_and. */
27165 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
27166 assemble_integer (GEN_INT (4), 4, 32, 1);
27167 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
27169 /* Pad the size of the note to the required alignment. */
27170 assemble_align (POINTER_SIZE);
27173 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
27174 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
27175 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
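/* With both BTI and PAC-RET enabled, the note emitted by
   aarch64_file_end_indicate_exec_stack above looks roughly like:

	.section .note.gnu.property
	.p2align 3
	.word 4			// namesz ("GNU\0")
	.word 16		// descsz = ROUND_UP (12, POINTER_BYTES)
	.word 5			// NT_GNU_PROPERTY_TYPE_0
	.string "GNU"
	.word 0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word 4			// datasz
	.word 3			// BTI | PAC
	.p2align 3  */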
27177 /* Helper function for straight line speculation.
27178 Return what barrier should be emitted for straight line speculation
27179 mitigation.
27180 When not mitigating against straight line speculation this function returns
27181 an empty string.
27182 When mitigating against straight line speculation, use:
27183 * SB when the v8.5-A SB extension is enabled.
27184 * DSB+ISB otherwise. */
27185 const char *
27186 aarch64_sls_barrier (int mitigation_required)
27188 return mitigation_required
27189 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
27190 : "";
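/* For example, under -mharden-sls=retbr a "ret" is followed by "sb" when the
   SB extension is available, or by "dsb sy; isb" otherwise; when no
   mitigation is requested nothing extra is emitted.  */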
27193 static GTY (()) tree aarch64_sls_shared_thunks[30];
27194 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
27195 const char *indirect_symbol_names[30] = {
27196 "__call_indirect_x0",
27197 "__call_indirect_x1",
27198 "__call_indirect_x2",
27199 "__call_indirect_x3",
27200 "__call_indirect_x4",
27201 "__call_indirect_x5",
27202 "__call_indirect_x6",
27203 "__call_indirect_x7",
27204 "__call_indirect_x8",
27205 "__call_indirect_x9",
27206 "__call_indirect_x10",
27207 "__call_indirect_x11",
27208 "__call_indirect_x12",
27209 "__call_indirect_x13",
27210 "__call_indirect_x14",
27211 "__call_indirect_x15",
27212 "", /* "__call_indirect_x16", */
27213 "", /* "__call_indirect_x17", */
27214 "__call_indirect_x18",
27215 "__call_indirect_x19",
27216 "__call_indirect_x20",
27217 "__call_indirect_x21",
27218 "__call_indirect_x22",
27219 "__call_indirect_x23",
27220 "__call_indirect_x24",
27221 "__call_indirect_x25",
27222 "__call_indirect_x26",
27223 "__call_indirect_x27",
27224 "__call_indirect_x28",
27225 "__call_indirect_x29",
27228 /* Function to create a BLR thunk. This thunk is used to mitigate straight
27229 line speculation. Instead of a simple BLR that can be speculated past,
27230 we emit a BL to this thunk, and this thunk contains a BR to the relevant
27231 register. These thunks have the relevant speculation barriers put after
27232 their indirect branch so that speculation is blocked.
27234 We use such a thunk so the speculation barriers are kept off the
27235 architecturally executed path in order to reduce the performance overhead.
27237 When optimizing for size we use stubs shared by the linked object.
27238 When optimizing for performance we emit stubs for each function in the hope
27239 that the branch predictor can better train on jumps specific for a given
27240 function. */
27242 aarch64_sls_create_blr_label (int regnum)
27244 gcc_assert (STUB_REGNUM_P (regnum));
27245 if (optimize_function_for_size_p (cfun))
27247 /* For the thunks shared between different functions in this compilation
27248 unit we use a named symbol -- this is just for users to more easily
27249 understand the generated assembly. */
27250 aarch64_sls_shared_thunks_needed = true;
27251 const char *thunk_name = indirect_symbol_names[regnum];
27252 if (aarch64_sls_shared_thunks[regnum] == NULL)
27254 /* Build a decl representing this function stub and record it for
27255 later. We build a decl here so we can use the GCC machinery for
27256 handling sections automatically (through `get_named_section` and
27257 `make_decl_one_only`). That saves us a lot of trouble handling
27258 the specifics of different output file formats. */
27259 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
27260 get_identifier (thunk_name),
27261 build_function_type_list (void_type_node,
27262 NULL_TREE));
27263 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
27264 NULL_TREE, void_type_node);
27265 TREE_PUBLIC (decl) = 1;
27266 TREE_STATIC (decl) = 1;
27267 DECL_IGNORED_P (decl) = 1;
27268 DECL_ARTIFICIAL (decl) = 1;
27269 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27270 resolve_unique_section (decl, 0, false);
27271 aarch64_sls_shared_thunks[regnum] = decl;
27274 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27277 if (cfun->machine->call_via[regnum] == NULL)
27278 cfun->machine->call_via[regnum]
27279 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27280 return cfun->machine->call_via[regnum];
27283 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27284 aarch64_sls_emit_shared_blr_thunks below. */
27285 static void
27286 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27288 /* Move the target address into x16 and branch through x16, so this
27289 transformation does not prevent jumping to `BTI c` instructions. */
27290 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27291 asm_fprintf (out_file, "\tbr\tx16\n");
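/* The stub emitted for, say, x1 is therefore just:
	mov	x16, x1
	br	x16
   with a speculation barrier added after it by the callers below.  */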
27294 /* Emit all BLR stubs for this particular function.
27295 Here we emit all the BLR stubs needed for the current function. Since we
27296 emit these stubs in a consecutive block we know there will be no speculation
27297 gadgets between each stub, and hence we only emit a speculation barrier at
27298 the end of the stub sequences.
27300 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27301 void
27302 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27304 if (! aarch64_harden_sls_blr_p ())
27305 return;
27307 bool any_functions_emitted = false;
27308 /* We must save and restore the current function section since this assembly
27309 is emitted at the end of the function. This means it can be emitted *just
27310 after* the cold section of a function. That cold part would be emitted in
27311 a different section. That switch would trigger a `.cfi_endproc` directive
27312 to be emitted in the original section and a `.cfi_startproc` directive to
27313 be emitted in the new section. Switching to the original section without
27314 restoring would mean that the `.cfi_endproc` emitted as a function ends
27315 would happen in a different section -- leaving an unmatched
27316 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27317 in the standard text section. */
27318 section *save_text_section = in_section;
27319 switch_to_section (function_section (current_function_decl));
27320 for (int regnum = 0; regnum < 30; ++regnum)
27322 rtx specu_label = cfun->machine->call_via[regnum];
27323 if (specu_label == NULL)
27324 continue;
27326 targetm.asm_out.print_operand (out_file, specu_label, 0);
27327 asm_fprintf (out_file, ":\n");
27328 aarch64_sls_emit_function_stub (out_file, regnum);
27329 any_functions_emitted = true;
27331 if (any_functions_emitted)
27332 /* The SB barrier can be used here if need be, since this stub will only
27333 be used by the current function, and hence for the current target. */
27334 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27335 switch_to_section (save_text_section);
27338 /* Emit shared BLR stubs for the current compilation unit.
27339 Over the course of compiling this unit we may have converted some BLR
27340 instructions to a BL to a shared stub function. This is where we emit those
27341 stub functions.
27342 This function is for the stubs shared between different functions in this
27343 compilation unit. We share when optimizing for size instead of speed.
27345 This function is called through the TARGET_ASM_FILE_END hook. */
27346 void
27347 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27349 if (! aarch64_sls_shared_thunks_needed)
27350 return;
27352 for (int regnum = 0; regnum < 30; ++regnum)
27354 tree decl = aarch64_sls_shared_thunks[regnum];
27355 if (!decl)
27356 continue;
27358 const char *name = indirect_symbol_names[regnum];
27359 switch_to_section (get_named_section (decl, NULL, 0));
27360 ASM_OUTPUT_ALIGN (out_file, 2);
27361 targetm.asm_out.globalize_label (out_file, name);
27362 /* Only emits if the compiler is configured for an assembler that can
27363 handle visibility directives. */
27364 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27365 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27366 ASM_OUTPUT_LABEL (out_file, name);
27367 aarch64_sls_emit_function_stub (out_file, regnum);
27368 /* Use the most conservative target to ensure it can always be used by any
27369 function in the translation unit. */
27370 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27371 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27375 /* Implement TARGET_ASM_FILE_END. */
27376 void
27377 aarch64_asm_file_end ()
27379 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27380 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27381 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27382 for FreeBSD) still gets called. */
27383 #ifdef TARGET_ASM_FILE_END
27384 TARGET_ASM_FILE_END ();
27385 #endif
27388 const char *
27389 aarch64_indirect_call_asm (rtx addr)
27391 gcc_assert (REG_P (addr));
27392 if (aarch64_harden_sls_blr_p ())
27394 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27395 output_asm_insn ("bl\t%0", &stub_label);
27397 else
27398 output_asm_insn ("blr\t%0", &addr);
27399 return "";
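/* For example, with -mharden-sls=blr (or =all) a call through x1 is emitted
   as "bl __call_indirect_x1" when optimizing for size (or as a BL to a
   per-function local label otherwise), instead of a plain "blr x1".  */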
27402 /* Target-specific selftests. */
27404 #if CHECKING_P
27406 namespace selftest {
27408 /* Selftest for the RTL loader.
27409 Verify that the RTL loader copes with a dump from
27410 print_rtx_function. This is essentially just a test that class
27411 function_reader can handle a real dump, but it also verifies
27412 that lookup_reg_by_dump_name correctly handles hard regs.
27413 The presence of hard reg names in the dump means that the test is
27414 target-specific, hence it is in this file. */
27416 static void
27417 aarch64_test_loading_full_dump ()
27419 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27421 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27423 rtx_insn *insn_1 = get_insn_by_uid (1);
27424 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27426 rtx_insn *insn_15 = get_insn_by_uid (15);
27427 ASSERT_EQ (INSN, GET_CODE (insn_15));
27428 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27430 /* Verify crtl->return_rtx. */
27431 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27432 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27433 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27436 /* Test the fractional_cost class. */
27438 static void
27439 aarch64_test_fractional_cost ()
27441 using cf = fractional_cost;
27443 ASSERT_EQ (cf (0, 20), 0);
27445 ASSERT_EQ (cf (4, 2), 2);
27446 ASSERT_EQ (3, cf (9, 3));
27448 ASSERT_NE (cf (5, 2), 2);
27449 ASSERT_NE (3, cf (8, 3));
27451 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27452 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27453 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27455 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27456 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27457 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27458 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27459 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27460 ASSERT_EQ (3 - cf (10, 3), 0);
27462 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27463 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27465 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27466 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27467 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27468 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27469 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27470 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27471 ASSERT_TRUE (cf (239, 240) < 1);
27472 ASSERT_FALSE (cf (240, 240) < 1);
27473 ASSERT_FALSE (cf (241, 240) < 1);
27474 ASSERT_FALSE (2 < cf (207, 104));
27475 ASSERT_FALSE (2 < cf (208, 104));
27476 ASSERT_TRUE (2 < cf (209, 104));
27478 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27479 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27480 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27481 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27482 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27483 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27484 ASSERT_TRUE (cf (239, 240) <= 1);
27485 ASSERT_TRUE (cf (240, 240) <= 1);
27486 ASSERT_FALSE (cf (241, 240) <= 1);
27487 ASSERT_FALSE (2 <= cf (207, 104));
27488 ASSERT_TRUE (2 <= cf (208, 104));
27489 ASSERT_TRUE (2 <= cf (209, 104));
27491 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27492 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27493 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27494 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27495 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27496 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27497 ASSERT_FALSE (cf (239, 240) >= 1);
27498 ASSERT_TRUE (cf (240, 240) >= 1);
27499 ASSERT_TRUE (cf (241, 240) >= 1);
27500 ASSERT_TRUE (2 >= cf (207, 104));
27501 ASSERT_TRUE (2 >= cf (208, 104));
27502 ASSERT_FALSE (2 >= cf (209, 104));
27504 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27505 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27506 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27507 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27508 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27509 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27510 ASSERT_FALSE (cf (239, 240) > 1);
27511 ASSERT_FALSE (cf (240, 240) > 1);
27512 ASSERT_TRUE (cf (241, 240) > 1);
27513 ASSERT_TRUE (2 > cf (207, 104));
27514 ASSERT_FALSE (2 > cf (208, 104));
27515 ASSERT_FALSE (2 > cf (209, 104));
27517 ASSERT_EQ (cf (1, 2).ceil (), 1);
27518 ASSERT_EQ (cf (11, 7).ceil (), 2);
27519 ASSERT_EQ (cf (20, 1).ceil (), 20);
27520 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27521 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27522 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27523 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27524 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27526 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27529 /* Run all target-specific selftests. */
27531 static void
27532 aarch64_run_selftests (void)
27534 aarch64_test_loading_full_dump ();
27535 aarch64_test_fractional_cost ();
27538 } // namespace selftest
27540 #endif /* #if CHECKING_P */
27542 #undef TARGET_STACK_PROTECT_GUARD
27543 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27545 #undef TARGET_ADDRESS_COST
27546 #define TARGET_ADDRESS_COST aarch64_address_cost
27548 /* This hook determines whether unnamed bitfields affect the alignment
27549 of the containing structure. The hook returns true if the structure
27550 should inherit the alignment requirements of an unnamed bitfield's
27551 type. */
27552 #undef TARGET_ALIGN_ANON_BITFIELD
27553 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27555 #undef TARGET_ASM_ALIGNED_DI_OP
27556 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27558 #undef TARGET_ASM_ALIGNED_HI_OP
27559 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27561 #undef TARGET_ASM_ALIGNED_SI_OP
27562 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27564 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27565 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27566 hook_bool_const_tree_hwi_hwi_const_tree_true
27568 #undef TARGET_ASM_FILE_START
27569 #define TARGET_ASM_FILE_START aarch64_start_file
27571 #undef TARGET_ASM_OUTPUT_MI_THUNK
27572 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27574 #undef TARGET_ASM_SELECT_RTX_SECTION
27575 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27577 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27578 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27580 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27581 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27583 #undef TARGET_BUILD_BUILTIN_VA_LIST
27584 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27586 #undef TARGET_CALLEE_COPIES
27587 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27589 #undef TARGET_CAN_ELIMINATE
27590 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27592 #undef TARGET_CAN_INLINE_P
27593 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27595 #undef TARGET_CANNOT_FORCE_CONST_MEM
27596 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27598 #undef TARGET_CASE_VALUES_THRESHOLD
27599 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27601 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27602 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27604 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27605 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27607 /* Only the least significant bit is used for initialization guard
27608 variables. */
27609 #undef TARGET_CXX_GUARD_MASK_BIT
27610 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27612 #undef TARGET_C_MODE_FOR_SUFFIX
27613 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27615 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27616 #undef TARGET_DEFAULT_TARGET_FLAGS
27617 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27618 #endif
27620 #undef TARGET_CLASS_MAX_NREGS
27621 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27623 #undef TARGET_BUILTIN_DECL
27624 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27626 #undef TARGET_BUILTIN_RECIPROCAL
27627 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27629 #undef TARGET_C_EXCESS_PRECISION
27630 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27632 #undef TARGET_EXPAND_BUILTIN
27633 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27635 #undef TARGET_EXPAND_BUILTIN_VA_START
27636 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27638 #undef TARGET_FOLD_BUILTIN
27639 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27641 #undef TARGET_FUNCTION_ARG
27642 #define TARGET_FUNCTION_ARG aarch64_function_arg
27644 #undef TARGET_FUNCTION_ARG_ADVANCE
27645 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27647 #undef TARGET_FUNCTION_ARG_BOUNDARY
27648 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27650 #undef TARGET_FUNCTION_ARG_PADDING
27651 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27653 #undef TARGET_GET_RAW_RESULT_MODE
27654 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27655 #undef TARGET_GET_RAW_ARG_MODE
27656 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27658 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27659 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27661 #undef TARGET_FUNCTION_VALUE
27662 #define TARGET_FUNCTION_VALUE aarch64_function_value
27664 #undef TARGET_FUNCTION_VALUE_REGNO_P
27665 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27667 #undef TARGET_GIMPLE_FOLD_BUILTIN
27668 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27670 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27671 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27673 #undef TARGET_INIT_BUILTINS
27674 #define TARGET_INIT_BUILTINS aarch64_init_builtins
27676 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27677 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27678 aarch64_ira_change_pseudo_allocno_class
27680 #undef TARGET_LEGITIMATE_ADDRESS_P
27681 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27683 #undef TARGET_LEGITIMATE_CONSTANT_P
27684 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27686 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27687 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27688 aarch64_legitimize_address_displacement
27690 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27691 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27693 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27694 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27695 aarch64_libgcc_floating_mode_supported_p
27697 #undef TARGET_MANGLE_TYPE
27698 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27700 #undef TARGET_INVALID_CONVERSION
27701 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
27703 #undef TARGET_INVALID_UNARY_OP
27704 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
27706 #undef TARGET_INVALID_BINARY_OP
27707 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27709 #undef TARGET_VERIFY_TYPE_CONTEXT
27710 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27712 #undef TARGET_MEMORY_MOVE_COST
27713 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27715 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27716 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27718 #undef TARGET_MUST_PASS_IN_STACK
27719 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27721 /* This target hook should return true if accesses to volatile bitfields
27722 should use the narrowest mode possible. It should return false if these
27723 accesses should use the bitfield container type. */
27724 #undef TARGET_NARROW_VOLATILE_BITFIELD
27725 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27727 #undef TARGET_OPTION_OVERRIDE
27728 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27730 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27731 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27732 aarch64_override_options_after_change
27734 #undef TARGET_OFFLOAD_OPTIONS
27735 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27737 #undef TARGET_OPTION_RESTORE
27738 #define TARGET_OPTION_RESTORE aarch64_option_restore
27740 #undef TARGET_OPTION_PRINT
27741 #define TARGET_OPTION_PRINT aarch64_option_print
27743 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27744 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27746 #undef TARGET_SET_CURRENT_FUNCTION
27747 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27749 #undef TARGET_PASS_BY_REFERENCE
27750 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27752 #undef TARGET_PREFERRED_RELOAD_CLASS
27753 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27755 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27756 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27758 #undef TARGET_PROMOTED_TYPE
27759 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27761 #undef TARGET_SECONDARY_RELOAD
27762 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27764 #undef TARGET_SECONDARY_MEMORY_NEEDED
27765 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
27767 #undef TARGET_SHIFT_TRUNCATION_MASK
27768 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27770 #undef TARGET_SETUP_INCOMING_VARARGS
27771 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27773 #undef TARGET_STRUCT_VALUE_RTX
27774 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27776 #undef TARGET_REGISTER_MOVE_COST
27777 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27779 #undef TARGET_RETURN_IN_MEMORY
27780 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27782 #undef TARGET_RETURN_IN_MSB
27783 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27785 #undef TARGET_RTX_COSTS
27786 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27788 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27789 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27791 #undef TARGET_SCHED_ISSUE_RATE
27792 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27794 #undef TARGET_SCHED_VARIABLE_ISSUE
27795 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27797 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27798 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27799 aarch64_sched_first_cycle_multipass_dfa_lookahead
27801 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27802 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27803 aarch64_first_cycle_multipass_dfa_lookahead_guard
27805 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27806 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27807 aarch64_get_separate_components
27809 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27810 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27811 aarch64_components_for_bb
27813 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27814 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27815 aarch64_disqualify_components
27817 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27818 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27819 aarch64_emit_prologue_components
27821 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27822 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27823 aarch64_emit_epilogue_components
27825 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27826 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27827 aarch64_set_handled_components
27829 #undef TARGET_TRAMPOLINE_INIT
27830 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27832 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27833 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27835 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27836 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27838 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
27839 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
27841 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
27842 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
27843 aarch64_builtin_support_vector_misalignment
27845 #undef TARGET_ARRAY_MODE
27846 #define TARGET_ARRAY_MODE aarch64_array_mode
27848 #undef TARGET_ARRAY_MODE_SUPPORTED_P
27849 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
27851 #undef TARGET_VECTORIZE_CREATE_COSTS
27852 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
27854 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
27855 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
27856 aarch64_builtin_vectorization_cost
27858 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
27859 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
27861 #undef TARGET_VECTORIZE_BUILTINS
27862 #define TARGET_VECTORIZE_BUILTINS
27864 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
27865 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
27866 aarch64_autovectorize_vector_modes
27868 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
27869 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
27870 aarch64_atomic_assign_expand_fenv
27872 /* Section anchor support. */
27874 #undef TARGET_MIN_ANCHOR_OFFSET
27875 #define TARGET_MIN_ANCHOR_OFFSET -256
27877 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
27878 byte offset; we can do much more for larger data types, but have no way
27879 to determine the size of the access. We assume accesses are aligned. */
27880 #undef TARGET_MAX_ANCHOR_OFFSET
27881 #define TARGET_MAX_ANCHOR_OFFSET 4095
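/* (I.e. the 12-bit unsigned immediate used by byte loads and stores such as
   LDRB/STRB, giving offsets 0..4095.)  */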
27883 #undef TARGET_VECTOR_ALIGNMENT
27884 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
27886 #undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
27887 #define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
27888 aarch64_vectorize_can_special_div_by_constant
27890 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
27891 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
27892 aarch64_vectorize_preferred_vector_alignment
27893 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
27894 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
27895 aarch64_simd_vector_alignment_reachable
27897 /* vec_perm support. */
27899 #undef TARGET_VECTORIZE_VEC_PERM_CONST
27900 #define TARGET_VECTORIZE_VEC_PERM_CONST \
27901 aarch64_vectorize_vec_perm_const
27903 #undef TARGET_VECTORIZE_RELATED_MODE
27904 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
27905 #undef TARGET_VECTORIZE_GET_MASK_MODE
27906 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
27907 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
27908 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
27909 aarch64_empty_mask_is_expensive
27910 #undef TARGET_PREFERRED_ELSE_VALUE
27911 #define TARGET_PREFERRED_ELSE_VALUE \
27912 aarch64_preferred_else_value
27914 #undef TARGET_INIT_LIBFUNCS
27915 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
27917 #undef TARGET_FIXED_CONDITION_CODE_REGS
27918 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
27920 #undef TARGET_FLAGS_REGNUM
27921 #define TARGET_FLAGS_REGNUM CC_REGNUM
27923 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
27924 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
27926 #undef TARGET_ASAN_SHADOW_OFFSET
27927 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
27929 #undef TARGET_LEGITIMIZE_ADDRESS
27930 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
27932 #undef TARGET_SCHED_CAN_SPECULATE_INSN
27933 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
27935 #undef TARGET_CAN_USE_DOLOOP_P
27936 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
27938 #undef TARGET_SCHED_ADJUST_PRIORITY
27939 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
27941 #undef TARGET_SCHED_MACRO_FUSION_P
27942 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
27944 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
27945 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
27947 #undef TARGET_SCHED_FUSION_PRIORITY
27948 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
27950 #undef TARGET_UNSPEC_MAY_TRAP_P
27951 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
27953 #undef TARGET_USE_PSEUDO_PIC_REG
27954 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
27956 #undef TARGET_PRINT_OPERAND
27957 #define TARGET_PRINT_OPERAND aarch64_print_operand
27959 #undef TARGET_PRINT_OPERAND_ADDRESS
27960 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
27962 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
27963 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
27965 #undef TARGET_OPTAB_SUPPORTED_P
27966 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
27968 #undef TARGET_OMIT_STRUCT_RETURN_REG
27969 #define TARGET_OMIT_STRUCT_RETURN_REG true
27971 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
27972 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
27973 aarch64_dwarf_poly_indeterminate_value
27975 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
27976 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
27977 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
27979 #undef TARGET_HARD_REGNO_NREGS
27980 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
27981 #undef TARGET_HARD_REGNO_MODE_OK
27982 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
27984 #undef TARGET_MODES_TIEABLE_P
27985 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
27987 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
27988 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
27989 aarch64_hard_regno_call_part_clobbered
27991 #undef TARGET_INSN_CALLEE_ABI
27992 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
27994 #undef TARGET_CONSTANT_ALIGNMENT
27995 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
27997 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
27998 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
27999 aarch64_stack_clash_protection_alloca_probe_range
28001 #undef TARGET_COMPUTE_PRESSURE_CLASSES
28002 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
28004 #undef TARGET_CAN_CHANGE_MODE_CLASS
28005 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
28007 #undef TARGET_SELECT_EARLY_REMAT_MODES
28008 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
28010 #undef TARGET_SPECULATION_SAFE_VALUE
28011 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
28013 #undef TARGET_ESTIMATED_POLY_VALUE
28014 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
28016 #undef TARGET_ATTRIBUTE_TABLE
28017 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
28019 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
28020 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
28021 aarch64_simd_clone_compute_vecsize_and_simdlen
28023 #undef TARGET_SIMD_CLONE_ADJUST
28024 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
28026 #undef TARGET_SIMD_CLONE_USABLE
28027 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
28029 #undef TARGET_COMP_TYPE_ATTRIBUTES
28030 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
28032 #undef TARGET_GET_MULTILIB_ABI_NAME
28033 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
28035 #undef TARGET_FNTYPE_ABI
28036 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
28038 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
28039 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
28041 #if CHECKING_P
28042 #undef TARGET_RUN_TARGET_SELFTESTS
28043 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
28044 #endif /* #if CHECKING_P */
28046 #undef TARGET_ASM_POST_CFI_STARTPROC
28047 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
28049 #undef TARGET_STRICT_ARGUMENT_NAMING
28050 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
28052 #undef TARGET_MD_ASM_ADJUST
28053 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
28055 #undef TARGET_ASM_FILE_END
28056 #define TARGET_ASM_FILE_END aarch64_asm_file_end
28058 #undef TARGET_ASM_FUNCTION_EPILOGUE
28059 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
28061 #undef TARGET_HAVE_SHADOW_CALL_STACK
28062 #define TARGET_HAVE_SHADOW_CALL_STACK true
28064 struct gcc_target targetm = TARGET_INITIALIZER;
28066 #include "gt-aarch64.h"