AArch64: Reassociate CONST in address expressions
gcc/config/aarch64/aarch64.cc
blob e6bd3fd0bb42c70603d5335402b89c9deeaf48d8
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2024 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #define INCLUDE_VECTOR
26 #include "config.h"
27 #include "system.h"
28 #include "coretypes.h"
29 #include "backend.h"
30 #include "target.h"
31 #include "rtl.h"
32 #include "tree.h"
33 #include "memmodel.h"
34 #include "gimple.h"
35 #include "cfghooks.h"
36 #include "cfgloop.h"
37 #include "df.h"
38 #include "tm_p.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "optabs.h"
42 #include "regs.h"
43 #include "emit-rtl.h"
44 #include "recog.h"
45 #include "cgraph.h"
46 #include "diagnostic.h"
47 #include "insn-attr.h"
48 #include "alias.h"
49 #include "fold-const.h"
50 #include "stor-layout.h"
51 #include "calls.h"
52 #include "varasm.h"
53 #include "output.h"
54 #include "flags.h"
55 #include "explow.h"
56 #include "expr.h"
57 #include "reload.h"
58 #include "langhooks.h"
59 #include "opts.h"
60 #include "gimplify.h"
61 #include "dwarf2.h"
62 #include "gimple-iterator.h"
63 #include "tree-vectorizer.h"
64 #include "aarch64-cost-tables.h"
65 #include "dumpfile.h"
66 #include "builtins.h"
67 #include "rtl-iter.h"
68 #include "tm-constrs.h"
69 #include "sched-int.h"
70 #include "target-globals.h"
71 #include "common/common-target.h"
72 #include "cfgrtl.h"
73 #include "selftest.h"
74 #include "selftest-rtl.h"
75 #include "rtx-vector-builder.h"
76 #include "intl.h"
77 #include "expmed.h"
78 #include "function-abi.h"
79 #include "gimple-pretty-print.h"
80 #include "tree-ssa-loop-niter.h"
81 #include "fractional-cost.h"
82 #include "rtlanal.h"
83 #include "tree-dfa.h"
84 #include "asan.h"
85 #include "aarch64-feature-deps.h"
86 #include "config/arm/aarch-common.h"
87 #include "config/arm/aarch-common-protos.h"
88 #include "common/config/aarch64/cpuinfo.h"
89 #include "ssa.h"
90 #include "except.h"
91 #include "tree-pass.h"
92 #include "cfgbuild.h"
93 #include "symbol-summary.h"
94 #include "ipa-prop.h"
95 #include "ipa-fnsummary.h"
96 #include "hash-map.h"
98 /* This file should be included last. */
99 #include "target-def.h"
101 /* Defined for convenience. */
102 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
104 /* Flags that describe how a function shares certain architectural state
105 with its callers.
107 - AARCH64_STATE_SHARED indicates that the function does share the state
108 with callers.
110 - AARCH64_STATE_IN indicates that the function reads (or might read) the
111 incoming state. The converse is that the function ignores the incoming
112 state.
114 - AARCH64_STATE_OUT indicates that the function returns new state.
115 The converse is that the state on return is the same as it was on entry.
117 A function that partially modifies the state treats it as both IN
118 and OUT (because the value on return depends to some extent on the
119 value on input). */
120 constexpr auto AARCH64_STATE_SHARED = 1U << 0;
121 constexpr auto AARCH64_STATE_IN = 1U << 1;
122 constexpr auto AARCH64_STATE_OUT = 1U << 2;
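/* As an illustrative sketch (see aarch64_attribute_shared_state_flags
   below), the arm:: shared-state attributes map onto combinations of these
   flags as follows:

     arm::in("za")        -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
     arm::out("za")       -> AARCH64_STATE_SHARED | AARCH64_STATE_OUT
     arm::inout("za")     -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
                             | AARCH64_STATE_OUT
     arm::preserves("za") -> AARCH64_STATE_SHARED  */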
124 /* Information about a legitimate vector immediate operand. */
125 struct simd_immediate_info
127 enum insn_type { MOV, MVN, INDEX, PTRUE };
128 enum modifier_type { LSL, MSL };
130 simd_immediate_info () {}
131 simd_immediate_info (scalar_float_mode, rtx);
132 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
133 insn_type = MOV, modifier_type = LSL,
134 unsigned int = 0);
135 simd_immediate_info (scalar_mode, rtx, rtx);
136 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
138 /* The mode of the elements. */
139 scalar_mode elt_mode;
141 /* The instruction to use to move the immediate into a vector. */
142 insn_type insn;
144 union
146 /* For MOV and MVN. */
147 struct
149 /* The value of each element. */
150 rtx value;
152 /* The kind of shift modifier to use, and the number of bits to shift.
153 This is (LSL, 0) if no shift is needed. */
154 modifier_type modifier;
155 unsigned int shift;
156 } mov;
158 /* For INDEX. */
159 struct
161 /* The value of the first element and the step to be added for each
162 subsequent element. */
163 rtx base, step;
164 } index;
166 /* For PTRUE. */
167 aarch64_svpattern pattern;
168 } u;
171 /* Construct a floating-point immediate in which each element has mode
172 ELT_MODE_IN and value VALUE_IN. */
173 inline simd_immediate_info
174 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
175 : elt_mode (elt_mode_in), insn (MOV)
177 u.mov.value = value_in;
178 u.mov.modifier = LSL;
179 u.mov.shift = 0;
182 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
183 and value VALUE_IN. The other parameters are as for the structure
184 fields. */
185 inline simd_immediate_info
186 ::simd_immediate_info (scalar_int_mode elt_mode_in,
187 unsigned HOST_WIDE_INT value_in,
188 insn_type insn_in, modifier_type modifier_in,
189 unsigned int shift_in)
190 : elt_mode (elt_mode_in), insn (insn_in)
192 u.mov.value = gen_int_mode (value_in, elt_mode_in);
193 u.mov.modifier = modifier_in;
194 u.mov.shift = shift_in;
197 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
198 and where element I is equal to BASE_IN + I * STEP_IN. */
199 inline simd_immediate_info
200 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
201 : elt_mode (elt_mode_in), insn (INDEX)
203 u.index.base = base_in;
204 u.index.step = step_in;
207 /* Construct a predicate that controls elements of mode ELT_MODE_IN
208 and has PTRUE pattern PATTERN_IN. */
209 inline simd_immediate_info
210 ::simd_immediate_info (scalar_int_mode elt_mode_in,
211 aarch64_svpattern pattern_in)
212 : elt_mode (elt_mode_in), insn (PTRUE)
214 u.pattern = pattern_in;
217 namespace {
219 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
220 class pure_scalable_type_info
222 public:
223 /* Represents the result of analyzing a type. All values are nonzero,
224 in the possibly forlorn hope that accidental conversions to bool
225 trigger a warning. */
226 enum analysis_result
228 /* The type does not have an ABI identity; i.e. it doesn't contain
229 at least one object whose type is a Fundamental Data Type. */
230 NO_ABI_IDENTITY = 1,
232 /* The type is definitely a Pure Scalable Type. */
233 IS_PST,
235 /* The type is definitely not a Pure Scalable Type. */
236 ISNT_PST,
238 /* It doesn't matter for PCS purposes whether the type is a Pure
239 Scalable Type or not, since the type will be handled the same
240 way regardless.
242 Specifically, this means that if the type is a Pure Scalable Type,
243 there aren't enough argument registers to hold it, and so it will
244 need to be passed or returned in memory. If the type isn't a
245 Pure Scalable Type, it's too big to be passed or returned in core
246 or SIMD&FP registers, and so again will need to go in memory. */
247 DOESNT_MATTER
250 /* Aggregates of 17 bytes or more are normally passed and returned
251 in memory, so aggregates of that size can safely be analyzed as
252 DOESNT_MATTER. We need to be able to collect enough pieces to
253 represent a PST that is smaller than that. Since predicates are
254 2 bytes in size for -msve-vector-bits=128, that means we need to be
255 able to store at least 8 pieces.
257 We also need to be able to store enough pieces to represent
258 a single vector in each vector argument register and a single
259 predicate in each predicate argument register. This means that
260 we need at least 12 pieces. */
261 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
262 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
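/* For example, with the 8 SIMD&FP argument registers and 4 predicate
   argument registers assumed by the AAPCS64, MAX_PIECES works out to
   8 + 4 = 12, which satisfies both of the requirements above.  */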
264 /* Describes one piece of a PST. Each piece is one of:
266 - a single Scalable Vector Type (SVT)
267 - a single Scalable Predicate Type (SPT)
268 - a PST containing 2, 3 or 4 SVTs, with no padding
270 It either represents a single built-in type or a PST formed from
271 multiple homogeneous built-in types. */
272 struct piece
274 rtx get_rtx (unsigned int, unsigned int) const;
276 /* The number of vector and predicate registers that the piece
277 occupies. One of the two is always zero. */
278 unsigned int num_zr;
279 unsigned int num_pr;
281 /* The mode of the registers described above. */
282 machine_mode mode;
284 /* If this piece is formed from multiple homogeneous built-in types,
285 this is the mode of the built-in types, otherwise it is MODE. */
286 machine_mode orig_mode;
288 /* The offset in bytes of the piece from the start of the type. */
289 poly_uint64 offset;
292 /* Divides types analyzed as IS_PST into individual pieces. The pieces
293 are in memory order. */
294 auto_vec<piece, MAX_PIECES> pieces;
296 unsigned int num_zr () const;
297 unsigned int num_pr () const;
299 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
301 analysis_result analyze (const_tree);
302 bool analyze_registers (const_tree);
304 private:
305 analysis_result analyze_array (const_tree);
306 analysis_result analyze_record (const_tree);
307 void add_piece (const piece &);
311 /* The current code model. */
312 enum aarch64_code_model aarch64_cmodel;
314 enum aarch64_tp_reg aarch64_tpidr_register;
316 /* The number of 64-bit elements in an SVE vector. */
317 poly_uint16 aarch64_sve_vg;
319 #ifdef HAVE_AS_TLS
320 #undef TARGET_HAVE_TLS
321 #define TARGET_HAVE_TLS 1
322 #endif
324 static bool aarch64_composite_type_p (const_tree, machine_mode);
325 static bool aarch64_return_in_memory_1 (const_tree);
326 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
327 const_tree,
328 machine_mode *, int *,
329 bool *, bool);
330 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
331 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
332 static void aarch64_override_options_after_change (void);
333 static bool aarch64_vector_mode_supported_p (machine_mode);
334 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
335 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
336 const_tree type,
337 int misalignment,
338 bool is_packed);
339 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
340 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
341 aarch64_addr_query_type);
343 /* The processor for which instructions should be scheduled. */
344 enum aarch64_processor aarch64_tune = cortexa53;
346 /* Mask to specify which instruction scheduling options should be used. */
347 uint64_t aarch64_tune_flags = 0;
349 /* Global flag for PC relative loads. */
350 bool aarch64_pcrelative_literal_loads;
352 /* Global flag for whether frame pointer is enabled. */
353 bool aarch64_use_frame_pointer;
355 /* Support for command line parsing of boolean flags in the tuning
356 structures. */
357 struct aarch64_flag_desc
359 const char* name;
360 unsigned int flag;
363 #define AARCH64_FUSION_PAIR(name, internal_name) \
364 { name, AARCH64_FUSE_##internal_name },
365 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
367 { "none", AARCH64_FUSE_NOTHING },
368 #include "aarch64-fusion-pairs.def"
369 { "all", AARCH64_FUSE_ALL },
370 { NULL, AARCH64_FUSE_NOTHING }
373 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
374 { name, AARCH64_EXTRA_TUNE_##internal_name },
375 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
377 { "none", AARCH64_EXTRA_TUNE_NONE },
378 #include "aarch64-tuning-flags.def"
379 { "all", AARCH64_EXTRA_TUNE_ALL },
380 { NULL, AARCH64_EXTRA_TUNE_NONE }
383 /* Tuning parameters. */
384 #include "tuning_models/generic.h"
385 #include "tuning_models/generic_armv8_a.h"
386 #include "tuning_models/generic_armv9_a.h"
387 #include "tuning_models/cortexa35.h"
388 #include "tuning_models/cortexa53.h"
389 #include "tuning_models/cortexa57.h"
390 #include "tuning_models/cortexa72.h"
391 #include "tuning_models/cortexa73.h"
392 #include "tuning_models/exynosm1.h"
393 #include "tuning_models/thunderxt88.h"
394 #include "tuning_models/thunderx.h"
395 #include "tuning_models/tsv110.h"
396 #include "tuning_models/xgene1.h"
397 #include "tuning_models/emag.h"
398 #include "tuning_models/qdf24xx.h"
399 #include "tuning_models/saphira.h"
400 #include "tuning_models/thunderx2t99.h"
401 #include "tuning_models/thunderx3t110.h"
402 #include "tuning_models/neoversen1.h"
403 #include "tuning_models/ampere1.h"
404 #include "tuning_models/ampere1a.h"
405 #include "tuning_models/ampere1b.h"
406 #include "tuning_models/neoversev1.h"
407 #include "tuning_models/neoverse512tvb.h"
408 #include "tuning_models/neoversen2.h"
409 #include "tuning_models/neoversev2.h"
410 #include "tuning_models/a64fx.h"
412 /* Support for fine-grained override of the tuning structures. */
413 struct aarch64_tuning_override_function
415 const char* name;
416 void (*parse_override)(const char*, struct tune_params*);
419 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
420 static void aarch64_parse_tune_string (const char*, struct tune_params*);
421 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
423 static const struct aarch64_tuning_override_function
424 aarch64_tuning_override_functions[] =
426 { "fuse", aarch64_parse_fuse_string },
427 { "tune", aarch64_parse_tune_string },
428 { "sve_width", aarch64_parse_sve_width_string },
429 { NULL, NULL }
432 /* A processor implementing AArch64. */
433 struct processor
435 const char *name;
436 aarch64_processor ident;
437 aarch64_processor sched_core;
438 aarch64_arch arch;
439 aarch64_feature_flags flags;
440 const tune_params *tune;
443 /* Architectures implementing AArch64. */
444 static CONSTEXPR const processor all_architectures[] =
446 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
447 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
448 feature_deps::ARCH_IDENT ().enable, NULL},
449 #include "aarch64-arches.def"
450 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
453 /* Processor cores implementing AArch64. */
454 static const struct processor all_cores[] =
456 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
457 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
458 feature_deps::cpu_##IDENT, &COSTS##_tunings},
459 #include "aarch64-cores.def"
460 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
462 /* Internal representation of system registers. */
463 typedef struct {
464 const char *name;
465 /* Stringified sysreg encoding values, represented as
466 s<sn>_<op1>_c<cn>_c<cm>_<op2>. */
467 const char *encoding;
468 /* Flags affecting sysreg usage, such as read/write-only. */
469 unsigned properties;
470 /* Architectural features implied by sysreg. */
471 aarch64_feature_flags arch_reqs;
472 } sysreg_t;
474 /* An aarch64_feature_set initializer for a single feature,
475 AARCH64_FEATURE_<FEAT>. */
476 #define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
478 /* Used by AARCH64_FEATURES. */
479 #define AARCH64_OR_FEATURES_1(X, F1) \
480 AARCH64_FEATURE (F1)
481 #define AARCH64_OR_FEATURES_2(X, F1, F2) \
482 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
483 #define AARCH64_OR_FEATURES_3(X, F1, ...) \
484 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
486 /* An aarch64_feature_set initializer for the N features listed in "...". */
487 #define AARCH64_FEATURES(N, ...) \
488 AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
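/* As an illustrative expansion (assuming feature flags named AARCH64_FL_SVE
   and AARCH64_FL_SVE2):

     AARCH64_FEATURES (2, SVE, SVE2)
       -> AARCH64_OR_FEATURES_2 (0, SVE, SVE2)
       -> (AARCH64_FEATURE (SVE) | AARCH64_OR_FEATURES_1 (0, SVE2))
       -> (AARCH64_FL_SVE | AARCH64_FL_SVE2)  */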
490 #define AARCH64_NO_FEATURES 0
492 /* Flags associated with the properties of system registers. These mainly
493 serve to mark particular registers as read-only or write-only. */
494 #define F_DEPRECATED (1 << 1)
495 #define F_REG_READ (1 << 2)
496 #define F_REG_WRITE (1 << 3)
497 #define F_ARCHEXT (1 << 4)
498 /* Flag indicating that the register name is an alias for another system register. */
499 #define F_REG_ALIAS (1 << 5)
500 /* Flag indicating registers which may be implemented with 128 bits. */
501 #define F_REG_128 (1 << 6)
503 /* Database of system registers, their encodings and architectural
504 requirements. */
505 const sysreg_t aarch64_sysregs[] =
507 #define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
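/* For example, CPENC (3, 0, 4, 2, 2) stringizes to "s3_0_c4_c2_2",
   following the s<sn>_<op1>_c<cn>_c<cm>_<op2> format described for the
   sysreg_t encoding field above.  */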
508 #define SYSREG(NAME, ENC, FLAGS, ARCH) \
509 { NAME, ENC, FLAGS, ARCH },
510 #include "aarch64-sys-regs.def"
511 #undef CPENC
514 #undef AARCH64_NO_FEATURES
516 using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
517 static sysreg_map_t *sysreg_map = nullptr;
519 /* Map system register names to their hardware metadata: encoding,
520 feature flags and architectural feature requirements, all of which
521 are encoded in a sysreg_t struct. */
522 void
523 aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
525 bool dup = sysreg_map->put (name, metadata);
526 gcc_checking_assert (!dup);
529 /* Lazily initialize the hash table for system register validation,
530 checking the validity of a supplied register name and returning
531 the register's associated metadata. */
532 static void
533 aarch64_init_sysregs (void)
535 gcc_assert (!sysreg_map);
536 sysreg_map = new sysreg_map_t;
539 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
541 const sysreg_t *reg = aarch64_sysregs + i;
542 aarch64_register_sysreg (reg->name, reg);
546 /* No direct access to the sysreg hash-map should be made. Doing so
547 risks accessing an uninitialized hash-map, and dereferencing the
548 returned double pointer without due care risks dereferencing a
549 null pointer. */
550 const sysreg_t *
551 aarch64_lookup_sysreg_map (const char *regname)
553 if (!sysreg_map)
554 aarch64_init_sysregs ();
556 const sysreg_t **sysreg_entry = sysreg_map->get (regname);
557 if (sysreg_entry != NULL)
558 return *sysreg_entry;
559 return NULL;
562 /* The current tuning set. */
563 struct tune_params aarch64_tune_params = generic_tunings;
565 /* If NAME is the name of an arm:: attribute that describes shared state,
566 return its associated AARCH64_STATE_* flags, otherwise return 0. */
567 static unsigned int
568 aarch64_attribute_shared_state_flags (const char *name)
570 if (strcmp (name, "in") == 0)
571 return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
572 if (strcmp (name, "inout") == 0)
573 return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
574 if (strcmp (name, "out") == 0)
575 return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
576 if (strcmp (name, "preserves") == 0)
577 return AARCH64_STATE_SHARED;
578 return 0;
581 /* See whether attribute list ATTRS has any sharing information
582 for state STATE_NAME. Return the associated state flags if so,
583 otherwise return 0. */
584 static unsigned int
585 aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
587 for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
589 if (!cxx11_attribute_p (attr))
590 continue;
592 auto ns = IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr)));
593 if (strcmp (ns, "arm") != 0)
594 continue;
596 auto attr_name = IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr)));
597 auto flags = aarch64_attribute_shared_state_flags (attr_name);
598 if (!flags)
599 continue;
601 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
603 tree value = TREE_VALUE (arg);
604 if (TREE_CODE (value) == STRING_CST
605 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
606 return flags;
609 return 0;
612 /* Return true if DECL creates a new scope for state STATE_STRING. */
613 static bool
614 aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
616 if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
617 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
619 tree value = TREE_VALUE (arg);
620 if (TREE_CODE (value) == STRING_CST
621 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
622 return true;
624 return false;
627 /* Return true if attribute argument VALUE is a recognized state string,
628 otherwise report an error. NAME is the name of the attribute to which
629 VALUE is being passed. */
630 static bool
631 aarch64_check_state_string (tree name, tree value)
633 if (TREE_CODE (value) != STRING_CST)
635 error ("the arguments to %qE must be constant strings", name);
636 return false;
639 const char *state_name = TREE_STRING_POINTER (value);
640 if (strcmp (state_name, "za") != 0
641 && strcmp (state_name, "zt0") != 0)
643 error ("unrecognized state string %qs", state_name);
644 return false;
647 return true;
650 /* qsort callback to compare two STRING_CSTs. */
651 static int
652 cmp_string_csts (const void *a, const void *b)
654 return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
655 TREE_STRING_POINTER (*(const_tree const *) b));
658 /* Canonicalize a list of state strings. ARGS contains the arguments to
659 a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
660 of the same type. If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
661 arguments and drop the new attribute. Otherwise, the new attribute must
662 be kept and ARGS must include the information in OLD_ATTR.
664 In both cases, the new arguments must be a sorted list of state strings
665 with duplicates removed.
667 Return true if new attribute should be kept, false if it should be
668 dropped. */
669 static bool
670 aarch64_merge_string_arguments (tree args, tree old_attr,
671 bool can_merge_in_place)
673 /* Get a sorted list of all state strings (including duplicates). */
674 auto add_args = [](vec<tree> &strings, const_tree args)
676 for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
677 if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
678 strings.safe_push (TREE_VALUE (arg));
680 auto_vec<tree, 16> strings;
681 add_args (strings, args);
682 if (old_attr)
683 add_args (strings, TREE_VALUE (old_attr));
684 strings.qsort (cmp_string_csts);
686 /* The list can be empty if there was no previous attribute and if all
687 the new arguments are erroneous. Drop the attribute in that case. */
688 if (strings.is_empty ())
689 return false;
691 /* Destructively modify one of the argument lists, removing duplicates
692 on the fly. */
693 bool use_old_attr = old_attr && can_merge_in_place;
694 tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
695 tree prev = NULL_TREE;
696 for (tree arg : strings)
698 if (prev && simple_cst_equal (arg, prev))
699 continue;
700 prev = arg;
701 if (!*end)
702 *end = tree_cons (NULL_TREE, arg, NULL_TREE);
703 else
704 TREE_VALUE (*end) = arg;
705 end = &TREE_CHAIN (*end);
707 *end = NULL_TREE;
708 return !use_old_attr;
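/* A hypothetical example: merging a new arm::new("zt0", "za") attribute
   with an existing arm::new("za") yields the sorted, deduplicated list
   ("za", "zt0"), stored in OLD_ATTR when merging in place and in ARGS
   otherwise.  */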
711 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
713 static tree
714 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
715 int, bool *no_add_attrs)
717 /* Since we set fn_type_req to true, the caller should have checked
718 this for us. */
719 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
720 switch ((arm_pcs) fntype_abi (*node).id ())
722 case ARM_PCS_AAPCS64:
723 case ARM_PCS_SIMD:
724 return NULL_TREE;
726 case ARM_PCS_SVE:
727 error ("the %qE attribute cannot be applied to an SVE function type",
728 name);
729 *no_add_attrs = true;
730 return NULL_TREE;
732 case ARM_PCS_TLSDESC:
733 case ARM_PCS_UNKNOWN:
734 break;
736 gcc_unreachable ();
739 /* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
740 otherwise report an error. */
741 static bool
742 aarch64_check_arm_new_against_type (tree args, tree decl)
744 tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
745 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
747 tree value = TREE_VALUE (arg);
748 if (TREE_CODE (value) == STRING_CST)
750 const char *state_name = TREE_STRING_POINTER (value);
751 if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
753 error_at (DECL_SOURCE_LOCATION (decl),
754 "cannot create a new %qs scope since %qs is shared"
755 " with callers", state_name, state_name);
756 return false;
760 return true;
763 /* Callback for arm::new attributes. */
764 static tree
765 handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
767 tree decl = *node;
768 if (TREE_CODE (decl) != FUNCTION_DECL)
770 error ("%qE attribute applies only to function definitions", name);
771 *no_add_attrs = true;
772 return NULL_TREE;
774 if (TREE_TYPE (decl) == error_mark_node)
776 *no_add_attrs = true;
777 return NULL_TREE;
780 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
781 aarch64_check_state_string (name, TREE_VALUE (arg));
783 if (!aarch64_check_arm_new_against_type (args, decl))
785 *no_add_attrs = true;
786 return NULL_TREE;
789 /* If there is an old attribute, we should try to update it in-place,
790 so that there is only one (definitive) arm::new attribute on the decl. */
791 tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
792 if (!aarch64_merge_string_arguments (args, old_attr, true))
793 *no_add_attrs = true;
795 return NULL_TREE;
798 /* Callback for arm::{in,out,inout,preserves} attributes. */
799 static tree
800 handle_arm_shared (tree *node, tree name, tree args,
801 int, bool *no_add_attrs)
803 tree type = *node;
804 tree old_attrs = TYPE_ATTRIBUTES (type);
805 auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
806 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
808 tree value = TREE_VALUE (arg);
809 if (aarch64_check_state_string (name, value))
811 const char *state_name = TREE_STRING_POINTER (value);
812 auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
813 state_name);
814 if (old_flags && old_flags != flags)
816 error ("inconsistent attributes for state %qs", state_name);
817 *no_add_attrs = true;
818 return NULL_TREE;
823 /* We can't update an old attribute in-place, since types are shared.
824 Instead make sure that this new attribute contains all the
825 information, so that the old attribute becomes redundant. */
826 tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
827 old_attrs);
828 if (!aarch64_merge_string_arguments (args, old_attr, false))
829 *no_add_attrs = true;
831 return NULL_TREE;
834 /* Mutually-exclusive function type attributes for controlling PSTATE.SM. */
835 static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
837 /* Attribute name exclusion applies to:
838 function, type, variable */
839 { "streaming", false, true, false },
840 { "streaming_compatible", false, true, false },
841 { NULL, false, false, false }
844 /* Table of machine attributes. */
845 static const attribute_spec aarch64_gnu_attributes[] =
847 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
848 affects_type_identity, handler, exclude } */
849 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
850 handle_aarch64_vector_pcs_attribute, NULL },
851 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
852 aarch64_sve::handle_arm_sve_vector_bits_attribute,
853 NULL },
854 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
855 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
856 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL }
859 static const scoped_attribute_specs aarch64_gnu_attribute_table =
861 "gnu", { aarch64_gnu_attributes }
864 static const attribute_spec aarch64_arm_attributes[] =
866 { "streaming", 0, 0, false, true, true, true,
867 NULL, attr_streaming_exclusions },
868 { "streaming_compatible", 0, 0, false, true, true, true,
869 NULL, attr_streaming_exclusions },
870 { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
871 { "new", 1, -1, true, false, false, false,
872 handle_arm_new, NULL },
873 { "preserves", 1, -1, false, true, true, true,
874 handle_arm_shared, NULL },
875 { "in", 1, -1, false, true, true, true,
876 handle_arm_shared, NULL },
877 { "out", 1, -1, false, true, true, true,
878 handle_arm_shared, NULL },
879 { "inout", 1, -1, false, true, true, true,
880 handle_arm_shared, NULL }
883 static const scoped_attribute_specs aarch64_arm_attribute_table =
885 "arm", { aarch64_arm_attributes }
888 static const scoped_attribute_specs *const aarch64_attribute_table[] =
890 &aarch64_gnu_attribute_table,
891 &aarch64_arm_attribute_table
894 typedef enum aarch64_cond_code
896 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
897 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
898 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
900 aarch64_cc;
902 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
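/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE;
   the enumeration above pairs each condition with its inverse (EQ/NE,
   CS/CC, MI/PL, ...), so flipping the low bit inverts the condition.  */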
905 /* The condition codes of the processor, and the inverse function. */
906 static const char * const aarch64_condition_codes[] =
908 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
909 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
912 /* The preferred condition codes for SVE conditions. */
913 static const char *const aarch64_sve_condition_codes[] =
915 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
916 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
919 /* Return the assembly token for svpattern value VALUE. */
921 static const char *
922 svpattern_token (enum aarch64_svpattern pattern)
924 switch (pattern)
926 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
927 AARCH64_FOR_SVPATTERN (CASE)
928 #undef CASE
929 case AARCH64_NUM_SVPATTERNS:
930 break;
932 gcc_unreachable ();
935 /* Return the location of a piece that is known to be passed or returned
936 in registers. FIRST_ZR is the first unused vector argument register
937 and FIRST_PR is the first unused predicate argument register. */
939 rtx
940 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
941 unsigned int first_pr) const
943 gcc_assert (VECTOR_MODE_P (mode)
944 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
945 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
947 if (num_zr > 0 && num_pr == 0)
948 return gen_rtx_REG (mode, first_zr);
950 if (num_zr == 0 && num_pr <= 2)
951 return gen_rtx_REG (mode, first_pr);
953 gcc_unreachable ();
956 /* Return the total number of vector registers required by the PST. */
958 unsigned int
959 pure_scalable_type_info::num_zr () const
961 unsigned int res = 0;
962 for (unsigned int i = 0; i < pieces.length (); ++i)
963 res += pieces[i].num_zr;
964 return res;
967 /* Return the total number of predicate registers required by the PST. */
969 unsigned int
970 pure_scalable_type_info::num_pr () const
972 unsigned int res = 0;
973 for (unsigned int i = 0; i < pieces.length (); ++i)
974 res += pieces[i].num_pr;
975 return res;
978 /* Return the location of a PST that is known to be passed or returned
979 in registers. FIRST_ZR is the first unused vector argument register
980 and FIRST_PR is the first unused predicate argument register. */
982 rtx
983 pure_scalable_type_info::get_rtx (machine_mode mode,
984 unsigned int first_zr,
985 unsigned int first_pr) const
987 /* Try to return a single REG if possible. This leads to better
988 code generation; it isn't required for correctness. */
989 if (mode == pieces[0].mode)
991 gcc_assert (pieces.length () == 1);
992 return pieces[0].get_rtx (first_zr, first_pr);
995 /* Build up a PARALLEL that contains the individual pieces. */
996 rtvec rtxes = rtvec_alloc (pieces.length ());
997 for (unsigned int i = 0; i < pieces.length (); ++i)
999 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1000 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1001 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1002 first_zr += pieces[i].num_zr;
1003 first_pr += pieces[i].num_pr;
1005 return gen_rtx_PARALLEL (mode, rtxes);
1008 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1009 in the AAPCS64. */
1011 pure_scalable_type_info::analysis_result
1012 pure_scalable_type_info::analyze (const_tree type)
1014 /* Prevent accidental reuse. */
1015 gcc_assert (pieces.is_empty ());
1017 /* No code will be generated for erroneous types, so we won't establish
1018 an ABI mapping. */
1019 if (type == error_mark_node)
1020 return NO_ABI_IDENTITY;
1022 /* Zero-sized types disappear in the language->ABI mapping. */
1023 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1024 return NO_ABI_IDENTITY;
1026 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1027 piece p = {};
1028 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1030 machine_mode mode = TYPE_MODE_RAW (type);
1031 gcc_assert (VECTOR_MODE_P (mode)
1032 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1034 p.mode = p.orig_mode = mode;
1035 add_piece (p);
1036 return IS_PST;
1039 /* Check for user-defined PSTs. */
1040 if (TREE_CODE (type) == ARRAY_TYPE)
1041 return analyze_array (type);
1042 if (TREE_CODE (type) == RECORD_TYPE)
1043 return analyze_record (type);
1045 return ISNT_PST;
1048 /* Analyze a type that is known not to be passed or returned in memory.
1049 Return true if it has an ABI identity and is a Pure Scalable Type. */
1051 bool
1052 pure_scalable_type_info::analyze_registers (const_tree type)
1054 analysis_result result = analyze (type);
1055 gcc_assert (result != DOESNT_MATTER);
1056 return result == IS_PST;
1059 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1061 pure_scalable_type_info::analysis_result
1062 pure_scalable_type_info::analyze_array (const_tree type)
1064 /* Analyze the element type. */
1065 pure_scalable_type_info element_info;
1066 analysis_result result = element_info.analyze (TREE_TYPE (type));
1067 if (result != IS_PST)
1068 return result;
1070 /* An array of unknown, flexible or variable length will be passed and
1071 returned by reference whatever we do. */
1072 tree nelts_minus_one = array_type_nelts (type);
1073 if (!tree_fits_uhwi_p (nelts_minus_one))
1074 return DOESNT_MATTER;
1076 /* Likewise if the array is constant-sized but too big to be interesting.
1077 The double checks against MAX_PIECES are to protect against overflow. */
1078 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1079 if (count > MAX_PIECES)
1080 return DOESNT_MATTER;
1081 count += 1;
1082 if (count * element_info.pieces.length () > MAX_PIECES)
1083 return DOESNT_MATTER;
1085 /* The above checks should have weeded out elements of unknown size. */
1086 poly_uint64 element_bytes;
1087 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1088 gcc_unreachable ();
1090 /* Build up the list of individual vectors and predicates. */
1091 gcc_assert (!element_info.pieces.is_empty ());
1092 for (unsigned int i = 0; i < count; ++i)
1093 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1095 piece p = element_info.pieces[j];
1096 p.offset += i * element_bytes;
1097 add_piece (p);
1099 return IS_PST;
1102 /* Subroutine of analyze for handling RECORD_TYPEs. */
1104 pure_scalable_type_info::analysis_result
1105 pure_scalable_type_info::analyze_record (const_tree type)
1107 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1109 if (TREE_CODE (field) != FIELD_DECL)
1110 continue;
1112 /* Zero-sized fields disappear in the language->ABI mapping. */
1113 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1114 continue;
1116 /* All fields with an ABI identity must be PSTs for the record as
1117 a whole to be a PST. If any individual field is too big to be
1118 interesting then the record is too. */
1119 pure_scalable_type_info field_info;
1120 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1121 if (subresult == NO_ABI_IDENTITY)
1122 continue;
1123 if (subresult != IS_PST)
1124 return subresult;
1126 /* Since all previous fields are PSTs, we ought to be able to track
1127 the field offset using poly_ints. */
1128 tree bitpos = bit_position (field);
1129 gcc_assert (poly_int_tree_p (bitpos));
1131 /* For the same reason, it shouldn't be possible to create a PST field
1132 whose offset isn't byte-aligned. */
1133 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1134 BITS_PER_UNIT);
1136 /* Punt if the record is too big to be interesting. */
1137 poly_uint64 bytepos;
1138 if (!wide_bytepos.to_uhwi (&bytepos)
1139 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1140 return DOESNT_MATTER;
1142 /* Add the individual vectors and predicates in the field to the
1143 record's list. */
1144 gcc_assert (!field_info.pieces.is_empty ());
1145 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1147 piece p = field_info.pieces[i];
1148 p.offset += bytepos;
1149 add_piece (p);
1152 /* Empty structures disappear in the language->ABI mapping. */
1153 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1156 /* Add P to the list of pieces in the type. */
1158 void
1159 pure_scalable_type_info::add_piece (const piece &p)
1161 /* Try to fold the new piece into the previous one to form a
1162 single-mode PST. For example, if we see three consecutive vectors
1163 of the same mode, we can represent them using the corresponding
1164 3-tuple mode.
1166 This is purely an optimization. */
1167 if (!pieces.is_empty ())
1169 piece &prev = pieces.last ();
1170 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1171 unsigned int nelems1, nelems2;
1172 if (prev.orig_mode == p.orig_mode
1173 && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
1174 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1175 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1176 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1177 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1178 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1179 && targetm.array_mode (p.orig_mode,
1180 nelems1 + nelems2).exists (&prev.mode))
1182 prev.num_zr += p.num_zr;
1183 prev.num_pr += p.num_pr;
1184 return;
1187 pieces.quick_push (p);
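/* A sketch of the folding above: two consecutive pieces of mode VNx4SImode
   at adjacent offsets would be merged into a single piece whose mode is the
   corresponding 2-vector tuple mode (VNx8SImode), provided that
   targetm.array_mode supplies it, with num_zr accumulating to 2.  */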
1190 /* Return true if at least one possible value of type TYPE includes at
1191 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1193 This is a relatively expensive test for some types, so it should
1194 generally be made as late as possible. */
1196 static bool
1197 aarch64_some_values_include_pst_objects_p (const_tree type)
1199 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1200 return false;
1202 if (aarch64_sve::builtin_type_p (type))
1203 return true;
1205 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1206 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1208 if (RECORD_OR_UNION_TYPE_P (type))
1209 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1210 if (TREE_CODE (field) == FIELD_DECL
1211 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1212 return true;
1214 return false;
1217 /* Return the descriptor of the SIMD ABI. */
1219 static const predefined_function_abi &
1220 aarch64_simd_abi (void)
1222 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1223 if (!simd_abi.initialized_p ())
1225 HARD_REG_SET full_reg_clobbers
1226 = default_function_abi.full_reg_clobbers ();
1227 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1228 if (FP_SIMD_SAVED_REGNUM_P (regno))
1229 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1230 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1232 return simd_abi;
1235 /* Return the descriptor of the SVE PCS. */
1237 static const predefined_function_abi &
1238 aarch64_sve_abi (void)
1240 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1241 if (!sve_abi.initialized_p ())
1243 HARD_REG_SET full_reg_clobbers
1244 = default_function_abi.full_reg_clobbers ();
1245 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1246 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1247 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1248 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1249 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1251 return sve_abi;
1254 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
1255 wraps, otherwise return X itself. */
1257 static rtx
1258 strip_salt (rtx x)
1260 rtx search = x;
1261 if (GET_CODE (search) == CONST)
1262 search = XEXP (search, 0);
1263 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
1264 x = XVECEXP (search, 0, 0);
1265 return x;
1268 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
1269 expression. */
1271 static rtx
1272 strip_offset_and_salt (rtx addr, poly_int64 *offset)
1274 return strip_salt (strip_offset (addr, offset));
1277 /* Generate code to enable conditional branches in functions over 1 MiB. */
1278 const char *
1279 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1280 const char * branch_format)
1282 rtx_code_label * tmp_label = gen_label_rtx ();
1283 char label_buf[256];
1284 char buffer[128];
1285 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1286 CODE_LABEL_NUMBER (tmp_label));
1287 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1288 rtx dest_label = operands[pos_label];
1289 operands[pos_label] = tmp_label;
1291 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1292 output_asm_insn (buffer, operands);
1294 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1295 operands[pos_label] = dest_label;
1296 output_asm_insn (buffer, operands);
1297 return "";
1300 void
1301 aarch64_err_no_fpadvsimd (machine_mode mode)
1303 if (TARGET_GENERAL_REGS_ONLY)
1304 if (FLOAT_MODE_P (mode))
1305 error ("%qs is incompatible with the use of floating-point types",
1306 "-mgeneral-regs-only");
1307 else
1308 error ("%qs is incompatible with the use of vector types",
1309 "-mgeneral-regs-only");
1310 else
1311 if (FLOAT_MODE_P (mode))
1312 error ("%qs feature modifier is incompatible with the use of"
1313 " floating-point types", "+nofp");
1314 else
1315 error ("%qs feature modifier is incompatible with the use of"
1316 " vector types", "+nofp");
1319 /* Report when we try to do something that requires SVE when SVE is disabled.
1320 This is an error of last resort and isn't very high-quality. It usually
1321 involves attempts to measure the vector length in some way. */
1322 static void
1323 aarch64_report_sve_required (void)
1325 static bool reported_p = false;
1327 /* Avoid reporting a slew of messages for a single oversight. */
1328 if (reported_p)
1329 return;
1331 error ("this operation requires the SVE ISA extension");
1332 inform (input_location, "you can enable SVE using the command-line"
1333 " option %<-march%>, or by using the %<target%>"
1334 " attribute or pragma");
1335 reported_p = true;
1338 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1339 registers. */
1340 inline bool
1341 pr_or_ffr_regnum_p (unsigned int regno)
1343 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1346 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1347 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1348 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1349 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1350 and GENERAL_REGS is lower than the memory cost (in this case the best class
1351 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1352 cost results in bad allocations with many redundant int<->FP moves which
1353 are expensive on various cores.
1354 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1355 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1356 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1357 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1358 The result of this is that it is no longer inefficient to have a higher
1359 memory move cost than the register move cost.
1362 static reg_class_t
1363 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1364 reg_class_t best_class)
1366 machine_mode mode;
1368 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1369 || !reg_class_subset_p (FP_REGS, allocno_class))
1370 return allocno_class;
1372 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1373 || !reg_class_subset_p (FP_REGS, best_class))
1374 return best_class;
1376 mode = PSEUDO_REGNO_MODE (regno);
1377 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1380 static unsigned int
1381 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1383 if (GET_MODE_UNIT_SIZE (mode) == 4)
1384 return aarch64_tune_params.min_div_recip_mul_sf;
1385 return aarch64_tune_params.min_div_recip_mul_df;
1388 /* Return the reassociation width of treeop OPC with mode MODE. */
1389 static int
1390 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1392 if (VECTOR_MODE_P (mode))
1393 return aarch64_tune_params.vec_reassoc_width;
1394 if (INTEGRAL_MODE_P (mode))
1395 return aarch64_tune_params.int_reassoc_width;
1396 /* Reassociation reduces the number of FMAs which may result in worse
1397 performance. Use a per-CPU setting for FMA reassociation which allows
1398 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
1399 CPUs with many FP pipes to enable reassociation.
1400 Since the reassociation pass doesn't understand FMA at all, assume
1401 that any FP addition might turn into FMA. */
1402 if (FLOAT_MODE_P (mode))
1403 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
1404 : aarch64_tune_params.fp_reassoc_width;
1405 return 1;
1408 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1409 unsigned
1410 aarch64_debugger_regno (unsigned regno)
1412 if (GP_REGNUM_P (regno))
1413 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1414 else if (regno == SP_REGNUM)
1415 return AARCH64_DWARF_SP;
1416 else if (FP_REGNUM_P (regno))
1417 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1418 else if (PR_REGNUM_P (regno))
1419 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1420 else if (regno == VG_REGNUM)
1421 return AARCH64_DWARF_VG;
1423 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1424 equivalent DWARF register. */
1425 return DWARF_FRAME_REGISTERS;
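/* For example, under the AArch64 DWARF register numbering this maps x0-x30
   to 0-30, sp to 31, v0-v31 to 64-95, p0-p15 to 48-63 and the vector
   granule register VG to 46.  */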
1428 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
1429 static machine_mode
1430 aarch64_dwarf_frame_reg_mode (int regno)
1432 /* Predicate registers are call-clobbered in the EH ABI (which is
1433 ARM_PCS_AAPCS64), so they should not be described by CFI.
1434 Their size changes as VL changes, so any values computed by
1435 __builtin_init_dwarf_reg_size_table might not be valid for
1436 all frames. */
1437 if (PR_REGNUM_P (regno))
1438 return VOIDmode;
1439 return default_dwarf_frame_reg_mode (regno);
1442 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1443 integer, otherwise return X unmodified. */
1444 static rtx
1445 aarch64_bit_representation (rtx x)
1447 if (CONST_DOUBLE_P (x))
1448 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1449 return x;
1452 /* Return an estimate for the number of quadwords in an SVE vector. This is
1453 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
1454 static unsigned int
1455 aarch64_estimated_sve_vq ()
1457 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
1460 /* Return true if MODE is an SVE predicate mode. */
1461 static bool
1462 aarch64_sve_pred_mode_p (machine_mode mode)
1464 return (TARGET_SVE
1465 && (mode == VNx16BImode
1466 || mode == VNx8BImode
1467 || mode == VNx4BImode
1468 || mode == VNx2BImode));
1471 /* Three mutually-exclusive flags describing a vector or predicate type. */
1472 const unsigned int VEC_ADVSIMD = 1;
1473 const unsigned int VEC_SVE_DATA = 2;
1474 const unsigned int VEC_SVE_PRED = 4;
1475 /* Indicates a structure of 2, 3 or 4 vectors or predicates. */
1476 const unsigned int VEC_STRUCT = 8;
1477 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1478 vector has fewer significant bytes than a full SVE vector. */
1479 const unsigned int VEC_PARTIAL = 16;
1480 /* Useful combinations of the above. */
1481 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1482 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
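/* Some illustrative classifications, following the switch statement below:

     V16QImode   -> VEC_ADVSIMD                  (128-bit Advanced SIMD)
     VNx4SImode  -> VEC_SVE_DATA                 (full SVE data vector)
     VNx2SImode  -> VEC_SVE_DATA | VEC_PARTIAL   (partial SVE vector)
     VNx32QImode -> VEC_SVE_DATA | VEC_STRUCT    (x2 SVE tuple)
     V2x16QImode -> VEC_ADVSIMD | VEC_STRUCT     (pair of 128-bit vectors)
     VNx16BImode -> VEC_SVE_PRED                 (SVE predicate)  */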
1484 /* Return a set of flags describing the vector properties of mode MODE.
1485 If ANY_TARGET_P is false (the default), ignore modes that are not supported
1486 by the current target. Otherwise categorize the modes that can be used
1487 with the set of all targets supported by the port. */
1489 static unsigned int
1490 aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
1492 if (aarch64_sve_pred_mode_p (mode))
1493 return VEC_SVE_PRED;
1495 /* Make the decision based on the mode's enum value rather than its
1496 properties, so that we keep the correct classification regardless
1497 of -msve-vector-bits. */
1498 switch (mode)
1500 /* Partial SVE QI vectors. */
1501 case E_VNx2QImode:
1502 case E_VNx4QImode:
1503 case E_VNx8QImode:
1504 /* Partial SVE HI vectors. */
1505 case E_VNx2HImode:
1506 case E_VNx4HImode:
1507 /* Partial SVE SI vector. */
1508 case E_VNx2SImode:
1509 /* Partial SVE HF vectors. */
1510 case E_VNx2HFmode:
1511 case E_VNx4HFmode:
1512 /* Partial SVE BF vectors. */
1513 case E_VNx2BFmode:
1514 case E_VNx4BFmode:
1515 /* Partial SVE SF vector. */
1516 case E_VNx2SFmode:
1517 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1519 case E_VNx16QImode:
1520 case E_VNx8HImode:
1521 case E_VNx4SImode:
1522 case E_VNx2DImode:
1523 case E_VNx8BFmode:
1524 case E_VNx8HFmode:
1525 case E_VNx4SFmode:
1526 case E_VNx2DFmode:
1527 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;
1529 /* x2 SVE vectors. */
1530 case E_VNx32QImode:
1531 case E_VNx16HImode:
1532 case E_VNx8SImode:
1533 case E_VNx4DImode:
1534 case E_VNx16BFmode:
1535 case E_VNx16HFmode:
1536 case E_VNx8SFmode:
1537 case E_VNx4DFmode:
1538 /* x3 SVE vectors. */
1539 case E_VNx48QImode:
1540 case E_VNx24HImode:
1541 case E_VNx12SImode:
1542 case E_VNx6DImode:
1543 case E_VNx24BFmode:
1544 case E_VNx24HFmode:
1545 case E_VNx12SFmode:
1546 case E_VNx6DFmode:
1547 /* x4 SVE vectors. */
1548 case E_VNx64QImode:
1549 case E_VNx32HImode:
1550 case E_VNx16SImode:
1551 case E_VNx8DImode:
1552 case E_VNx32BFmode:
1553 case E_VNx32HFmode:
1554 case E_VNx16SFmode:
1555 case E_VNx8DFmode:
1556 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;
1558 case E_OImode:
1559 case E_CImode:
1560 case E_XImode:
1561 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1563 /* Structures of 64-bit Advanced SIMD vectors. */
1564 case E_V2x8QImode:
1565 case E_V2x4HImode:
1566 case E_V2x2SImode:
1567 case E_V2x1DImode:
1568 case E_V2x4BFmode:
1569 case E_V2x4HFmode:
1570 case E_V2x2SFmode:
1571 case E_V2x1DFmode:
1572 case E_V3x8QImode:
1573 case E_V3x4HImode:
1574 case E_V3x2SImode:
1575 case E_V3x1DImode:
1576 case E_V3x4BFmode:
1577 case E_V3x4HFmode:
1578 case E_V3x2SFmode:
1579 case E_V3x1DFmode:
1580 case E_V4x8QImode:
1581 case E_V4x4HImode:
1582 case E_V4x2SImode:
1583 case E_V4x1DImode:
1584 case E_V4x4BFmode:
1585 case E_V4x4HFmode:
1586 case E_V4x2SFmode:
1587 case E_V4x1DFmode:
1588 return (TARGET_FLOAT || any_target_p)
1589 ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
1591 /* Structures of 128-bit Advanced SIMD vectors. */
1592 case E_V2x16QImode:
1593 case E_V2x8HImode:
1594 case E_V2x4SImode:
1595 case E_V2x2DImode:
1596 case E_V2x8BFmode:
1597 case E_V2x8HFmode:
1598 case E_V2x4SFmode:
1599 case E_V2x2DFmode:
1600 case E_V3x16QImode:
1601 case E_V3x8HImode:
1602 case E_V3x4SImode:
1603 case E_V3x2DImode:
1604 case E_V3x8BFmode:
1605 case E_V3x8HFmode:
1606 case E_V3x4SFmode:
1607 case E_V3x2DFmode:
1608 case E_V4x16QImode:
1609 case E_V4x8HImode:
1610 case E_V4x4SImode:
1611 case E_V4x2DImode:
1612 case E_V4x8BFmode:
1613 case E_V4x8HFmode:
1614 case E_V4x4SFmode:
1615 case E_V4x2DFmode:
1616 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1618 /* 64-bit Advanced SIMD vectors. */
1619 case E_V8QImode:
1620 case E_V4HImode:
1621 case E_V2SImode:
1622 case E_V1DImode:
1623 case E_V4HFmode:
1624 case E_V4BFmode:
1625 case E_V2SFmode:
1626 case E_V1DFmode:
1627 /* 128-bit Advanced SIMD vectors. */
1628 case E_V16QImode:
1629 case E_V8HImode:
1630 case E_V4SImode:
1631 case E_V2DImode:
1632 case E_V8HFmode:
1633 case E_V8BFmode:
1634 case E_V4SFmode:
1635 case E_V2DFmode:
1636 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;
1638 case E_VNx32BImode:
1639 return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;
1641 default:
1642 return 0;
1646 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1647 bool
1648 aarch64_advsimd_struct_mode_p (machine_mode mode)
1650 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1651 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
1654 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1655 static bool
1656 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
1658 return (aarch64_classify_vector_mode (mode)
1659 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
1662 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1663 static bool
1664 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
1666 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
1669 /* Return true if MODE is any of the data vector modes, including
1670 structure modes. */
1671 static bool
1672 aarch64_vector_data_mode_p (machine_mode mode)
1674 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1677 /* Return true if MODE is any form of SVE mode, including predicates,
1678 vectors and structures. */
1679 bool
1680 aarch64_sve_mode_p (machine_mode mode)
1682 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1685 /* Return true if MODE is an SVE data vector mode; either a single vector
1686 or a structure of vectors. */
1687 static bool
1688 aarch64_sve_data_mode_p (machine_mode mode)
1690 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1693 /* Return the number of defined bytes in one constituent vector of
1694 SVE mode MODE, which has vector flags VEC_FLAGS. */
1695 static poly_int64
1696 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1698 if (vec_flags & VEC_PARTIAL)
1699 /* A single partial vector. */
1700 return GET_MODE_SIZE (mode);
1702 if (vec_flags & VEC_SVE_DATA)
1703 /* A single vector or a tuple. */
1704 return BYTES_PER_SVE_VECTOR;
1706 /* A single predicate. */
1707 gcc_assert (vec_flags & VEC_SVE_PRED);
1708 return BYTES_PER_SVE_PRED;
1711 /* If MODE holds an array of vectors, return the number of vectors
1712 in the array, otherwise return 1. */
1714 static unsigned int
1715 aarch64_ldn_stn_vectors (machine_mode mode)
1717 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1718 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
1719 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
1720 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
1721 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
1722 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
1723 return exact_div (GET_MODE_SIZE (mode),
1724 BYTES_PER_SVE_VECTOR).to_constant ();
1725 return 1;
1728 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1729 corresponding vector structure mode. */
1730 static opt_machine_mode
1731 aarch64_advsimd_vector_array_mode (machine_mode mode,
1732 unsigned HOST_WIDE_INT nelems)
1734 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
1735 if (known_eq (GET_MODE_SIZE (mode), 8))
1736 flags |= VEC_PARTIAL;
1738 machine_mode struct_mode;
1739 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
1740 if (aarch64_classify_vector_mode (struct_mode) == flags
1741 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
1742 && known_eq (GET_MODE_NUNITS (struct_mode),
1743 GET_MODE_NUNITS (mode) * nelems))
1744 return struct_mode;
1745 return opt_machine_mode ();
1748 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1750 opt_machine_mode
1751 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1753 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1754 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1755 machine_mode mode;
1756 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1757 if (inner_mode == GET_MODE_INNER (mode)
1758 && known_eq (nunits, GET_MODE_NUNITS (mode))
1759 && aarch64_sve_data_mode_p (mode))
1760 return mode;
1761 return opt_machine_mode ();
1764 /* Implement target hook TARGET_ARRAY_MODE. */
1765 static opt_machine_mode
1766 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1768 if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1770 /* Use VNx32BI for pairs of predicates, but explicitly reject giving
1771 a mode to other array sizes. Using integer modes requires a round
1772 trip through memory and generates terrible code. */
1773 if (nelems == 1)
1774 return mode;
1775 if (mode == VNx16BImode && nelems == 2)
1776 return VNx32BImode;
1777 return BLKmode;
1780 auto flags = aarch64_classify_vector_mode (mode);
1781 if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
1782 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
1783 GET_MODE_NUNITS (mode) * nelems);
1785 if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
1786 return aarch64_advsimd_vector_array_mode (mode, nelems);
1788 return opt_machine_mode ();
1791 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1792 static bool
1793 aarch64_array_mode_supported_p (machine_mode mode,
1794 unsigned HOST_WIDE_INT nelems)
1796 if (TARGET_BASE_SIMD
1797 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1798 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1799 && (nelems >= 2 && nelems <= 4))
1800 return true;
1802 return false;
1805 /* MODE is some form of SVE vector mode. For data modes, return the number
1806 of vector register bits that each element of MODE occupies, such as 64
1807 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1808 in a 64-bit container). For predicate modes, return the number of
1809 data bits controlled by each significant predicate bit. */
1811 static unsigned int
1812 aarch64_sve_container_bits (machine_mode mode)
1814 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1815 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1816 ? BITS_PER_SVE_VECTOR
1817 : GET_MODE_BITSIZE (mode));
1818 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1821 /* Return the SVE predicate mode to use for elements that have
1822 ELEM_NBYTES bytes, if such a mode exists. */
1824 opt_machine_mode
1825 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1827 if (TARGET_SVE)
1829 if (elem_nbytes == 1)
1830 return VNx16BImode;
1831 if (elem_nbytes == 2)
1832 return VNx8BImode;
1833 if (elem_nbytes == 4)
1834 return VNx4BImode;
1835 if (elem_nbytes == 8)
1836 return VNx2BImode;
1838 return opt_machine_mode ();
1841 /* Return the SVE predicate mode that should be used to control
1842 SVE mode MODE. */
1844 machine_mode
1845 aarch64_sve_pred_mode (machine_mode mode)
1847 unsigned int bits = aarch64_sve_container_bits (mode);
1848 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
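/* For example: VNx4SImode uses 32-bit containers, so it is governed by
   VNx4BImode, whereas the partial vector VNx2SImode stores each 32-bit
   element in a 64-bit container and is therefore governed by VNx2BImode,
   the same predicate mode as VNx2DImode.  */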
1851 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1853 static opt_machine_mode
1854 aarch64_get_mask_mode (machine_mode mode)
1856 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1857 if (vec_flags & VEC_SVE_DATA)
1858 return aarch64_sve_pred_mode (mode);
1860 return default_get_mask_mode (mode);
1863 /* Return the integer element mode associated with SVE mode MODE. */
1865 static scalar_int_mode
1866 aarch64_sve_element_int_mode (machine_mode mode)
1868 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1869 ? BITS_PER_SVE_VECTOR
1870 : GET_MODE_BITSIZE (mode));
1871 unsigned int elt_bits = vector_element_size (vector_bits,
1872 GET_MODE_NUNITS (mode));
1873 return int_mode_for_size (elt_bits, 0).require ();
1876 /* Return an integer element mode that contains exactly
1877 aarch64_sve_container_bits (MODE) bits. This is wider than
1878 aarch64_sve_element_int_mode if MODE is a partial vector,
1879 otherwise it's the same. */
1881 static scalar_int_mode
1882 aarch64_sve_container_int_mode (machine_mode mode)
1884 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1887 /* Return the integer vector mode associated with SVE mode MODE.
1888 Unlike related_int_vector_mode, this can handle the case in which
1889 MODE is a predicate (and thus has a different total size). */
1891 machine_mode
1892 aarch64_sve_int_mode (machine_mode mode)
1894 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1895 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1898 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1900 static opt_machine_mode
1901 aarch64_vectorize_related_mode (machine_mode vector_mode,
1902 scalar_mode element_mode,
1903 poly_uint64 nunits)
1905 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1907 /* If we're operating on SVE vectors, try to return an SVE mode. */
1908 poly_uint64 sve_nunits;
1909 if ((vec_flags & VEC_SVE_DATA)
1910 && multiple_p (BYTES_PER_SVE_VECTOR,
1911 GET_MODE_SIZE (element_mode), &sve_nunits))
1913 machine_mode sve_mode;
1914 if (maybe_ne (nunits, 0U))
1916 /* Try to find a full or partial SVE mode with exactly
1917 NUNITS units. */
1918 if (multiple_p (sve_nunits, nunits)
1919 && aarch64_sve_data_mode (element_mode,
1920 nunits).exists (&sve_mode))
1921 return sve_mode;
1923 else
1925 /* Take the preferred number of units from the number of bytes
1926 that fit in VECTOR_MODE. We always start by "autodetecting"
1927 a full vector mode with preferred_simd_mode, so vectors
1928 chosen here will also be full vector modes. Then
1929 autovectorize_vector_modes tries smaller starting modes
1930 and thus smaller preferred numbers of units. */
1931 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1932 if (aarch64_sve_data_mode (element_mode,
1933 sve_nunits).exists (&sve_mode))
1934 return sve_mode;
1938 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1939 if (TARGET_SIMD
1940 && (vec_flags & VEC_ADVSIMD)
1941 && known_eq (nunits, 0U)
1942 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1943 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1944 * GET_MODE_NUNITS (vector_mode), 128U))
1946 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1947 if (VECTOR_MODE_P (res))
1948 return res;
1951 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
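/* A worked instance of the 64-bit special case above: asking for a
   DImode-element vector related to V2SImode with NUNITS == 0 satisfies
   64 * 2 >= 128, so we return the 128-bit container for DImode elements
   (V2DImode) instead of falling through to the default hook.  */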
1954 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
1956 static bool
1957 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
1959 machine_mode mode = TYPE_MODE (type);
1960 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1961 bool sve_p = (vec_flags & VEC_ANY_SVE);
1962 bool simd_p = (vec_flags & VEC_ADVSIMD);
1964 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
1967 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1968 prefer to use the first arithmetic operand as the else value if
1969 the else value doesn't matter, since that exactly matches the SVE
1970 destructive merging form. For ternary operations we could either
1971 pick the first operand and use FMAD-like instructions or the last
1972 operand and use FMLA-like instructions; the latter seems more
1973 natural. */
1975 static tree
1976 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1978 return nops == 3 ? ops[2] : ops[0];
1981 /* Implement TARGET_HARD_REGNO_NREGS. */
1983 static unsigned int
1984 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1986 /* ??? Logically we should only need to provide a value when
1987 HARD_REGNO_MODE_OK says that the combination is valid,
1988 but at the moment we need to handle all modes. Just ignore
1989 any runtime parts for registers that can't store them. */
1990 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1991 switch (aarch64_regno_regclass (regno))
1993 case FP_REGS:
1994 case FP_LO_REGS:
1995 case FP_LO8_REGS:
1997 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1998 if (vec_flags & VEC_SVE_DATA)
1999 return exact_div (GET_MODE_SIZE (mode),
2000 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2001 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
2002 return GET_MODE_SIZE (mode).to_constant () / 8;
2003 return CEIL (lowest_size, UNITS_PER_VREG);
2006 case PR_REGS:
2007 case PR_LO_REGS:
2008 case PR_HI_REGS:
2009 return mode == VNx32BImode ? 2 : 1;
2011 case FFR_REGS:
2012 case PR_AND_FFR_REGS:
2013 case FAKE_REGS:
2014 return 1;
2016 default:
2017 return CEIL (lowest_size, UNITS_PER_WORD);
2019 gcc_unreachable ();
2022 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2024 static bool
2025 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2027 if (mode == V8DImode)
2028 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
2029 && multiple_p (regno - R0_REGNUM, 2);
2031 if (GET_MODE_CLASS (mode) == MODE_CC)
2032 return regno == CC_REGNUM;
2034 if (regno == VG_REGNUM)
2035 /* This must have the same size as _Unwind_Word. */
2036 return mode == DImode;
2038 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2039 if (vec_flags == VEC_SVE_PRED)
2040 return pr_or_ffr_regnum_p (regno);
2042 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
2043 return PR_REGNUM_P (regno);
2045 if (pr_or_ffr_regnum_p (regno))
2046 return false;
2048 /* These registers are abstract; their modes don't matter. */
2049 if (FAKE_REGNUM_P (regno))
2050 return true;
2052 if (regno == SP_REGNUM)
2053 /* The purpose of comparing with ptr_mode is to support the
2054 global register variable associated with the stack pointer
2055 register via the syntax of asm ("wsp") in ILP32. */
2056 return mode == Pmode || mode == ptr_mode;
2058 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2059 return mode == Pmode;
2061 if (GP_REGNUM_P (regno))
2063 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
2064 return false;
2065 if (known_le (GET_MODE_SIZE (mode), 8))
2066 return true;
2067 if (known_le (GET_MODE_SIZE (mode), 16))
2068 return (regno & 1) == 0;
2070 else if (FP_REGNUM_P (regno))
2072 if (vec_flags & VEC_STRUCT)
2073 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2074 else
2075 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2078 return false;
2081 /* Return true if a function with type FNTYPE returns its value in
2082 SVE vector or predicate registers. */
2084 static bool
2085 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2087 tree return_type = TREE_TYPE (fntype);
2089 pure_scalable_type_info pst_info;
2090 switch (pst_info.analyze (return_type))
2092 case pure_scalable_type_info::IS_PST:
2093 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2094 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2096 case pure_scalable_type_info::DOESNT_MATTER:
2097 gcc_assert (aarch64_return_in_memory_1 (return_type));
2098 return false;
2100 case pure_scalable_type_info::NO_ABI_IDENTITY:
2101 case pure_scalable_type_info::ISNT_PST:
2102 return false;
2104 gcc_unreachable ();
2107 /* Return true if a function with type FNTYPE takes arguments in
2108 SVE vector or predicate registers. */
2110 static bool
2111 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2113 CUMULATIVE_ARGS args_so_far_v;
2114 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2115 NULL_TREE, 0, true);
2116 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2118 for (tree chain = TYPE_ARG_TYPES (fntype);
2119 chain && chain != void_list_node;
2120 chain = TREE_CHAIN (chain))
2122 tree arg_type = TREE_VALUE (chain);
2123 if (arg_type == error_mark_node)
2124 return false;
2126 function_arg_info arg (arg_type, /*named=*/true);
2127 apply_pass_by_reference_rules (&args_so_far_v, arg);
2128 pure_scalable_type_info pst_info;
2129 if (pst_info.analyze_registers (arg.type))
2131 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2132 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2133 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2134 return true;
2137 targetm.calls.function_arg_advance (args_so_far, arg);
2139 return false;
2142 /* Implement TARGET_FNTYPE_ABI. */
2144 static const predefined_function_abi &
2145 aarch64_fntype_abi (const_tree fntype)
2147 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2148 return aarch64_simd_abi ();
2150 if (aarch64_returns_value_in_sve_regs_p (fntype)
2151 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2152 return aarch64_sve_abi ();
2154 return default_function_abi;
2157 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2159 static aarch64_feature_flags
2160 aarch64_fntype_pstate_sm (const_tree fntype)
2162 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
2163 return AARCH64_FL_SM_ON;
2165 if (lookup_attribute ("arm", "streaming_compatible",
2166 TYPE_ATTRIBUTES (fntype)))
2167 return 0;
2169 return AARCH64_FL_SM_OFF;
2172 /* Return state flags that describe whether and how functions of type
2173 FNTYPE share state STATE_NAME with their callers. */
2175 static unsigned int
2176 aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
2178 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
2179 state_name);
2182 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2184 static aarch64_feature_flags
2185 aarch64_fntype_pstate_za (const_tree fntype)
2187 if (aarch64_fntype_shared_flags (fntype, "za")
2188 || aarch64_fntype_shared_flags (fntype, "zt0"))
2189 return AARCH64_FL_ZA_ON;
2191 return 0;
2194 /* Return the ISA mode on entry to functions of type FNTYPE. */
2196 static aarch64_feature_flags
2197 aarch64_fntype_isa_mode (const_tree fntype)
2199 return (aarch64_fntype_pstate_sm (fntype)
2200 | aarch64_fntype_pstate_za (fntype));
2203 /* Return true if FNDECL uses streaming mode internally, as an
2204 implementation choice. */
2206 static bool
2207 aarch64_fndecl_is_locally_streaming (const_tree fndecl)
2209 return lookup_attribute ("arm", "locally_streaming",
2210 DECL_ATTRIBUTES (fndecl));
2213 /* Return the state of PSTATE.SM when compiling the body of
2214 function FNDECL. This might be different from the state of
2215 PSTATE.SM on entry. */
2217 static aarch64_feature_flags
2218 aarch64_fndecl_pstate_sm (const_tree fndecl)
2220 if (aarch64_fndecl_is_locally_streaming (fndecl))
2221 return AARCH64_FL_SM_ON;
2223 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
2226 /* Return true if function FNDECL has state STATE_NAME, either by creating
2227 new state itself or by sharing state with callers. */
2229 static bool
2230 aarch64_fndecl_has_state (tree fndecl, const char *state_name)
2232 return (aarch64_fndecl_has_new_state (fndecl, state_name)
2233 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
2234 state_name) != 0);
2237 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2238 This might be different from the state of PSTATE.ZA on entry. */
2240 static aarch64_feature_flags
2241 aarch64_fndecl_pstate_za (const_tree fndecl)
2243 if (aarch64_fndecl_has_new_state (fndecl, "za")
2244 || aarch64_fndecl_has_new_state (fndecl, "zt0"))
2245 return AARCH64_FL_ZA_ON;
2247 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
2250 /* Return the ISA mode that should be used to compile the body of
2251 function FNDECL. */
2253 static aarch64_feature_flags
2254 aarch64_fndecl_isa_mode (const_tree fndecl)
2256 return (aarch64_fndecl_pstate_sm (fndecl)
2257 | aarch64_fndecl_pstate_za (fndecl));
2260 /* Return the state of PSTATE.SM on entry to the current function.
2261 This might be different from the state of PSTATE.SM in the function
2262 body. */
2264 static aarch64_feature_flags
2265 aarch64_cfun_incoming_pstate_sm ()
2267 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
2270 /* Return the state of PSTATE.ZA on entry to the current function.
2271 This might be different from the state of PSTATE.ZA in the function
2272 body. */
2274 static aarch64_feature_flags
2275 aarch64_cfun_incoming_pstate_za ()
2277 return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
2280 /* Return state flags that describe whether and how the current function shares
2281 state STATE_NAME with callers. */
2283 static unsigned int
2284 aarch64_cfun_shared_flags (const char *state_name)
2286 return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
2289 /* Return true if the current function creates new state of type STATE_NAME
2290 (as opposed to sharing the state with its callers or ignoring the state
2291 altogether). */
2293 static bool
2294 aarch64_cfun_has_new_state (const char *state_name)
2296 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
2299 /* Return true if PSTATE.SM is 1 in the body of the current function,
2300 but is not guaranteed to be 1 on entry. */
2302 static bool
2303 aarch64_cfun_enables_pstate_sm ()
2305 return (aarch64_fndecl_is_locally_streaming (cfun->decl)
2306 && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
2309 /* Return true if the current function has state STATE_NAME, either by
2310 creating new state itself or by sharing state with callers. */
2312 static bool
2313 aarch64_cfun_has_state (const char *state_name)
2315 return aarch64_fndecl_has_state (cfun->decl, state_name);
2318 /* Return true if a call from the current function to a function with
2319 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2320 the BL instruction. */
2322 static bool
2323 aarch64_call_switches_pstate_sm (aarch64_feature_flags callee_mode)
2325 return (callee_mode & ~AARCH64_ISA_MODE & AARCH64_FL_SM_STATE) != 0;
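/* For example, if the current function's body runs with PSTATE.SM clear
   (AARCH64_ISA_MODE contains AARCH64_FL_SM_OFF) and CALLEE_MODE requires
   AARCH64_FL_SM_ON, the SM bits differ and the call needs PSTATE.SM to be
   switched around the BL (typically with an SMSTART SM/SMSTOP SM pair).
   A streaming-compatible callee sets neither SM bit in CALLEE_MODE and so
   never forces a switch.  */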
2328 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2330 static bool
2331 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2333 return (aarch64_sve::builtin_type_p (type1)
2334 == aarch64_sve::builtin_type_p (type2));
2337 /* Return true if we should emit CFI for register REGNO. */
2339 static bool
2340 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2342 return (GP_REGNUM_P (regno)
2343 || !default_function_abi.clobbers_full_reg_p (regno));
2346 /* Return the mode we should use to save and restore register REGNO. */
2348 static machine_mode
2349 aarch64_reg_save_mode (unsigned int regno)
2351 if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
2352 return DImode;
2354 if (FP_REGNUM_P (regno))
2355 switch (crtl->abi->id ())
2357 case ARM_PCS_AAPCS64:
2358 /* Only the low 64 bits are saved by the base PCS. */
2359 return DFmode;
2361 case ARM_PCS_SIMD:
2362 /* The vector PCS saves the low 128 bits (which is the full
2363 register on non-SVE targets). */
2364 return TFmode;
2366 case ARM_PCS_SVE:
2367 /* Use vectors of DImode for registers that need frame
2368 information, so that the first 64 bytes of the save slot
2369 are always the equivalent of what storing D<n> would give. */
2370 if (aarch64_emit_cfi_for_reg_p (regno))
2371 return VNx2DImode;
2373 /* Use vectors of bytes otherwise, so that the layout is
2374 endian-agnostic, and so that we can use LDR and STR for
2375 big-endian targets. */
2376 return VNx16QImode;
2378 case ARM_PCS_TLSDESC:
2379 case ARM_PCS_UNKNOWN:
2380 break;
2383 if (PR_REGNUM_P (regno))
2384 /* Save the full predicate register. */
2385 return VNx16BImode;
2387 gcc_unreachable ();
2390 /* Given the ISA mode on entry to a callee and the ABI of the callee,
2391 return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx. */
2394 aarch64_gen_callee_cookie (aarch64_feature_flags isa_mode, arm_pcs pcs_variant)
2396 return gen_int_mode ((unsigned int) isa_mode
2397 | (unsigned int) pcs_variant << AARCH64_NUM_ISA_MODES,
2398 DImode);
2401 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2402 callee's ABI. */
2404 static const predefined_function_abi &
2405 aarch64_callee_abi (rtx cookie)
2407 return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
2410 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2411 required ISA mode on entry to the callee, which is also the ISA
2412 mode on return from the callee. */
2414 static aarch64_feature_flags
2415 aarch64_callee_isa_mode (rtx cookie)
2417 return UINTVAL (cookie) & AARCH64_FL_ISA_MODES;
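/* A sketch of the cookie layout implied by the three functions above:
   the low AARCH64_NUM_ISA_MODES bits hold the ISA mode on entry to the
   callee and the remaining bits hold the arm_pcs value, i.e.

     cookie = isa_mode | (pcs_variant << AARCH64_NUM_ISA_MODES)

   which the decoders undo with "& AARCH64_FL_ISA_MODES" and
   ">> AARCH64_NUM_ISA_MODES" respectively.  */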
2420 /* INSN is a call instruction. Return the CONST_INT stored in its
2421 UNSPEC_CALLEE_ABI rtx. */
2423 static rtx
2424 aarch64_insn_callee_cookie (const rtx_insn *insn)
2426 rtx pat = PATTERN (insn);
2427 gcc_assert (GET_CODE (pat) == PARALLEL);
2428 rtx unspec = XVECEXP (pat, 0, 1);
2429 gcc_assert (GET_CODE (unspec) == UNSPEC
2430 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2431 return XVECEXP (unspec, 0, 0);
2434 /* Implement TARGET_INSN_CALLEE_ABI. */
2436 const predefined_function_abi &
2437 aarch64_insn_callee_abi (const rtx_insn *insn)
2439 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
2442 /* INSN is a call instruction. Return the required ISA mode on entry to
2443 the callee, which is also the ISA mode on return from the callee. */
2445 static aarch64_feature_flags
2446 aarch64_insn_callee_isa_mode (const rtx_insn *insn)
2448 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
2451 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2452 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2453 clobbers the top 64 bits when restoring the bottom 64 bits. */
2455 static bool
2456 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2457 unsigned int regno,
2458 machine_mode mode)
2460 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2462 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2463 unsigned int nregs = hard_regno_nregs (regno, mode);
2464 if (nregs > 1)
2465 per_register_size = exact_div (per_register_size, nregs);
2466 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2467 return maybe_gt (per_register_size, 16);
2468 return maybe_gt (per_register_size, 8);
2470 return false;
2473 /* Implement REGMODE_NATURAL_SIZE. */
2474 poly_uint64
2475 aarch64_regmode_natural_size (machine_mode mode)
2477 /* The natural size for SVE data modes is one SVE data vector,
2478 and similarly for predicates. We can't independently modify
2479 anything smaller than that. */
2480 /* ??? For now, only do this for variable-width SVE registers.
2481 Doing it for constant-sized registers breaks lower-subreg.cc. */
2482 /* ??? And once that's fixed, we should probably have similar
2483 code for Advanced SIMD. */
2484 if (!aarch64_sve_vg.is_constant ())
2486 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2487 if (vec_flags & VEC_SVE_PRED)
2488 return BYTES_PER_SVE_PRED;
2489 if (vec_flags & VEC_SVE_DATA)
2490 return BYTES_PER_SVE_VECTOR;
2492 return UNITS_PER_WORD;
2495 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2496 machine_mode
2497 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2498 machine_mode mode)
2500 /* The predicate mode determines which bits are significant and
2501 which are "don't care". Decreasing the number of lanes would
2502 lose data while increasing the number of lanes would make bits
2503 unnecessarily significant. */
2504 if (PR_REGNUM_P (regno))
2505 return mode;
2506 if (known_ge (GET_MODE_SIZE (mode), 4))
2507 return mode;
2508 else
2509 return SImode;
2512 /* Return true if I's bits are consecutive ones from the MSB. */
2513 bool
2514 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2516 return exact_log2 (-i) != HOST_WIDE_INT_M1;
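/* Two worked examples: for I == 0xffff000000000000, -I is
   0x0001000000000000 == 1 << 48, so exact_log2 returns 48 and the result
   is true (the top 16 bits form a block of ones).  For
   I == 0xff00ff0000000000, -I is 0x00ff010000000000, which is not a power
   of two, so the result is false.  I == 0 is also rejected, since
   exact_log2 (0) is HOST_WIDE_INT_M1.  */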
2519 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2520 that strcpy from constants will be faster. */
2522 static HOST_WIDE_INT
2523 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2525 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2526 return MAX (align, BITS_PER_WORD);
2527 return align;
2530 /* Return true if calls to DECL should be treated as
2531 long-calls (i.e. called via a register). */
2532 static bool
2533 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2535 return false;
2538 /* Return true if calls to symbol-ref SYM should be treated as
2539 long-calls (i.e. called via a register). */
2540 bool
2541 aarch64_is_long_call_p (rtx sym)
2543 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2546 /* Return true if calls to symbol-ref SYM should not go through
2547 plt stubs. */
2549 bool
2550 aarch64_is_noplt_call_p (rtx sym)
2552 const_tree decl = SYMBOL_REF_DECL (sym);
2554 if (flag_pic
2555 && decl
2556 && (!flag_plt
2557 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2558 && !targetm.binds_local_p (decl))
2559 return true;
2561 return false;
2564 /* Emit an insn that's a simple single-set. Both the operands must be
2565 known to be valid. */
2566 inline static rtx_insn *
2567 emit_set_insn (rtx x, rtx y)
2569 return emit_insn (gen_rtx_SET (x, y));
2572 /* X and Y are two things to compare using CODE. Emit the compare insn and
2573 return the rtx for the CC register in the proper mode. */
2575 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2577 machine_mode cmp_mode = GET_MODE (x);
2578 machine_mode cc_mode;
2579 rtx cc_reg;
2581 if (cmp_mode == TImode)
2583 gcc_assert (code == NE);
2585 cc_mode = CCmode;
2586 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2588 rtx x_lo = operand_subword (x, 0, 0, TImode);
2589 rtx y_lo = operand_subword (y, 0, 0, TImode);
2590 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2592 rtx x_hi = operand_subword (x, 1, 0, TImode);
2593 rtx y_hi = operand_subword (y, 1, 0, TImode);
2594 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2595 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2596 GEN_INT (AARCH64_EQ)));
2598 else
2600 cc_mode = SELECT_CC_MODE (code, x, y);
2601 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2602 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2604 return cc_reg;
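/* A sketch of the TImode sequence emitted above, assuming X lives in
   x0/x1 and Y in x2/x3 (low word first on little-endian):

     cmp  x0, x2          // compare the low halves
     ccmp x1, x3, 0, eq   // if equal, compare the high halves;
                          // otherwise leave the flags indicating "ne"

   so the returned CC register is suitable for the NE test that the
   caller asserted.  */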
2607 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2609 static rtx
2610 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2611 machine_mode y_mode)
2613 if (y_mode == E_QImode || y_mode == E_HImode)
2615 if (CONST_INT_P (y))
2617 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2618 y_mode = SImode;
2620 else
2622 rtx t, cc_reg;
2623 machine_mode cc_mode;
2625 t = gen_rtx_ZERO_EXTEND (SImode, y);
2626 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2627 cc_mode = CC_SWPmode;
2628 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2629 emit_set_insn (cc_reg, t);
2630 return cc_reg;
2634 if (!aarch64_plus_operand (y, y_mode))
2635 y = force_reg (y_mode, y);
2637 return aarch64_gen_compare_reg (code, x, y);
2640 /* Consider the operation:
2642 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2644 where:
2646 - CODE is [SU]MAX or [SU]MIN
2647 - OPERANDS[2] and OPERANDS[3] are constant integers
2648 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2649 - all operands have mode MODE
2651 Decide whether it is possible to implement the operation using:
2653 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2655 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2657 followed by:
2659 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2661 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2662 If GENERATE_P is true, also update OPERANDS as follows:
2664 OPERANDS[4] = -OPERANDS[3]
2665 OPERANDS[5] = the rtl condition representing <cond>
2666 OPERANDS[6] = <tmp>
2667 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2668 bool
2669 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
2671 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
2672 rtx dst = operands[0];
2673 rtx maxmin_op = operands[2];
2674 rtx add_op = operands[3];
2675 machine_mode mode = GET_MODE (dst);
2677 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2678 == (x >= y ? x : y) - z
2679 == (x > y ? x : y) - z
2680 == (x > y - 1 ? x : y) - z
2682 min (x, y) - z == (x <= y - 1 ? x : y) - z
2683 == (x <= y ? x : y) - z
2684 == (x < y ? x : y) - z
2685 == (x < y + 1 ? x : y) - z
2687 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2688 which x is compared with z. Set DIFF to y - z. Thus the supported
2689 combinations are as follows, with DIFF being the value after the ":":
2691 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2692 == x >= y ? x - y : 0 [z == y]
2693 == x > y ? x - y : 0 [z == y]
2694 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2696 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2697 == x <= y ? x - y : 0 [z == y]
2698 == x < y ? x - y : 0 [z == y]
2699 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2700 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
2701 auto add_val = rtx_mode_t (add_op, mode);
2702 auto sub_val = wi::neg (add_val);
2703 auto diff = wi::sub (maxmin_val, sub_val);
2704 if (!(diff == 0
2705 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
2706 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
2707 return false;
2709 if (!generate_p)
2710 return true;
2712 rtx_code cmp;
2713 switch (code)
2715 case SMAX:
2716 cmp = diff == 1 ? GT : GE;
2717 break;
2718 case UMAX:
2719 cmp = diff == 1 ? GTU : GEU;
2720 break;
2721 case SMIN:
2722 cmp = diff == -1 ? LT : LE;
2723 break;
2724 case UMIN:
2725 cmp = diff == -1 ? LTU : LEU;
2726 break;
2727 default:
2728 gcc_unreachable ();
2730 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
2732 operands[4] = immed_wide_int_const (sub_val, mode);
2733 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
2734 if (can_create_pseudo_p ())
2735 operands[6] = gen_reg_rtx (mode);
2736 else
2737 operands[6] = dst;
2738 operands[7] = immed_wide_int_const (diff, mode);
2740 return true;
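/* A worked example of the DIFF == 0 case: for smax (x, 3) + (-3) in
   SImode, SUB_VAL is 3 and DIFF is 0, so CMP is GE and OPERANDS[7] is 0
   (the CSEL form), giving roughly

     subs w<tmp>, w<x>, #3
     csel w<dst>, w<tmp>, wzr, ge

   which produces x - 3 when x >= 3 and 0 otherwise, i.e. smax (x, 3) - 3.  */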
2744 /* Build the SYMBOL_REF for __tls_get_addr. */
2746 static GTY(()) rtx tls_get_addr_libfunc;
2749 aarch64_tls_get_addr (void)
2751 if (!tls_get_addr_libfunc)
2752 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2753 return tls_get_addr_libfunc;
2756 /* Return the TLS model to use for ADDR. */
2758 static enum tls_model
2759 tls_symbolic_operand_type (rtx addr)
2761 enum tls_model tls_kind = TLS_MODEL_NONE;
2762 poly_int64 offset;
2763 addr = strip_offset_and_salt (addr, &offset);
2764 if (SYMBOL_REF_P (addr))
2765 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2767 return tls_kind;
2770 /* We'll allow lo_sums in addresses in our legitimate addresses,
2771 so that combine can take care of combining addresses where
2772 necessary, but for generation purposes we'll generate the address
2773 as:
2774 RTL Absolute
2775 tmp = hi (symbol_ref); adrp x1, foo
2776 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2779 PIC TLS
2780 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2781 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2782 bl __tls_get_addr
2785 Load TLS symbol, depending on TLS mechanism and TLS access model.
2787 Global Dynamic - Traditional TLS:
2788 adrp tmp, :tlsgd:imm
2789 add dest, tmp, #:tlsgd_lo12:imm
2790 bl __tls_get_addr
2792 Global Dynamic - TLS Descriptors:
2793 adrp dest, :tlsdesc:imm
2794 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2795 add dest, dest, #:tlsdesc_lo12:imm
2796 blr tmp
2797 mrs tp, tpidr_el0
2798 add dest, dest, tp
2800 Initial Exec:
2801 mrs tp, tpidr_el0
2802 adrp tmp, :gottprel:imm
2803 ldr dest, [tmp, #:gottprel_lo12:imm]
2804 add dest, dest, tp
2806 Local Exec:
2807 mrs tp, tpidr_el0
2808 add t0, tp, #:tprel_hi12:imm, lsl #12
2809 add t0, t0, #:tprel_lo12_nc:imm
2812 static void
2813 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2814 enum aarch64_symbol_type type)
2816 switch (type)
2818 case SYMBOL_SMALL_ABSOLUTE:
2820 /* In ILP32, the mode of dest can be either SImode or DImode. */
2821 rtx tmp_reg = dest;
2822 machine_mode mode = GET_MODE (dest);
2824 gcc_assert (mode == Pmode || mode == ptr_mode);
2826 if (can_create_pseudo_p ())
2827 tmp_reg = gen_reg_rtx (mode);
2829 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
2830 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2831 return;
2834 case SYMBOL_TINY_ABSOLUTE:
2835 emit_insn (gen_rtx_SET (dest, imm));
2836 return;
2838 case SYMBOL_SMALL_GOT_28K:
2840 machine_mode mode = GET_MODE (dest);
2841 rtx gp_rtx = pic_offset_table_rtx;
2842 rtx insn;
2843 rtx mem;
2845 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2846 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
2847 decide rtx costs, in which case pic_offset_table_rtx is not
2848 initialized. In that case there is no need to generate the first adrp
2849 instruction, as the final cost for global variable access is
2850 one instruction. */
2851 if (gp_rtx != NULL)
2853 /* -fpic for -mcmodel=small allows a 32K GOT table size (but because we
2854 use the page base as the GOT base, the first page may be wasted; in
2855 the worst case there is only 28K of space for the GOT).
2857 The generated instruction sequence for accessing a global variable is:
2860 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2862 Only one instruction is needed. But we must initialize
2863 pic_offset_table_rtx properly. We generate an initialization insn for
2864 every global access, and allow CSE to remove all the redundant ones.
2866 The final instruction sequence will look like the following
2867 for multiple global variable accesses.
2869 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2871 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2872 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2873 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2874 ... */
2876 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2877 crtl->uses_pic_offset_table = 1;
2878 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2880 if (mode != GET_MODE (gp_rtx))
2881 gp_rtx = gen_lowpart (mode, gp_rtx);
2885 if (mode == ptr_mode)
2887 if (mode == DImode)
2888 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2889 else
2890 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2892 mem = XVECEXP (SET_SRC (insn), 0, 0);
2894 else
2896 gcc_assert (mode == Pmode);
2898 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2899 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2902 /* The operand is expected to be a MEM. Whenever the related insn
2903 pattern changes, the code above which calculates MEM should be
2904 updated. */
2905 gcc_assert (MEM_P (mem));
2906 MEM_READONLY_P (mem) = 1;
2907 MEM_NOTRAP_P (mem) = 1;
2908 emit_insn (insn);
2909 return;
2912 case SYMBOL_SMALL_GOT_4G:
2913 emit_insn (gen_rtx_SET (dest, imm));
2914 return;
2916 case SYMBOL_SMALL_TLSGD:
2918 rtx_insn *insns;
2919 /* The return type of __tls_get_addr is the C pointer type
2920 so use ptr_mode. */
2921 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
2922 rtx tmp_reg = dest;
2924 if (GET_MODE (dest) != ptr_mode)
2925 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
2927 start_sequence ();
2928 if (ptr_mode == SImode)
2929 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2930 else
2931 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2932 insns = get_insns ();
2933 end_sequence ();
2935 RTL_CONST_CALL_P (insns) = 1;
2936 emit_libcall_block (insns, tmp_reg, result, imm);
2937 /* Convert back to the mode of the dest adding a zero_extend
2938 from SImode (ptr_mode) to DImode (Pmode). */
2939 if (dest != tmp_reg)
2940 convert_move (dest, tmp_reg, true);
2941 return;
2944 case SYMBOL_SMALL_TLSDESC:
2946 machine_mode mode = GET_MODE (dest);
2947 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2948 rtx tp;
2950 gcc_assert (mode == Pmode || mode == ptr_mode);
2952 /* In ILP32, the got entry is always of SImode size. Unlike
2953 small GOT, the dest is fixed at reg 0. */
2954 if (TARGET_ILP32)
2955 emit_insn (gen_tlsdesc_small_si (imm));
2956 else
2957 emit_insn (gen_tlsdesc_small_di (imm));
2958 tp = aarch64_load_tp (NULL);
2960 if (mode != Pmode)
2961 tp = gen_lowpart (mode, tp);
2963 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2964 if (REG_P (dest))
2965 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2966 return;
2969 case SYMBOL_SMALL_TLSIE:
2971 /* In ILP32, the mode of dest can be either SImode or DImode,
2972 while the got entry is always of SImode size. The mode of
2973 dest depends on how dest is used: if dest is assigned to a
2974 pointer (e.g. in the memory), it has SImode; it may have
2975 DImode if dest is dereferenced to access the memory.
2976 This is why we have to handle three different tlsie_small
2977 patterns here (two patterns for ILP32). */
2978 machine_mode mode = GET_MODE (dest);
2979 rtx tmp_reg = gen_reg_rtx (mode);
2980 rtx tp = aarch64_load_tp (NULL);
2982 if (mode == ptr_mode)
2984 if (mode == DImode)
2985 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2986 else
2988 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2989 tp = gen_lowpart (mode, tp);
2992 else
2994 gcc_assert (mode == Pmode);
2995 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2998 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2999 if (REG_P (dest))
3000 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3001 return;
3004 case SYMBOL_TLSLE12:
3005 case SYMBOL_TLSLE24:
3006 case SYMBOL_TLSLE32:
3007 case SYMBOL_TLSLE48:
3009 machine_mode mode = GET_MODE (dest);
3010 rtx tp = aarch64_load_tp (NULL);
3012 if (mode != Pmode)
3013 tp = gen_lowpart (mode, tp);
3015 switch (type)
3017 case SYMBOL_TLSLE12:
3018 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3019 (dest, tp, imm));
3020 break;
3021 case SYMBOL_TLSLE24:
3022 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3023 (dest, tp, imm));
3024 break;
3025 case SYMBOL_TLSLE32:
3026 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3027 (dest, imm));
3028 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3029 (dest, dest, tp));
3030 break;
3031 case SYMBOL_TLSLE48:
3032 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3033 (dest, imm));
3034 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3035 (dest, dest, tp));
3036 break;
3037 default:
3038 gcc_unreachable ();
3041 if (REG_P (dest))
3042 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3043 return;
3046 case SYMBOL_TINY_GOT:
3048 rtx insn;
3049 machine_mode mode = GET_MODE (dest);
3051 if (mode == ptr_mode)
3052 insn = gen_ldr_got_tiny (mode, dest, imm);
3053 else
3055 gcc_assert (mode == Pmode);
3056 insn = gen_ldr_got_tiny_sidi (dest, imm);
3059 emit_insn (insn);
3060 return;
3063 case SYMBOL_TINY_TLSIE:
3065 machine_mode mode = GET_MODE (dest);
3066 rtx tp = aarch64_load_tp (NULL);
3068 if (mode == ptr_mode)
3070 if (mode == DImode)
3071 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3072 else
3074 tp = gen_lowpart (mode, tp);
3075 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3078 else
3080 gcc_assert (mode == Pmode);
3081 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3084 if (REG_P (dest))
3085 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3086 return;
3089 default:
3090 gcc_unreachable ();
3094 /* Emit a move from SRC to DEST. Assume that the move expanders can
3095 handle all moves if !can_create_pseudo_p (). The distinction is
3096 important because, unlike emit_move_insn, the move expanders know
3097 how to force Pmode objects into the constant pool even when the
3098 constant pool address is not itself legitimate. */
3099 static rtx
3100 aarch64_emit_move (rtx dest, rtx src)
3102 return (can_create_pseudo_p ()
3103 ? emit_move_insn (dest, src)
3104 : emit_move_insn_1 (dest, src));
3107 /* Apply UNOPTAB to OP and store the result in DEST. */
3109 static void
3110 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3112 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3113 if (dest != tmp)
3114 emit_move_insn (dest, tmp);
3117 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3119 static void
3120 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3122 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3123 OPTAB_DIRECT);
3124 if (dest != tmp)
3125 emit_move_insn (dest, tmp);
3128 /* Split a move from SRC to DST into two moves of mode SINGLE_MODE. */
3130 void
3131 aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
3133 machine_mode mode = GET_MODE (dst);
3135 rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
3136 rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
3137 GET_MODE_SIZE (single_mode));
3138 rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
3139 rtx src1 = simplify_gen_subreg (single_mode, src, mode,
3140 GET_MODE_SIZE (single_mode));
3142 /* At most one pairing may overlap. */
3143 if (reg_overlap_mentioned_p (dst0, src1))
3145 aarch64_emit_move (dst1, src1);
3146 aarch64_emit_move (dst0, src0);
3148 else
3150 aarch64_emit_move (dst0, src0);
3151 aarch64_emit_move (dst1, src1);
3155 /* Split a 128-bit move operation into two 64-bit move operations,
3156 taking care to handle partial overlap of register to register
3157 copies. Special cases are needed when moving between GP regs and
3158 FP regs. SRC can be a register, constant or memory; DST a register
3159 or memory. If either operand is memory it must not have any side
3160 effects. */
3161 void
3162 aarch64_split_128bit_move (rtx dst, rtx src)
3164 machine_mode mode = GET_MODE (dst);
3166 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
3167 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3168 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3170 if (REG_P (dst) && REG_P (src))
3172 int src_regno = REGNO (src);
3173 int dst_regno = REGNO (dst);
3175 /* Handle FP <-> GP regs. */
3176 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3178 rtx src_lo = gen_lowpart (word_mode, src);
3179 rtx src_hi = gen_highpart (word_mode, src);
3181 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3182 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3183 return;
3185 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3187 rtx dst_lo = gen_lowpart (word_mode, dst);
3188 rtx dst_hi = gen_highpart (word_mode, dst);
3190 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3191 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3192 return;
3196 aarch64_split_double_move (dst, src, word_mode);
3199 /* Return true if we should split a move from 128-bit value SRC
3200 to 128-bit register DEST. */
3202 bool
3203 aarch64_split_128bit_move_p (rtx dst, rtx src)
3205 if (FP_REGNUM_P (REGNO (dst)))
3206 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3207 /* All moves to GPRs need to be split. */
3208 return true;
3211 /* Split a complex SIMD move. */
3213 void
3214 aarch64_split_simd_move (rtx dst, rtx src)
3216 machine_mode src_mode = GET_MODE (src);
3217 machine_mode dst_mode = GET_MODE (dst);
3219 gcc_assert (VECTOR_MODE_P (dst_mode));
3221 if (REG_P (dst) && REG_P (src))
3223 gcc_assert (VECTOR_MODE_P (src_mode));
3224 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3228 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3229 The semantics are those of svreinterpret rather than those of subregs;
3230 see the comment at the head of aarch64-sve.md for details about the
3231 difference. */
3234 aarch64_sve_reinterpret (machine_mode mode, rtx x)
3236 if (GET_MODE (x) == mode)
3237 return x;
3239 /* can_change_mode_class must only return true if subregs and svreinterprets
3240 have the same semantics. */
3241 if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
3242 return lowpart_subreg (mode, x, GET_MODE (x));
3244 rtx res = gen_reg_rtx (mode);
3245 x = force_reg (GET_MODE (x), x);
3246 emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
3247 return res;
3250 bool
3251 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3252 machine_mode ymode, rtx y)
3254 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3255 gcc_assert (r != NULL);
3256 return rtx_equal_p (x, r);
3259 /* Return TARGET if it is nonnull and a register of mode MODE.
3260 Otherwise, return a fresh register of mode MODE if we can,
3261 or TARGET reinterpreted as MODE if we can't. */
3263 static rtx
3264 aarch64_target_reg (rtx target, machine_mode mode)
3266 if (target && REG_P (target) && GET_MODE (target) == mode)
3267 return target;
3268 if (!can_create_pseudo_p ())
3270 gcc_assert (target);
3271 return gen_lowpart (mode, target);
3273 return gen_reg_rtx (mode);
3276 /* Return a register that contains the constant in BUILDER, given that
3277 the constant is a legitimate move operand. Use TARGET as the register
3278 if it is nonnull and convenient. */
3280 static rtx
3281 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3283 rtx src = builder.build ();
3284 target = aarch64_target_reg (target, GET_MODE (src));
3285 emit_insn (gen_rtx_SET (target, src));
3286 return target;
3289 static rtx
3290 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3292 if (can_create_pseudo_p ())
3293 return force_reg (mode, value);
3294 else
3296 gcc_assert (x);
3297 aarch64_emit_move (x, value);
3298 return x;
3302 /* Return true if predicate value X is a constant in which every element
3303 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3304 value, i.e. as a predicate in which all bits are significant. */
3306 static bool
3307 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3309 if (!CONST_VECTOR_P (x))
3310 return false;
3312 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3313 GET_MODE_NUNITS (GET_MODE (x)));
3314 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3315 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3316 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3318 unsigned int nelts = const_vector_encoded_nelts (x);
3319 for (unsigned int i = 0; i < nelts; ++i)
3321 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3322 if (!CONST_INT_P (elt))
3323 return false;
3325 builder.quick_push (elt);
3326 for (unsigned int j = 1; j < factor; ++j)
3327 builder.quick_push (const0_rtx);
3329 builder.finalize ();
3330 return true;
3333 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3334 widest predicate element size it can have (that is, the largest size
3335 for which each element would still be 0 or 1). */
3337 unsigned int
3338 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3340 /* Start with the most optimistic assumption: that we only need
3341 one bit per pattern. This is what we will use if only the first
3342 bit in each pattern is ever set. */
3343 unsigned int mask = GET_MODE_SIZE (DImode);
3344 mask |= builder.npatterns ();
3346 /* Look for set bits. */
3347 unsigned int nelts = builder.encoded_nelts ();
3348 for (unsigned int i = 1; i < nelts; ++i)
3349 if (INTVAL (builder.elt (i)) != 0)
3351 if (i & 1)
3352 return 1;
3353 mask |= i;
3355 return mask & -mask;
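/* To see why the final step works: MASK is the OR of the 8-byte upper
   bound, the number of patterns and every index that carries a set bit,
   so "mask & -mask" is the largest power of two dividing all of them.
   For example, with 8 patterns and encoded set bits only at indices 0
   and 8, MASK is 8 and the widest usable element size is 8 bytes; an
   additional set bit at index 4 would reduce the answer to 4.  */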
3358 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3359 return that predicate mode, otherwise return opt_machine_mode (). */
3361 opt_machine_mode
3362 aarch64_ptrue_all_mode (rtx x)
3364 gcc_assert (GET_MODE (x) == VNx16BImode);
3365 if (!CONST_VECTOR_P (x)
3366 || !CONST_VECTOR_DUPLICATE_P (x)
3367 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3368 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3369 return opt_machine_mode ();
3371 unsigned int nelts = const_vector_encoded_nelts (x);
3372 for (unsigned int i = 1; i < nelts; ++i)
3373 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3374 return opt_machine_mode ();
3376 return aarch64_sve_pred_mode (nelts);
3379 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3380 that the constant would have with predicate element size ELT_SIZE
3381 (ignoring the upper bits in each element) and return:
3383 * -1 if all bits are set
3384 * N if the predicate has N leading set bits followed by all clear bits
3385 * 0 if the predicate does not have any of these forms. */
3388 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3389 unsigned int elt_size)
3391 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3392 followed by set bits. */
3393 if (builder.nelts_per_pattern () == 3)
3394 return 0;
3396 /* Skip over leading set bits. */
3397 unsigned int nelts = builder.encoded_nelts ();
3398 unsigned int i = 0;
3399 for (; i < nelts; i += elt_size)
3400 if (INTVAL (builder.elt (i)) == 0)
3401 break;
3402 unsigned int vl = i / elt_size;
3404 /* Check for the all-true case. */
3405 if (i == nelts)
3406 return -1;
3408 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3409 repeating pattern of set bits followed by clear bits. */
3410 if (builder.nelts_per_pattern () != 2)
3411 return 0;
3413 /* We have a "foreground" value and a duplicated "background" value.
3414 If the background might repeat and the last set bit belongs to it,
3415 we might have set bits followed by clear bits followed by set bits. */
3416 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3417 return 0;
3419 /* Make sure that the rest are all clear. */
3420 for (; i < nelts; i += elt_size)
3421 if (INTVAL (builder.elt (i)) != 0)
3422 return 0;
3424 return vl;
3427 /* See if there is an svpattern that encodes an SVE predicate of mode
3428 PRED_MODE in which the first VL bits are set and the rest are clear.
3429 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3430 A VL of -1 indicates an all-true vector. */
3432 aarch64_svpattern
3433 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3435 if (vl < 0)
3436 return AARCH64_SV_ALL;
3438 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3439 return AARCH64_NUM_SVPATTERNS;
3441 if (vl >= 1 && vl <= 8)
3442 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3444 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3445 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3447 int max_vl;
3448 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3450 if (vl == (max_vl / 3) * 3)
3451 return AARCH64_SV_MUL3;
3452 /* These would only trigger for non-power-of-2 lengths. */
3453 if (vl == (max_vl & -4))
3454 return AARCH64_SV_MUL4;
3455 if (vl == (1 << floor_log2 (max_vl)))
3456 return AARCH64_SV_POW2;
3457 if (vl == max_vl)
3458 return AARCH64_SV_ALL;
3460 return AARCH64_NUM_SVPATTERNS;
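/* Some example mappings implied by the code above: VL == -1 gives
   AARCH64_SV_ALL, VL == 5 gives AARCH64_SV_VL1 + 4 (the VL5 pattern) and
   VL == 32 gives AARCH64_SV_VL16 + 1 (the VL32 pattern).  A length such
   as 12 is only representable when the number of predicate elements is a
   compile-time constant, e.g. as AARCH64_SV_MUL3 when MAX_VL is exactly
   12.  */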
3463 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3464 bits has the lowest bit set and the upper bits clear. This is the
3465 VNx16BImode equivalent of a PTRUE for controlling elements of
3466 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3467 all bits are significant, even the upper zeros. */
3470 aarch64_ptrue_all (unsigned int elt_size)
3472 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3473 builder.quick_push (const1_rtx);
3474 for (unsigned int i = 1; i < elt_size; ++i)
3475 builder.quick_push (const0_rtx);
3476 return builder.build ();
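/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BI
   pattern { 1, 0, 0, 0, 1, 0, 0, 0, ... }: one significant bit per
   4-byte element, i.e. the VNx16BImode view of a PTRUE that controls
   .S-sized elements.  */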
3479 /* Return an all-true predicate register of mode MODE. */
3482 aarch64_ptrue_reg (machine_mode mode)
3484 gcc_assert (aarch64_sve_pred_mode_p (mode));
3485 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3486 return gen_lowpart (mode, reg);
3489 /* Return an all-false predicate register of mode MODE. */
3492 aarch64_pfalse_reg (machine_mode mode)
3494 gcc_assert (aarch64_sve_pred_mode_p (mode));
3495 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3496 return gen_lowpart (mode, reg);
3499 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3500 for it. PRED2[0] is the predicate for the instruction whose result
3501 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3502 for it. Return true if we can prove that the two predicates are
3503 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3504 with PRED1[0] without changing behavior. */
3506 bool
3507 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3509 machine_mode mode = GET_MODE (pred1[0]);
3510 gcc_assert (aarch64_sve_pred_mode_p (mode)
3511 && mode == GET_MODE (pred2[0])
3512 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3513 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3515 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3516 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3517 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3518 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3519 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3522 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3523 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3524 Use TARGET as the target register if nonnull and convenient. */
3526 static rtx
3527 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3528 machine_mode data_mode, rtx op1, rtx op2)
3530 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3531 expand_operand ops[5];
3532 create_output_operand (&ops[0], target, pred_mode);
3533 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3534 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3535 create_input_operand (&ops[3], op1, data_mode);
3536 create_input_operand (&ops[4], op2, data_mode);
3537 expand_insn (icode, 5, ops);
3538 return ops[0].value;
3541 /* Use a comparison to convert integer vector SRC into MODE, which is
3542 the corresponding SVE predicate mode. Use TARGET for the result
3543 if it's nonnull and convenient. */
3546 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3548 machine_mode src_mode = GET_MODE (src);
3549 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3550 src, CONST0_RTX (src_mode));
3553 /* Return the assembly token for svprfop value PRFOP. */
3555 static const char *
3556 svprfop_token (enum aarch64_svprfop prfop)
3558 switch (prfop)
3560 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3561 AARCH64_FOR_SVPRFOP (CASE)
3562 #undef CASE
3563 case AARCH64_NUM_SVPRFOPS:
3564 break;
3566 gcc_unreachable ();
3569 /* Return the assembly string for an SVE prefetch operation with
3570 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3571 and that SUFFIX is the format for the remaining operands. */
3573 char *
3574 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3575 const char *suffix)
3577 static char buffer[128];
3578 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3579 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3580 mnemonic, svprfop_token (prfop), suffix);
3581 gcc_assert (written < sizeof (buffer));
3582 return buffer;
3585 /* Check whether we can calculate the number of elements in PATTERN
3586 at compile time, given that there are NELTS_PER_VQ elements per
3587 128-bit block. Return the value if so, otherwise return -1. */
3589 HOST_WIDE_INT
3590 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3592 unsigned int vl, const_vg;
3593 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3594 vl = 1 + (pattern - AARCH64_SV_VL1);
3595 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3596 vl = 16 << (pattern - AARCH64_SV_VL16);
3597 else if (aarch64_sve_vg.is_constant (&const_vg))
3599 /* There are two vector granules per quadword. */
3600 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3601 switch (pattern)
3603 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3604 case AARCH64_SV_MUL4: return nelts & -4;
3605 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3606 case AARCH64_SV_ALL: return nelts;
3607 default: gcc_unreachable ();
3610 else
3611 return -1;
3613 /* There are two vector granules per quadword. */
3614 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3615 if (known_le (vl, nelts_all))
3616 return vl;
3618 /* Requesting more elements than are available results in a PFALSE. */
3619 if (known_gt (vl, nelts_all))
3620 return 0;
3622 return -1;
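/* For example, with length-agnostic SVE, a pattern of AARCH64_SV_VL3 and
   NELTS_PER_VQ == 4 folds to 3, since even the minimum vector length
   provides 4 elements per vector.  With -msve-vector-bits=256 (so that
   aarch64_sve_vg is the constant 4), AARCH64_SV_ALL with NELTS_PER_VQ == 4
   folds to (4 / 2) * 4 == 8.  */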
3625 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3626 by the number of 128-bit quadwords in an SVE vector. */
3628 static bool
3629 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
3631 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3632 return (IN_RANGE (factor, 2, 16 * 16)
3633 && (factor & 1) == 0
3634 && factor <= 16 * (factor & -factor));
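/* Examples of the test above: FACTOR == 6 is accepted (2 elements per
   quadword scaled by "mul #3"), FACTOR == 256 is accepted (16 elements
   per quadword scaled by "mul #16"), but FACTOR == 34 is rejected
   because its lowest set bit is 2 and 34 > 16 * 2, so no element size
   gives a multiplier in the range [1, 16].  */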
3637 /* Return true if we can move VALUE into a register using a single
3638 CNT[BHWD] instruction. */
3640 static bool
3641 aarch64_sve_cnt_immediate_p (poly_int64 value)
3643 HOST_WIDE_INT factor = value.coeffs[0];
3644 return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
3647 /* Likewise for rtx X. */
3649 bool
3650 aarch64_sve_cnt_immediate_p (rtx x)
3652 poly_int64 value;
3653 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3656 /* Return the asm string for an instruction with a CNT-like vector size
3657 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3658 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3659 first part of the operands template (the part that comes before the
3660 vector size itself). PATTERN is the pattern to use. FACTOR is the
3661 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3662 in each quadword. If it is zero, we can use any element size. */
3664 static char *
3665 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3666 aarch64_svpattern pattern,
3667 unsigned int factor,
3668 unsigned int nelts_per_vq)
3670 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3672 if (nelts_per_vq == 0)
3673 /* There is some overlap in the ranges of the four CNT instructions.
3674 Here we always use the smallest possible element size, so that the
3675 multiplier is 1 wherever possible. */
3676 nelts_per_vq = factor & -factor;
3677 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3678 gcc_assert (IN_RANGE (shift, 1, 4));
3679 char suffix = "dwhb"[shift - 1];
3681 factor >>= shift;
3682 unsigned int written;
3683 if (pattern == AARCH64_SV_ALL && factor == 1)
3684 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3685 prefix, suffix, operands);
3686 else if (factor == 1)
3687 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3688 prefix, suffix, operands, svpattern_token (pattern));
3689 else
3690 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3691 prefix, suffix, operands, svpattern_token (pattern),
3692 factor);
3693 gcc_assert (written < sizeof (buffer));
3694 return buffer;
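/* Illustrative, standalone sketch (not part of the build; the example_*
   name is hypothetical) of the suffix and multiplier choice made above
   when NELTS_PER_VQ is zero.  A factor of 32 per quadword becomes
   "incb ..., all, mul #2", while a factor of 2 becomes a bare "incd".  */

static void
example_cnt_suffix_and_mul (unsigned int factor, char *suffix,
                            unsigned int *mul)
{
  /* Assumes FACTOR is even, as the callers above guarantee.  */
  unsigned int smallest_elt = factor & -factor;
  int shift = smallest_elt >= 16 ? 4 : __builtin_ctz (smallest_elt);
  *suffix = "dwhb"[shift - 1];
  *mul = factor >> shift;
}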
3697 /* Return the asm string for an instruction with a CNT-like vector size
3698 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3699 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3700 first part of the operands template (the part that comes before the
3701 vector size itself). X is the value of the vector size operand,
3702 as a polynomial integer rtx; we need to convert this into an "all"
3703 pattern with a multiplier. */
3705 char *
3706 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3707 rtx x)
3709 poly_int64 value = rtx_to_poly_int64 (x);
3710 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3711 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3712 value.coeffs[1], 0);
3715 /* Return the asm string for an instruction with a CNT-like vector size
3716 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3717 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3718 first part of the operands template (the part that comes before the
3719 vector size itself). CNT_PAT[0..2] are the operands of the
3720 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3722 char *
3723 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3724 const char *operands, rtx *cnt_pat)
3726 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3727 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3728 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3729 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3730 factor, nelts_per_vq);
3733 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3735 bool
3736 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3738 poly_int64 value;
3739 return (poly_int_rtx_p (x, &value)
3740 && (aarch64_sve_cnt_immediate_p (value)
3741 || aarch64_sve_cnt_immediate_p (-value)));
3744 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3745 operand 0. */
3747 char *
3748 aarch64_output_sve_scalar_inc_dec (rtx offset)
3750 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3751 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3752 if (offset_value.coeffs[1] > 0)
3753 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3754 offset_value.coeffs[1], 0);
3755 else
3756 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3757 -offset_value.coeffs[1], 0);
3760 /* Return true if a single RDVL instruction can multiply FACTOR by the
3761 number of 128-bit quadwords in an SVE vector. This is also the
3762 range of ADDVL. */
3764 static bool
3765 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
3767 return (multiple_p (factor, 16)
3768 && IN_RANGE (factor, -32 * 16, 31 * 16));
3771 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
3772 of quadwords in an SVE vector. */
3774 static bool
3775 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
3777 return (multiple_p (factor, 2)
3778 && IN_RANGE (factor, -32 * 2, 31 * 2));
3781 /* Return true if we can move VALUE into a register using a single
3782 RDVL instruction. */
3784 static bool
3785 aarch64_sve_rdvl_immediate_p (poly_int64 value)
3787 HOST_WIDE_INT factor = value.coeffs[0];
3788 return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
3791 /* Likewise for rtx X. */
3793 bool
3794 aarch64_sve_rdvl_immediate_p (rtx x)
3796 poly_int64 value;
3797 return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
3800 /* Return the asm string for moving RDVL immediate OFFSET into register
3801 operand 0. */
3803 char *
3804 aarch64_output_sve_rdvl (rtx offset)
3806 static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
3807 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3808 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
3810 int factor = offset_value.coeffs[1];
3811 snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
3812 return buffer;
3815 /* Return true if we can add VALUE to a register using a single ADDVL
3816 or ADDPL instruction. */
3818 static bool
3819 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3821 HOST_WIDE_INT factor = value.coeffs[0];
3822 if (factor == 0 || value.coeffs[1] != factor)
3823 return false;
3824 return (aarch64_sve_rdvl_addvl_factor_p (factor)
3825 || aarch64_sve_addpl_factor_p (factor));
3828 /* Likewise for rtx X. */
3830 bool
3831 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3833 poly_int64 value;
3834 return (poly_int_rtx_p (x, &value)
3835 && aarch64_sve_addvl_addpl_immediate_p (value));
3838 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3839 to operand 1 and storing the result in operand 0. */
3841 char *
3842 aarch64_output_sve_addvl_addpl (rtx offset)
3844 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3845 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3846 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3848 int factor = offset_value.coeffs[1];
3849 if ((factor & 15) == 0)
3850 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3851 else
3852 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3853 return buffer;
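/* Illustrative, standalone sketch (not part of the build; the example_*
   name is hypothetical) of the choice made above.  ADDVL counts in whole
   vectors (16 bytes per quadword) and ADDPL in predicate-sized units
   (2 bytes per quadword), so a factor of 48 prints "addvl ..., #3" while
   a factor of 6 prints "addpl ..., #3".  */

static int
example_addvl_addpl_split (long long factor, const char **mnemonic)
{
  if ((factor & 15) == 0)
    {
      *mnemonic = "addvl";
      return (int) (factor / 16);
    }
  *mnemonic = "addpl";
  return (int) (factor / 2);
}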
3856 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3857 instruction. If it is, store the number of elements in each vector
3858 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3859 factor in *FACTOR_OUT (if nonnull). */
3861 bool
3862 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3863 unsigned int *nelts_per_vq_out)
3865 rtx elt;
3866 poly_int64 value;
3868 if (!const_vec_duplicate_p (x, &elt)
3869 || !poly_int_rtx_p (elt, &value))
3870 return false;
3872 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3873 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3874 /* There's no vector INCB. */
3875 return false;
3877 HOST_WIDE_INT factor = value.coeffs[0];
3878 if (value.coeffs[1] != factor)
3879 return false;
3881 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3882 if ((factor % nelts_per_vq) != 0
3883 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3884 return false;
3886 if (factor_out)
3887 *factor_out = factor;
3888 if (nelts_per_vq_out)
3889 *nelts_per_vq_out = nelts_per_vq;
3890 return true;
3893 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3894 instruction. */
3896 bool
3897 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3899 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3902 /* Return the asm template for an SVE vector INC or DEC instruction.
3903 OPERANDS gives the operands before the vector count and X is the
3904 value of the vector count operand itself. */
3906 char *
3907 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3909 int factor;
3910 unsigned int nelts_per_vq;
3911 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3912 gcc_unreachable ();
3913 if (factor < 0)
3914 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3915 -factor, nelts_per_vq);
3916 else
3917 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3918 factor, nelts_per_vq);
3921 /* Return a constant that represents FACTOR multiplied by the
3922 number of 128-bit quadwords in an SME vector. ISA_MODE is the
3923 ISA mode in which the calculation is being performed. */
3926 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
3927 aarch64_feature_flags isa_mode)
3929 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
3930 if (isa_mode & AARCH64_FL_SM_ON)
3931 /* We're in streaming mode, so we can use normal poly-int values. */
3932 return gen_int_mode ({ factor, factor }, mode);
3934 rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
3935 rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
3936 return gen_rtx_CONST (mode, unspec);
3939 /* Return true if X is a constant that represents some number Y
3940 multiplied by the number of quadwords in an SME vector. Store this Y
3941 in *FACTOR if so. */
3943 static bool
3944 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
3946 if (!TARGET_SME || GET_CODE (x) != CONST)
3947 return false;
3949 x = XEXP (x, 0);
3950 if (GET_CODE (x) != UNSPEC
3951 || XINT (x, 1) != UNSPEC_SME_VQ
3952 || XVECLEN (x, 0) != 1)
3953 return false;
3955 x = XVECEXP (x, 0, 0);
3956 if (!CONST_INT_P (x))
3957 return false;
3959 *factor = INTVAL (x);
3960 return true;
3963 /* Return true if X is a constant that represents some number Y
3964 multiplied by the number of quadwords in an SME vector, and if
3965 that Y is in the range of RDSVL. */
3967 bool
3968 aarch64_rdsvl_immediate_p (const_rtx x)
3970 HOST_WIDE_INT factor;
3971 return (aarch64_sme_vq_unspec_p (x, &factor)
3972 && aarch64_sve_rdvl_addvl_factor_p (factor));
3975 /* Return the asm string for an RDSVL instruction that calculates X,
3976 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
3978 char *
3979 aarch64_output_rdsvl (const_rtx x)
3981 gcc_assert (aarch64_rdsvl_immediate_p (x));
3982 static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
3983 x = XVECEXP (XEXP (x, 0), 0, 0);
3984 snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
3985 (int) INTVAL (x) / 16);
3986 return buffer;
3989 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
3991 bool
3992 aarch64_addsvl_addspl_immediate_p (const_rtx x)
3994 HOST_WIDE_INT factor;
3995 return (aarch64_sme_vq_unspec_p (x, &factor)
3996 && (aarch64_sve_rdvl_addvl_factor_p (factor)
3997 || aarch64_sve_addpl_factor_p (factor)));
4000 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4001 Return the asm string for the associated instruction. */
4003 char *
4004 aarch64_output_addsvl_addspl (rtx x)
4006 static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4007 HOST_WIDE_INT factor;
4008 if (!aarch64_sme_vq_unspec_p (x, &factor))
4009 gcc_unreachable ();
4010 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4011 snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4012 (int) factor / 16);
4013 else if (aarch64_sve_addpl_factor_p (factor))
4014 snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4015 (int) factor / 2);
4016 else
4017 gcc_unreachable ();
4018 return buffer;
4021 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4023 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4025 0x0000000100000001ull,
4026 0x0001000100010001ull,
4027 0x0101010101010101ull,
4028 0x1111111111111111ull,
4029 0x5555555555555555ull,
4034 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4035 static bool
4036 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4038 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4039 int bits;
4041 /* Check for a single sequence of one bits and return quickly if so.
4042 The special cases of all ones and all zeroes return false. */
4043 tmp = val + (val & -val);
4045 if (tmp == (tmp & -tmp))
4046 return (val + 1) > 1;
4048 /* Invert if the immediate doesn't start with a zero bit - this means we
4049 only need to search for sequences of one bits. */
4050 if (val & 1)
4051 val = ~val;
4053 /* Find the first set bit and set tmp to val with the first sequence of one
4054 bits removed. Return success if there is a single sequence of ones. */
4055 first_one = val & -val;
4056 tmp = val & (val + first_one);
4058 if (tmp == 0)
4059 return true;
4061 /* Find the next set bit and compute the difference in bit position. */
4062 next_one = tmp & -tmp;
4063 bits = clz_hwi (first_one) - clz_hwi (next_one);
4064 mask = val ^ tmp;
4066 /* Check the bit position difference is a power of 2, and that the first
4067 sequence of one bits fits within 'bits' bits. */
4068 if ((mask >> bits) != 0 || bits != (bits & -bits))
4069 return false;
4071 /* Check the sequence of one bits is repeated 64/bits times. */
4072 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
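/* Illustrative, standalone reference check (not part of the build; the
   example_* name is hypothetical).  A bitmask (logical) immediate is a
   run of ones within an element of 2, 4, 8, 16, 32 or 64 bits, rotated
   by any amount and replicated across the register, with all-zeroes and
   all-ones excluded.  The fast test above is believed to accept exactly
   that set; the slow enumeration below spells the definition out.  */

static bool
example_bitmask_imm_brute_force (unsigned long long val)
{
  for (int size = 2; size <= 64; size *= 2)
    for (int ones = 1; ones < size; ++ones)
      {
        /* Element with ONES low bits set, replicated to 64 bits.  */
        unsigned long long elt = (1ull << ones) - 1;
        unsigned long long rep = 0;
        for (int i = 0; i < 64; i += size)
          rep |= elt << i;
        /* Compare against every rotation of the replicated pattern.  */
        for (int rot = 0; rot < size; ++rot)
          {
            unsigned long long r
              = rot ? (rep << rot) | (rep >> (64 - rot)) : rep;
            if (r == val)
              return true;
          }
      }
  return false;
}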
4076 /* Return true if VAL is a valid bitmask immediate for MODE. */
4077 bool
4078 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4080 if (mode == DImode)
4081 return aarch64_bitmask_imm (val);
4083 if (mode == SImode)
4084 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4086 /* Replicate small immediates to fit 64 bits. */
4087 int size = GET_MODE_UNIT_PRECISION (mode);
4088 val &= (HOST_WIDE_INT_1U << size) - 1;
4089 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4091 return aarch64_bitmask_imm (val);
4095 /* Return true if the immediate VAL can be a bitmask immediate
4096 by changing the given MASK bits in VAL to zeroes, ones or bits
4097 from the other half of VAL. Return the new immediate in VAL2. */
4098 static inline bool
4099 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4100 unsigned HOST_WIDE_INT &val2,
4101 unsigned HOST_WIDE_INT mask)
4103 val2 = val & ~mask;
4104 if (val2 != val && aarch64_bitmask_imm (val2))
4105 return true;
4106 val2 = val | mask;
4107 if (val2 != val && aarch64_bitmask_imm (val2))
4108 return true;
4109 val = val & ~mask;
4110 val2 = val | (((val >> 32) | (val << 32)) & mask);
4111 if (val2 != val && aarch64_bitmask_imm (val2))
4112 return true;
4113 val2 = val | (((val >> 16) | (val << 48)) & mask);
4114 if (val2 != val && aarch64_bitmask_imm (val2))
4115 return true;
4116 return false;
4120 /* Return true if VAL is a valid MOVZ immediate. */
4121 static inline bool
4122 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4124 return (val >> (ctz_hwi (val) & 48)) < 65536;
4128 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4129 bool
4130 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4132 return aarch64_is_movz (val) || aarch64_is_movz (~val)
4133 || aarch64_bitmask_imm (val);
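/* Illustrative, standalone sketch (not part of the build; the example_*
   name is hypothetical): a value is a MOVZ immediate when all of its set
   bits fall within a single aligned 16-bit field, which is what the
   shift-by-(ctz & 48) trick above tests.  For instance 0x00ab0000 is a
   MOVZ immediate, and 0xffff0000ffffffff is covered by the MOVN case
   because its complement 0x0000ffff00000000 passes the same check.  */

static bool
example_is_movz_naive (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & ~(0xffffull << shift)) == 0)
      return true;
  return false;
}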
4137 /* Return true if VAL is an immediate that can be created by a single
4138 MOV instruction. */
4139 bool
4140 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4142 gcc_assert (mode == SImode || mode == DImode);
4144 if (val < 65536)
4145 return true;
4147 unsigned HOST_WIDE_INT mask =
4148 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4150 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4151 return true;
4153 val = (val & mask) | ((val << 32) & ~mask);
4154 return aarch64_bitmask_imm (val);
4158 static int
4159 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4160 machine_mode mode)
4162 int i;
4163 unsigned HOST_WIDE_INT val, val2, val3, mask;
4164 int one_match, zero_match;
4165 int num_insns;
4167 gcc_assert (mode == SImode || mode == DImode);
4169 val = INTVAL (imm);
4171 if (aarch64_move_imm (val, mode))
4173 if (generate)
4174 emit_insn (gen_rtx_SET (dest, imm));
4175 return 1;
4178 if ((val >> 32) == 0 || mode == SImode)
4180 if (generate)
4182 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4183 if (mode == SImode)
4184 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4185 GEN_INT ((val >> 16) & 0xffff)));
4186 else
4187 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4188 GEN_INT ((val >> 16) & 0xffff)));
4190 return 2;
4193 /* Remaining cases are all for DImode. */
4195 mask = 0xffff;
4196 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4197 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4198 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4199 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4201 /* Try a bitmask immediate and a movk to generate the immediate
4202 in 2 instructions. */
4204 if (zero_match < 2 && one_match < 2)
4206 for (i = 0; i < 64; i += 16)
4208 if (aarch64_check_bitmask (val, val2, mask << i))
4209 break;
4211 val2 = val & ~(mask << i);
4212 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4213 break;
4216 if (i != 64)
4218 if (generate)
4220 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4221 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4222 GEN_INT ((val >> i) & 0xffff)));
4224 return 2;
4227 /* Try 2 bitmask immediates which are xor'd together. */
4228 for (i = 0; i < 64; i += 16)
4230 val2 = (val >> i) & mask;
4231 val2 |= val2 << 16;
4232 val2 |= val2 << 32;
4233 if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4234 break;
4237 if (i != 64)
4239 if (generate)
4241 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4242 emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4244 return 2;
4248 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4249 if (zero_match + one_match == 0)
4251 for (i = 0; i < 48; i += 16)
4252 for (int j = i + 16; j < 64; j += 16)
4253 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4255 if (generate)
4257 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4258 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4259 GEN_INT ((val >> i) & 0xffff)));
4260 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4261 GEN_INT ((val >> j) & 0xffff)));
4263 return 3;
4266 /* Try shifting and inserting the bottom 32-bits into the top bits. */
4267 val2 = val & 0xffffffff;
4268 val3 = 0xffffffff;
4269 val3 = val2 | (val3 << 32);
4270 for (i = 17; i < 48; i++)
4271 if ((val2 | (val2 << i)) == val)
4273 if (generate)
4275 emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4276 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4277 GEN_INT (val2 >> 16)));
4278 emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4280 return 3;
4282 else if ((val3 & ~(val3 << i)) == val)
4284 if (generate)
4286 emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4287 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4288 GEN_INT (val2 >> 16)));
4289 emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4290 dest));
4292 return 3;
4296 /* Generate 2-4 instructions, skipping 16-bit chunks that are all zeroes or
4297 all ones, since those are handled by the initial MOV. If one_match >
4298 zero_match, skip set bits, otherwise skip zero bits. */
4300 num_insns = 1;
4301 mask = 0xffff;
4302 val2 = one_match > zero_match ? ~val : val;
4303 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4305 if (generate)
4306 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4307 ? (val | ~(mask << i))
4308 : (val & (mask << i)))));
4309 for (i += 16; i < 64; i += 16)
4311 if ((val2 & (mask << i)) == 0)
4312 continue;
4313 if (generate)
4314 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4315 GEN_INT ((val >> i) & 0xffff)));
4316 num_insns ++;
4319 return num_insns;
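/* Illustrative, standalone sketch of one case handled above (not part of
   the build; the example_* name is hypothetical): detecting a constant
   whose low 32 bits repeat under a left shift.  For 0x1234567812345678
   this returns 32, and the function above then emits roughly
   "mov x0, #0x5678; movk x0, #0x1234, lsl #16; orr x0, x0, x0, lsl #32".  */

static int
example_low32_repeat_shift (unsigned long long val)
{
  unsigned long long lo = val & 0xffffffff;
  for (int i = 17; i < 48; i++)
    if ((lo | (lo << i)) == val)
      return i;
  return -1;
}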
4322 /* Return whether imm is a 128-bit immediate which is simple enough to
4323 expand inline. */
4324 bool
4325 aarch64_mov128_immediate (rtx imm)
4327 if (CONST_INT_P (imm))
4328 return true;
4330 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4332 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4333 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4335 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4336 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4340 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4341 a left shift of 0 or 12 bits. */
4342 bool
4343 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4345 return val < 4096 || (val & 0xfff000) == val;
4348 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
4349 that can be created with a left shift of 0 or 12. */
4350 static HOST_WIDE_INT
4351 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4353 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4354 handle correctly. */
4355 gcc_assert (val < 0x1000000);
4357 if (val < 4096)
4358 return val;
4360 return val & 0xfff000;
4364 /* Test whether:
4366 X = (X & AND_VAL) | IOR_VAL;
4368 can be implemented using:
4370 MOVK X, #(IOR_VAL >> shift), LSL #shift
4372 Return the shift if so, otherwise return -1. */
4374 aarch64_movk_shift (const wide_int_ref &and_val,
4375 const wide_int_ref &ior_val)
4377 unsigned int precision = and_val.get_precision ();
4378 unsigned HOST_WIDE_INT mask = 0xffff;
4379 for (unsigned int shift = 0; shift < precision; shift += 16)
4381 if (and_val == ~mask && (ior_val & mask) == ior_val)
4382 return shift;
4383 mask <<= 16;
4385 return -1;
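/* Illustrative, standalone 64-bit version of the test above (not part of
   the build; the example_* name is hypothetical).  For AND_VAL
   0xffffffff0000ffff and IOR_VAL 0x12340000 it returns 16, corresponding
   to "movk x0, #0x1234, lsl #16".  */

static int
example_movk_shift_64 (unsigned long long and_val, unsigned long long ior_val)
{
  unsigned long long mask = 0xffff;
  for (int shift = 0; shift < 64; shift += 16, mask <<= 16)
    if (and_val == ~mask && (ior_val & mask) == ior_val)
      return shift;
  return -1;
}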
4388 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4389 Assumed precondition: VAL_IN is not zero. */
4391 unsigned HOST_WIDE_INT
4392 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4394 int lowest_bit_set = ctz_hwi (val_in);
4395 int highest_bit_set = floor_log2 (val_in);
4396 gcc_assert (val_in != 0);
4398 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4399 (HOST_WIDE_INT_1U << lowest_bit_set));
4402 /* Create constant where bits outside of lowest bit set to highest bit set
4403 are set to 1. */
4405 unsigned HOST_WIDE_INT
4406 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4408 return val_in | ~aarch64_and_split_imm1 (val_in);
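/* Illustrative, standalone sketch of the split performed by the two
   helpers above (not part of the build; the example_* name is
   hypothetical).  For 0x0000ffff00ff0000, which is not itself a valid
   bitmask immediate, IMM1 is 0x0000ffffffff0000 and IMM2 is
   0xffffffff00ffffff; both are valid AND immediates and IMM1 & IMM2
   recreates the original value, so the AND can be done as two
   AND-immediate instructions.  */

static void
example_and_split (unsigned long long val,
                   unsigned long long *imm1, unsigned long long *imm2)
{
  /* Assumes VAL is nonzero, matching the precondition above.  */
  int lowest = __builtin_ctzll (val);
  int highest = 63 - __builtin_clzll (val);
  *imm1 = (2ull << highest) - (1ull << lowest);
  *imm2 = val | ~*imm1;
}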
4411 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4413 bool
4414 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4416 scalar_int_mode int_mode;
4417 if (!is_a <scalar_int_mode> (mode, &int_mode))
4418 return false;
4420 if (aarch64_bitmask_imm (val_in, int_mode))
4421 return false;
4423 if (aarch64_move_imm (val_in, int_mode))
4424 return false;
4426 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4428 return aarch64_bitmask_imm (imm2, int_mode);
4431 /* Return the number of temporary registers that aarch64_add_offset_1
4432 would need to add OFFSET to a register. */
4434 static unsigned int
4435 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4437 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4440 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4441 a non-polynomial OFFSET. MODE is the mode of the addition.
4442 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4443 be set and CFA adjustments added to the generated instructions.
4445 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4446 temporary if register allocation is already complete. This temporary
4447 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4448 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4449 the immediate again.
4451 Since this function may be used to adjust the stack pointer, we must
4452 ensure that it cannot cause transient stack deallocation (for example
4453 by first incrementing SP and then decrementing when adjusting by a
4454 large immediate). */
4456 static void
4457 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4458 rtx src, HOST_WIDE_INT offset, rtx temp1,
4459 bool frame_related_p, bool emit_move_imm)
4461 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4462 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4464 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4465 rtx_insn *insn;
4467 if (!moffset)
4469 if (!rtx_equal_p (dest, src))
4471 insn = emit_insn (gen_rtx_SET (dest, src));
4472 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4474 return;
4477 /* Single instruction adjustment. */
4478 if (aarch64_uimm12_shift (moffset))
4480 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4481 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4482 return;
4485 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4486 and either:
4488 a) the offset cannot be loaded by a 16-bit move or
4489 b) there is no spare register into which we can move it. */
4490 if (moffset < 0x1000000
4491 && ((!temp1 && !can_create_pseudo_p ())
4492 || !aarch64_move_imm (moffset, mode)))
4494 HOST_WIDE_INT low_off = moffset & 0xfff;
4496 low_off = offset < 0 ? -low_off : low_off;
4497 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4498 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4499 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4500 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4501 return;
4504 /* Emit a move immediate if required and an addition/subtraction. */
4505 if (emit_move_imm)
4507 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4508 temp1 = aarch64_force_temporary (mode, temp1,
4509 gen_int_mode (moffset, mode));
4511 insn = emit_insn (offset < 0
4512 ? gen_sub3_insn (dest, src, temp1)
4513 : gen_add3_insn (dest, src, temp1));
4514 if (frame_related_p)
4516 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4517 rtx adj = plus_constant (mode, src, offset);
4518 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4522 /* Return the number of temporary registers that aarch64_add_offset
4523 would need to move OFFSET into a register or add OFFSET to a register;
4524 ADD_P is true if we want the latter rather than the former. */
4526 static unsigned int
4527 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4529 /* This follows the same structure as aarch64_add_offset. */
4530 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4531 return 0;
4533 unsigned int count = 0;
4534 HOST_WIDE_INT factor = offset.coeffs[1];
4535 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4536 poly_int64 poly_offset (factor, factor);
4537 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4538 /* Need one register for the ADDVL/ADDPL result. */
4539 count += 1;
4540 else if (factor != 0)
4542 factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4543 if (!IN_RANGE (factor, -32, 31))
4544 /* Need one register for the CNT or RDVL result and one for the
4545 multiplication factor. If necessary, the second temporary
4546 can be reused for the constant part of the offset. */
4547 return 2;
4548 /* Need one register for the CNT or RDVL result (which might then
4549 be shifted). */
4550 count += 1;
4552 return count + aarch64_add_offset_1_temporaries (constant);
4555 /* If X can be represented as a poly_int64, return the number
4556 of temporaries that are required to add it to a register.
4557 Return -1 otherwise. */
4560 aarch64_add_offset_temporaries (rtx x)
4562 poly_int64 offset;
4563 if (!poly_int_rtx_p (x, &offset))
4564 return -1;
4565 return aarch64_offset_temporaries (true, offset);
4568 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4569 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4570 be set and CFA adjustments added to the generated instructions.
4572 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4573 temporary if register allocation is already complete. This temporary
4574 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4575 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4576 false to avoid emitting the immediate again.
4578 TEMP2, if nonnull, is a second temporary register that doesn't
4579 overlap either DEST or SRC.
4581 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
4582 is measured relative to the SME vector length instead of the current
4583 prevailing vector length. It is 0 otherwise.
4585 Since this function may be used to adjust the stack pointer, we must
4586 ensure that it cannot cause transient stack deallocation (for example
4587 by first incrementing SP and then decrementing when adjusting by a
4588 large immediate). */
4590 static void
4591 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4592 poly_int64 offset, rtx temp1, rtx temp2,
4593 aarch64_feature_flags force_isa_mode,
4594 bool frame_related_p, bool emit_move_imm = true)
4596 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4597 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4598 gcc_assert (temp1 == NULL_RTX
4599 || !frame_related_p
4600 || !reg_overlap_mentioned_p (temp1, dest));
4601 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4603 /* Try using ADDVL or ADDPL to add the whole value. */
4604 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4606 gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
4607 rtx offset_rtx;
4608 if (force_isa_mode == 0)
4609 offset_rtx = gen_int_mode (offset, mode);
4610 else
4611 offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
4612 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4613 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4614 if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
4615 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4616 gen_rtx_SET (dest, plus_constant (Pmode, src,
4617 offset)));
4618 return;
4621 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4622 SVE vector register, over and above the minimum size of 128 bits.
4623 This is equivalent to half the value returned by CNTD with a
4624 vector shape of ALL. */
4625 HOST_WIDE_INT factor = offset.coeffs[1];
4626 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4628 /* Try using ADDVL or ADDPL to add the VG-based part. */
4629 poly_int64 poly_offset (factor, factor);
4630 if (src != const0_rtx
4631 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4633 rtx offset_rtx;
4634 if (force_isa_mode == 0)
4635 offset_rtx = gen_int_mode (poly_offset, mode);
4636 else
4637 offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
4638 if (frame_related_p)
4640 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4641 RTX_FRAME_RELATED_P (insn) = true;
4642 if (force_isa_mode & AARCH64_FL_SM_ON)
4643 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4644 gen_rtx_SET (dest, plus_constant (Pmode, src,
4645 poly_offset)));
4646 src = dest;
4648 else
4650 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4651 src = aarch64_force_temporary (mode, temp1, addr);
4652 temp1 = temp2;
4653 temp2 = NULL_RTX;
4656 /* Otherwise use a CNT-based sequence. */
4657 else if (factor != 0)
4659 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4660 with negative shifts indicating a shift right. */
4661 HOST_WIDE_INT low_bit = least_bit_hwi (factor);
4662 HOST_WIDE_INT rel_factor = factor / low_bit;
4663 int shift = exact_log2 (low_bit) - 4;
4664 gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
4666 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4667 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4669 We can avoid a multiplication if REL_FACTOR is in the range
4670 of RDVL, although there are then various optimizations that
4671 we can try on top. */
4672 rtx_code code = PLUS;
4673 rtx val;
4674 if (IN_RANGE (rel_factor, -32, 31))
4676 if (force_isa_mode & AARCH64_FL_SM_ON)
4678 /* Try to use an unshifted RDSVL, otherwise fall back on
4679 a shifted RDSVL #1. */
4680 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4681 shift = 0;
4682 else
4683 factor = rel_factor * 16;
4684 val = aarch64_sme_vq_immediate (mode, factor, 0);
4686 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4687 else if (aarch64_sve_cnt_factor_p (factor)
4688 || aarch64_sve_rdvl_addvl_factor_p (factor))
4690 val = gen_int_mode (poly_int64 (factor, factor), mode);
4691 shift = 0;
4693 /* Try to subtract an unshifted CNT[BHWD]. */
4694 else if (aarch64_sve_cnt_factor_p (-factor))
4696 code = MINUS;
4697 val = gen_int_mode (poly_int64 (-factor, -factor), mode);
4698 shift = 0;
4700 /* If subtraction is free, prefer to load a positive constant.
4701 In the best case this will fit a shifted CNTB. */
4702 else if (src != const0_rtx && rel_factor < 0)
4704 code = MINUS;
4705 val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
4707 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4708 else
4709 val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
4711 else
4713 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4714 since it should increase the chances of being able to use
4715 a shift and add sequence for the multiplication.
4716 If CNTB << SHIFT is out of range, stick with the current
4717 shift factor. */
4718 if (force_isa_mode == 0
4719 && IN_RANGE (low_bit, 2, 16 * 16))
4721 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
4722 shift = 0;
4724 else if ((force_isa_mode & AARCH64_FL_SM_ON)
4725 && aarch64_sve_rdvl_addvl_factor_p (low_bit))
4727 val = aarch64_sme_vq_immediate (mode, low_bit, 0);
4728 shift = 0;
4730 else
4731 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
4733 val = aarch64_force_temporary (mode, temp1, val);
4735 /* Prefer to multiply by a positive factor and subtract rather
4736 than multiply by a negative factor and add, since positive
4737 values are usually easier to move. */
4738 if (rel_factor < 0 && src != const0_rtx)
4740 rel_factor = -rel_factor;
4741 code = MINUS;
4744 if (can_create_pseudo_p ())
4746 rtx coeff1 = gen_int_mode (rel_factor, mode);
4747 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
4749 else
4751 rtx coeff1 = gen_int_mode (rel_factor, mode);
4752 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4753 val = gen_rtx_MULT (mode, val, coeff1);
4757 /* Multiply by 2 ** SHIFT. */
4758 if (shift > 0)
4760 val = aarch64_force_temporary (mode, temp1, val);
4761 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4763 else if (shift < 0)
4765 val = aarch64_force_temporary (mode, temp1, val);
4766 val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
4769 /* Add the result to SRC or subtract the result from SRC. */
4770 if (src != const0_rtx)
4772 val = aarch64_force_temporary (mode, temp1, val);
4773 val = gen_rtx_fmt_ee (code, mode, src, val);
4775 else if (code == MINUS)
4777 val = aarch64_force_temporary (mode, temp1, val);
4778 val = gen_rtx_NEG (mode, val);
4781 if (constant == 0 || frame_related_p)
4783 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4784 if (frame_related_p)
4786 RTX_FRAME_RELATED_P (insn) = true;
4787 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4788 gen_rtx_SET (dest, plus_constant (Pmode, src,
4789 poly_offset)));
4791 src = dest;
4792 if (constant == 0)
4793 return;
4795 else
4797 src = aarch64_force_temporary (mode, temp1, val);
4798 temp1 = temp2;
4799 temp2 = NULL_RTX;
4802 emit_move_imm = true;
4805 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4806 frame_related_p, emit_move_imm);
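/* Illustrative, standalone sketch of the decomposition used above (not
   part of the build; the example_* name is hypothetical).  A runtime
   offset is carried as (coeffs[0], coeffs[1]); the part that scales with
   the vector length is coeffs[1] and the fixed byte part is
   coeffs[0] - coeffs[1].  For example, two SVE vectors plus 16 bytes is
   (48, 32), which splits into a VL-scaled part of 32 (handled by
   ADDVL #2) and a constant of 16 (handled by a plain ADD #16).  */

static void
example_split_sve_offset (long long coeff0, long long coeff1,
                          long long *vl_factor, long long *byte_constant)
{
  *vl_factor = coeff1;
  *byte_constant = coeff0 - coeff1;
}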
4809 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4810 than a poly_int64. */
4812 void
4813 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4814 rtx offset_rtx, rtx temp1, rtx temp2)
4816 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4817 temp1, temp2, 0, false);
4820 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4821 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4822 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4823 contains abs (DELTA). */
4825 static inline void
4826 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
4827 aarch64_feature_flags force_isa_mode, bool emit_move_imm)
4829 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4830 temp1, temp2, force_isa_mode, true, emit_move_imm);
4833 /* Subtract DELTA from the stack pointer, marking the instructions
4834 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4835 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4837 static inline void
4838 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
4839 aarch64_feature_flags force_isa_mode,
4840 bool frame_related_p, bool emit_move_imm = true)
4842 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4843 temp1, temp2, force_isa_mode, frame_related_p,
4844 emit_move_imm);
4847 /* A streaming-compatible function needs to switch temporarily to the known
4848 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4849 the runtime state of PSTATE.SM in the streaming-compatible code, before
4850 the start of the switch to LOCAL_MODE.
4852 Emit instructions to branch around the mode switch if PSTATE.SM already
4853 matches LOCAL_MODE. Return the label that the branch jumps to. */
4855 static rtx_insn *
4856 aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
4858 local_mode &= AARCH64_FL_SM_STATE;
4859 gcc_assert (local_mode != 0);
4860 auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
4861 auto *label = gen_label_rtx ();
4862 auto *jump = emit_jump_insn (gen_aarch64_tb (already_ok_cond, DImode, DImode,
4863 old_svcr, const0_rtx, label));
4864 JUMP_LABEL (jump) = label;
4865 return label;
4868 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
4869 state in NEW_MODE. This is known to involve either an SMSTART SM or
4870 an SMSTOP SM. */
4872 static void
4873 aarch64_switch_pstate_sm (aarch64_feature_flags old_mode,
4874 aarch64_feature_flags new_mode)
4876 old_mode &= AARCH64_FL_SM_STATE;
4877 new_mode &= AARCH64_FL_SM_STATE;
4878 gcc_assert (old_mode != new_mode);
4880 if ((new_mode & AARCH64_FL_SM_ON)
4881 || (new_mode == 0 && (old_mode & AARCH64_FL_SM_OFF)))
4882 emit_insn (gen_aarch64_smstart_sm ());
4883 else
4884 emit_insn (gen_aarch64_smstop_sm ());
4887 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
4888 FP and predicate registers. This class emits code to preserve any
4889 necessary registers around the mode switch.
4891 The class uses four approaches to saving and restoring contents, enumerated
4892 by group_type:
4894 - GPR: save and restore the contents of FP registers using GPRs.
4895 This is used if the FP register contains no more than 64 significant
4896 bits. The registers used are FIRST_GPR onwards.
4898 - MEM_128: save and restore 128-bit SIMD registers using memory.
4900 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
4902 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
4904 The save slots within each memory group are consecutive, with the
4905 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
4907 There will only be two mode switches for each use of SME, so they should
4908 not be particularly performance-sensitive. It's also rare for SIMD, SVE
4909 or predicate registers to be live across mode switches. We therefore
4910 don't preallocate the save slots but instead allocate them locally on
4911 demand. This makes the code emitted by the class self-contained. */
4913 class aarch64_sme_mode_switch_regs
4915 public:
4916 static const unsigned int FIRST_GPR = R10_REGNUM;
4918 void add_reg (machine_mode, unsigned int);
4919 void add_call_args (rtx_call_insn *);
4920 void add_call_result (rtx_call_insn *);
4921 void add_call_preserved_reg (unsigned int);
4922 void add_call_preserved_regs (bitmap);
4924 void emit_prologue ();
4925 void emit_epilogue ();
4927 /* The number of GPRs needed to save FP registers, starting from
4928 FIRST_GPR. */
4929 unsigned int num_gprs () { return m_group_count[GPR]; }
4931 private:
4932 enum sequence { PROLOGUE, EPILOGUE };
4933 enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
4935 /* Information about the save location for one FP, SIMD, SVE data, or
4936 SVE predicate register. */
4937 struct save_location {
4938 /* The register to be saved. */
4939 rtx reg;
4941 /* Which group the save location belongs to. */
4942 group_type group;
4944 /* A zero-based index of the register within the group. */
4945 unsigned int index;
4948 unsigned int sve_data_headroom ();
4949 rtx get_slot_mem (machine_mode, poly_int64);
4950 void emit_stack_adjust (sequence, poly_int64);
4951 void emit_mem_move (sequence, const save_location &, poly_int64);
4953 void emit_gpr_moves (sequence);
4954 void emit_mem_128_moves (sequence);
4955 void emit_sve_sp_adjust (sequence);
4956 void emit_sve_pred_moves (sequence);
4957 void emit_sve_data_moves (sequence);
4959 /* All save locations, in no particular order. */
4960 auto_vec<save_location, 12> m_save_locations;
4962 /* The number of registers in each group. */
4963 unsigned int m_group_count[NUM_GROUPS] = {};
4966 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
4967 switch. */
4969 void
4970 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
4972 if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
4973 return;
4975 unsigned int end_regno = end_hard_regno (mode, regno);
4976 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4977 gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
4978 for (; regno < end_regno; regno++)
4980 /* Force the mode of SVE saves and restores even for single registers.
4981 This is necessary because big-endian targets only allow LDR Z and
4982 STR Z to be used with byte modes. */
4983 machine_mode submode = mode;
4984 if (vec_flags & VEC_SVE_PRED)
4985 submode = VNx16BImode;
4986 else if (vec_flags & VEC_SVE_DATA)
4987 submode = SVE_BYTE_MODE;
4988 else if (vec_flags & VEC_STRUCT)
4990 if (vec_flags & VEC_PARTIAL)
4991 submode = V8QImode;
4992 else
4993 submode = V16QImode;
4995 save_location loc;
4996 loc.reg = gen_rtx_REG (submode, regno);
4997 if (vec_flags & VEC_SVE_PRED)
4999 gcc_assert (PR_REGNUM_P (regno));
5000 loc.group = MEM_SVE_PRED;
5002 else
5004 gcc_assert (FP_REGNUM_P (regno));
5005 if (known_le (GET_MODE_SIZE (submode), 8))
5006 loc.group = GPR;
5007 else if (known_eq (GET_MODE_SIZE (submode), 16))
5008 loc.group = MEM_128;
5009 else
5010 loc.group = MEM_SVE_DATA;
5012 loc.index = m_group_count[loc.group]++;
5013 m_save_locations.quick_push (loc);
5017 /* Record that the arguments to CALL_INSN need to be preserved around
5018 the mode switch. */
5020 void
5021 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
5023 for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
5024 node; node = XEXP (node, 1))
5026 rtx item = XEXP (node, 0);
5027 if (GET_CODE (item) != USE)
5028 continue;
5029 item = XEXP (item, 0);
5030 if (!REG_P (item))
5031 continue;
5032 add_reg (GET_MODE (item), REGNO (item));
5036 /* Record that the return value from CALL_INSN (if any) needs to be
5037 preserved around the mode switch. */
5039 void
5040 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
5042 rtx pat = PATTERN (call_insn);
5043 gcc_assert (GET_CODE (pat) == PARALLEL);
5044 pat = XVECEXP (pat, 0, 0);
5045 if (GET_CODE (pat) == CALL)
5046 return;
5047 rtx dest = SET_DEST (pat);
5048 if (GET_CODE (dest) == PARALLEL)
5049 for (int i = 0; i < XVECLEN (dest, 0); ++i)
5051 rtx x = XVECEXP (dest, 0, i);
5052 gcc_assert (GET_CODE (x) == EXPR_LIST);
5053 rtx reg = XEXP (x, 0);
5054 add_reg (GET_MODE (reg), REGNO (reg));
5056 else
5057 add_reg (GET_MODE (dest), REGNO (dest));
5060 /* REGNO is a register that is call-preserved under the current function's ABI.
5061 Record that it must be preserved around the mode switch. */
5063 void
5064 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno)
5066 if (FP_REGNUM_P (regno))
5067 switch (crtl->abi->id ())
5069 case ARM_PCS_SVE:
5070 add_reg (VNx16QImode, regno);
5071 break;
5072 case ARM_PCS_SIMD:
5073 add_reg (V16QImode, regno);
5074 break;
5075 case ARM_PCS_AAPCS64:
5076 add_reg (DImode, regno);
5077 break;
5078 default:
5079 gcc_unreachable ();
5081 else if (PR_REGNUM_P (regno))
5082 add_reg (VNx16BImode, regno);
5085 /* The hard registers in REGS are call-preserved under the current function's
5086 ABI. Record that they must be preserved around the mode switch. */
5088 void
5089 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs)
5091 bitmap_iterator bi;
5092 unsigned int regno;
5093 EXECUTE_IF_SET_IN_BITMAP (regs, 0, regno, bi)
5094 if (HARD_REGISTER_NUM_P (regno))
5095 add_call_preserved_reg (regno);
5096 else
5097 break;
5100 /* Emit code to save registers before the mode switch. */
5102 void
5103 aarch64_sme_mode_switch_regs::emit_prologue ()
5105 emit_sve_sp_adjust (PROLOGUE);
5106 emit_sve_pred_moves (PROLOGUE);
5107 emit_sve_data_moves (PROLOGUE);
5108 emit_mem_128_moves (PROLOGUE);
5109 emit_gpr_moves (PROLOGUE);
5112 /* Emit code to restore registers after the mode switch. */
5114 void
5115 aarch64_sme_mode_switch_regs::emit_epilogue ()
5117 emit_gpr_moves (EPILOGUE);
5118 emit_mem_128_moves (EPILOGUE);
5119 emit_sve_pred_moves (EPILOGUE);
5120 emit_sve_data_moves (EPILOGUE);
5121 emit_sve_sp_adjust (EPILOGUE);
5124 /* The SVE predicate registers are stored below the SVE data registers,
5125 with the predicate save area being padded to a data-register-sized
5126 boundary. Return the size of this padded area as a whole number
5127 of data register slots. */
5129 unsigned int
5130 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5132 return CEIL (m_group_count[MEM_SVE_PRED], 8);
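/* Illustrative, standalone sketch (not part of the build; the example_*
   name is hypothetical).  An SVE predicate register holds one bit per
   vector byte, so it is one eighth the size of a data register and eight
   predicate slots round up to one data-register-sized slot: 3 predicate
   saves need 1 slot of headroom, 9 need 2.  */

static unsigned int
example_pred_headroom (unsigned int num_pred_saves)
{
  return (num_pred_saves + 7) / 8;
}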
5135 /* Return a memory reference of mode MODE to OFFSET bytes from the
5136 stack pointer. */
5139 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
5140 poly_int64 offset)
5142 rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
5143 return gen_rtx_MEM (mode, addr);
5146 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5148 void
5149 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
5150 poly_int64 size)
5152 if (seq == PROLOGUE)
5153 size = -size;
5154 emit_insn (gen_rtx_SET (stack_pointer_rtx,
5155 plus_constant (Pmode, stack_pointer_rtx, size)));
5158 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5159 the stack pointer. SEQ chooses between saving and restoring. */
5161 void
5162 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
5163 const save_location &loc,
5164 poly_int64 offset)
5166 rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
5167 if (seq == PROLOGUE)
5168 emit_move_insn (mem, loc.reg);
5169 else
5170 emit_move_insn (loc.reg, mem);
5173 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5174 saving and restoring. */
5176 void
5177 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
5179 for (auto &loc : m_save_locations)
5180 if (loc.group == GPR)
5182 gcc_assert (loc.index < 8);
5183 rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
5184 if (seq == PROLOGUE)
5185 emit_move_insn (gpr, loc.reg);
5186 else
5187 emit_move_insn (loc.reg, gpr);
5191 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5192 between saving and restoring. */
5194 void
5195 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
5197 HOST_WIDE_INT count = m_group_count[MEM_128];
5198 if (count == 0)
5199 return;
5201 auto sp = stack_pointer_rtx;
5202 auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
5204 /* Pick a common mode that supports LDR & STR with pre/post-modification
5205 and LDP & STP with pre/post-modification. */
5206 auto mode = TFmode;
5208 /* An instruction pattern that should be emitted at the end. */
5209 rtx last_pat = NULL_RTX;
5211 /* A previous MEM_128 location that hasn't been handled yet. */
5212 save_location *prev_loc = nullptr;
5214 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5215 for (auto &loc : m_save_locations)
5216 if (loc.group == MEM_128)
5218 if (!prev_loc)
5220 prev_loc = &loc;
5221 continue;
5223 gcc_assert (loc.index == prev_loc->index + 1);
5225 /* The offset of the base of the save area from the current
5226 stack pointer. */
5227 HOST_WIDE_INT bias = 0;
5228 if (prev_loc->index == 0 && seq == PROLOGUE)
5229 bias = sp_adjust;
5231 /* Get the two sets in the LDP/STP. */
5232 rtx ops[] = {
5233 gen_rtx_REG (mode, REGNO (prev_loc->reg)),
5234 get_slot_mem (mode, prev_loc->index * 16 + bias),
5235 gen_rtx_REG (mode, REGNO (loc.reg)),
5236 get_slot_mem (mode, loc.index * 16 + bias)
5238 unsigned int lhs = (seq == PROLOGUE);
5239 rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
5240 rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
5242 /* Combine the sets with any stack allocation/deallocation. */
5243 rtx pat;
5244 if (prev_loc->index == 0)
5246 rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
5247 rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
5248 pat = gen_rtx_PARALLEL (VOIDmode, vec);
5250 else if (seq == PROLOGUE)
5251 pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
5252 else
5253 pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
5255 /* Queue a deallocation to the end, otherwise emit the
5256 instruction now. */
5257 if (seq == EPILOGUE && prev_loc->index == 0)
5258 last_pat = pat;
5259 else
5260 emit_insn (pat);
5261 prev_loc = nullptr;
5264 /* Handle any leftover LDR/STR. */
5265 if (prev_loc)
5267 rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
5268 rtx addr;
5269 if (prev_loc->index != 0)
5270 addr = plus_constant (Pmode, sp, prev_loc->index * 16);
5271 else if (seq == PROLOGUE)
5273 rtx allocate = plus_constant (Pmode, sp, -count * 16);
5274 addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
5276 else
5278 rtx deallocate = plus_constant (Pmode, sp, count * 16);
5279 addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
5281 rtx mem = gen_rtx_MEM (mode, addr);
5282 if (seq == PROLOGUE)
5283 emit_move_insn (mem, reg);
5284 else
5285 emit_move_insn (reg, mem);
5288 if (last_pat)
5289 emit_insn (last_pat);
5292 /* Allocate or deallocate the stack space needed by the SVE groups.
5293 SEQ chooses between allocating and deallocating. */
5295 void
5296 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
5298 if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
5299 emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
5302 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5303 and restoring. */
5305 void
5306 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
5308 for (auto &loc : m_save_locations)
5309 if (loc.group == MEM_SVE_DATA)
5311 auto index = loc.index + sve_data_headroom ();
5312 emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
5316 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5317 and restoring. */
5319 void
5320 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
5322 for (auto &loc : m_save_locations)
5323 if (loc.group == MEM_SVE_PRED)
5324 emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
5327 /* Set DEST to (vec_series BASE STEP). */
5329 static void
5330 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5332 machine_mode mode = GET_MODE (dest);
5333 scalar_mode inner = GET_MODE_INNER (mode);
5335 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5336 if (!aarch64_sve_index_immediate_p (base))
5337 base = force_reg (inner, base);
5338 if (!aarch64_sve_index_immediate_p (step))
5339 step = force_reg (inner, step);
5341 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
5344 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5345 register of mode MODE. Use TARGET for the result if it's nonnull
5346 and convenient.
5348 The two vector modes must have the same element mode. The behavior
5349 is to duplicate architectural lane N of SRC into architectural lanes
5350 N + I * STEP of the result. On big-endian targets, architectural
5351 lane 0 of an Advanced SIMD vector is the last element of the vector
5352 in memory layout, so for big-endian targets this operation has the
5353 effect of reversing SRC before duplicating it. Callers need to
5354 account for this. */
5357 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5359 machine_mode src_mode = GET_MODE (src);
5360 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5361 insn_code icode = (BYTES_BIG_ENDIAN
5362 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5363 : code_for_aarch64_vec_duplicate_vq_le (mode));
5365 unsigned int i = 0;
5366 expand_operand ops[3];
5367 create_output_operand (&ops[i++], target, mode);
5368 create_output_operand (&ops[i++], src, src_mode);
5369 if (BYTES_BIG_ENDIAN)
5371 /* Create a PARALLEL describing the reversal of SRC. */
5372 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5373 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5374 nelts_per_vq - 1, -1);
5375 create_fixed_operand (&ops[i++], sel);
5377 expand_insn (icode, i, ops);
5378 return ops[0].value;
5381 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5382 the memory image into DEST. Return true on success. */
5384 static bool
5385 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5387 src = force_const_mem (GET_MODE (src), src);
5388 if (!src)
5389 return false;
5391 /* Make sure that the address is legitimate. */
5392 if (!aarch64_sve_ld1rq_operand_p (src))
5394 rtx addr = force_reg (Pmode, XEXP (src, 0));
5395 src = replace_equiv_address (src, addr);
5398 machine_mode mode = GET_MODE (dest);
5399 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5400 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5401 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5402 return true;
5405 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5406 by N "background" values. Try to move it into TARGET using:
5408 PTRUE PRED.<T>, VL<N>
5409 MOV TRUE.<T>, #<foreground>
5410 MOV FALSE.<T>, #<background>
5411 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5413 The PTRUE is always a single instruction but the MOVs might need a
5414 longer sequence. If the background value is zero (as it often is),
5415 the sequence can sometimes collapse to a PTRUE followed by a
5416 zero-predicated move.
5418 Return the target on success, otherwise return null. */
5420 static rtx
5421 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5423 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5425 /* Make sure that the PTRUE is valid. */
5426 machine_mode mode = GET_MODE (src);
5427 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5428 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5429 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5430 == AARCH64_NUM_SVPATTERNS)
5431 return NULL_RTX;
5433 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5434 rtx_vector_builder true_builder (mode, npatterns, 1);
5435 rtx_vector_builder false_builder (mode, npatterns, 1);
5436 for (unsigned int i = 0; i < npatterns; ++i)
5438 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5439 pred_builder.quick_push (CONST1_RTX (BImode));
5441 for (unsigned int i = 0; i < npatterns; ++i)
5443 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5444 pred_builder.quick_push (CONST0_RTX (BImode));
5446 expand_operand ops[4];
5447 create_output_operand (&ops[0], target, mode);
5448 create_input_operand (&ops[1], true_builder.build (), mode);
5449 create_input_operand (&ops[2], false_builder.build (), mode);
5450 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5451 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5452 return target;
5455 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5456 SVE data mode and isn't a legitimate constant. Use TARGET for the
5457 result if convenient.
5459 The returned register can have whatever mode seems most natural
5460 given the contents of SRC. */
5462 static rtx
5463 aarch64_expand_sve_const_vector (rtx target, rtx src)
5465 machine_mode mode = GET_MODE (src);
5466 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5467 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5468 scalar_mode elt_mode = GET_MODE_INNER (mode);
5469 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5470 unsigned int container_bits = aarch64_sve_container_bits (mode);
5471 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5473 if (nelts_per_pattern == 1
5474 && encoded_bits <= 128
5475 && container_bits != elt_bits)
5477 /* We have a partial vector mode and a constant whose full-vector
5478 equivalent would occupy a repeating 128-bit sequence. Build that
5479 full-vector equivalent instead, so that we have the option of
5480 using LD1RQ and Advanced SIMD operations. */
5481 unsigned int repeat = container_bits / elt_bits;
5482 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5483 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5484 for (unsigned int i = 0; i < npatterns; ++i)
5485 for (unsigned int j = 0; j < repeat; ++j)
5486 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5487 target = aarch64_target_reg (target, full_mode);
5488 return aarch64_expand_sve_const_vector (target, builder.build ());
5491 if (nelts_per_pattern == 1 && encoded_bits == 128)
5493 /* The constant is a duplicated quadword but can't be narrowed
5494 beyond a quadword. Get the memory image of the first quadword
5495 as a 128-bit vector and try using LD1RQ to load it from memory.
5497 The effect for both endiannesses is to load memory lane N into
5498 architectural lanes N + I * STEP of the result. On big-endian
5499 targets, the layout of the 128-bit vector in an Advanced SIMD
5500 register would be different from its layout in an SVE register,
5501 but this 128-bit vector is a memory value only. */
5502 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5503 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5504 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5505 return target;
5508 if (nelts_per_pattern == 1 && encoded_bits < 128)
5510 /* The vector is a repeating sequence of 64 bits or fewer.
5511 See if we can load them using an Advanced SIMD move and then
5512 duplicate it to fill a vector. This is better than using a GPR
5513 move because it keeps everything in the same register file. */
5514 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5515 rtx_vector_builder builder (vq_mode, npatterns, 1);
5516 for (unsigned int i = 0; i < npatterns; ++i)
5518 /* We want memory lane N to go into architectural lane N,
5519 so reverse for big-endian targets. The DUP .Q pattern
5520 has a compensating reverse built-in. */
5521 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5522 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5524 rtx vq_src = builder.build ();
5525 if (aarch64_simd_valid_immediate (vq_src, NULL))
5527 vq_src = force_reg (vq_mode, vq_src);
5528 return aarch64_expand_sve_dupq (target, mode, vq_src);
5531 /* Get an integer representation of the repeating part of Advanced
5532 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5533 which for big-endian targets is lane-swapped wrt a normal
5534 Advanced SIMD vector. This means that for both endiannesses,
5535 memory lane N of SVE vector SRC corresponds to architectural
5536 lane N of a register holding VQ_SRC. This in turn means that
5537 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5538 as a single 128-bit value) and thus that memory lane 0 of SRC is
5539 in the lsb of the integer. Duplicating the integer therefore
5540 ensures that memory lane N of SRC goes into architectural lane
5541 N + I * INDEX of the SVE register. */
5542 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5543 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5544 if (elt_value)
5546 /* Pretend that we had a vector of INT_MODE to start with. */
5547 elt_mode = int_mode;
5548 mode = aarch64_full_sve_mode (int_mode).require ();
5550 /* If the integer can be moved into a general register by a
5551 single instruction, do that and duplicate the result. */
5552 if (CONST_INT_P (elt_value)
5553 && aarch64_move_imm (INTVAL (elt_value),
5554 encoded_bits <= 32 ? SImode : DImode))
5556 elt_value = force_reg (elt_mode, elt_value);
5557 return expand_vector_broadcast (mode, elt_value);
5560 else if (npatterns == 1)
5561 /* We're duplicating a single value, but can't do better than
5562 force it to memory and load from there. This handles things
5563 like symbolic constants. */
5564 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5566 if (elt_value)
5568 /* Load the element from memory if we can, otherwise move it into
5569 a register and use a DUP. */
5570 rtx op = force_const_mem (elt_mode, elt_value);
5571 if (!op)
5572 op = force_reg (elt_mode, elt_value);
5573 return expand_vector_broadcast (mode, op);
5577 /* Try using INDEX. */
5578 rtx base, step;
5579 if (const_vec_series_p (src, &base, &step))
5581 aarch64_expand_vec_series (target, base, step);
5582 return target;
5585 /* From here on, it's better to force the whole constant to memory
5586 if we can. */
5587 if (GET_MODE_NUNITS (mode).is_constant ())
5588 return NULL_RTX;
5590 if (nelts_per_pattern == 2)
5591 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5592 return res;
5594 /* Expand each pattern individually. */
5595 gcc_assert (npatterns > 1);
5596 rtx_vector_builder builder;
5597 auto_vec<rtx, 16> vectors (npatterns);
5598 for (unsigned int i = 0; i < npatterns; ++i)
5600 builder.new_vector (mode, 1, nelts_per_pattern);
5601 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5602 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5603 vectors.quick_push (force_reg (mode, builder.build ()));
5606 /* Use permutes to interleave the separate vectors. */
5607 while (npatterns > 1)
5609 npatterns /= 2;
5610 for (unsigned int i = 0; i < npatterns; ++i)
5612 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5613 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5614 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5615 vectors[i] = tmp;
5618 gcc_assert (vectors[0] == target);
5619 return target;
5622 /* Use WHILE to set a predicate register of mode MODE in which the first
5623 VL bits are set and the rest are clear. Use TARGET for the register
5624 if it's nonnull and convenient. */
5626 static rtx
5627 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5628 unsigned int vl)
5630 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5631 target = aarch64_target_reg (target, mode);
5632 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5633 target, const0_rtx, limit));
5634 return target;
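/* Hedged example: for MODE == VNx4BI and VL == 3, the function above
   would typically emit

	mov	x0, 3
	whilelo	p0.s, xzr, x0

   leaving the first three .S elements of the predicate set and the rest
   clear.  */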
5637 static rtx
5638 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5640 /* BUILDER is a constant predicate in which the index of every set bit
5641 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5642 by inverting every element at a multiple of ELT_SIZE and EORing the
5643 result with an ELT_SIZE PTRUE.
5645 Return a register that contains the constant on success, otherwise
5646 return null. Use TARGET as the register if it is nonnull and
5647 convenient. */
5649 static rtx
5650 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5651 unsigned int elt_size)
5653 /* Invert every element at a multiple of ELT_SIZE, keeping the
5654 other bits zero. */
5655 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5656 builder.nelts_per_pattern ());
5657 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5658 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5659 inv_builder.quick_push (const1_rtx);
5660 else
5661 inv_builder.quick_push (const0_rtx);
5662 inv_builder.finalize ();
5664 /* See if we can load the constant cheaply. */
5665 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5666 if (!inv)
5667 return NULL_RTX;
5669 /* EOR the result with an ELT_SIZE PTRUE. */
5670 rtx mask = aarch64_ptrue_all (elt_size);
5671 mask = force_reg (VNx16BImode, mask);
5672 inv = gen_lowpart (VNx16BImode, inv);
5673 target = aarch64_target_reg (target, VNx16BImode);
5674 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5675 return target;
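/* Illustrative example (assumed values): to build the .S predicate
   { 0, 1, 1, 1, ... } -- everything active except element 0 -- the
   inverted constant is { 1, 0, 0, 0, ... }, which is simply PTRUE VL1,
   so the function above amounts to roughly

	ptrue	p1.s, vl1		// inverted constant
	ptrue	p2.s			// ELT_SIZE ptrue
	eor	p0.b, p2/z, p1.b, p2.b	// recover the original constant

   with the zeroing governing predicate clearing the bits that do not
   correspond to .S element positions.  */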
5678 /* BUILDER is a constant predicate in which the index of every set bit
5679 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5680 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5681 register on success, otherwise return null. Use TARGET as the register
5682 if nonnull and convenient. */
5684 static rtx
5685 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5686 unsigned int elt_size,
5687 unsigned int permute_size)
5689 /* We're going to split the constant into two new constants A and B,
5690 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5691 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5693 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5694 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5696 where _ indicates elements that will be discarded by the permute.
5698 First calculate the ELT_SIZEs for A and B. */
5699 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5700 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5701 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5702 if (INTVAL (builder.elt (i)) != 0)
5704 if (i & permute_size)
5705 b_elt_size |= i - permute_size;
5706 else
5707 a_elt_size |= i;
5709 a_elt_size &= -a_elt_size;
5710 b_elt_size &= -b_elt_size;
5712 /* Now construct the vectors themselves. */
5713 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5714 builder.nelts_per_pattern ());
5715 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5716 builder.nelts_per_pattern ());
5717 unsigned int nelts = builder.encoded_nelts ();
5718 for (unsigned int i = 0; i < nelts; ++i)
5719 if (i & (elt_size - 1))
5721 a_builder.quick_push (const0_rtx);
5722 b_builder.quick_push (const0_rtx);
5724 else if ((i & permute_size) == 0)
5726 /* The A and B elements are significant. */
5727 a_builder.quick_push (builder.elt (i));
5728 b_builder.quick_push (builder.elt (i + permute_size));
5730 else
5732 /* The A and B elements are going to be discarded, so pick whatever
5733 is likely to give a nice constant. We are targeting element
5734 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5735 with the aim of each being a sequence of ones followed by
5736 a sequence of zeros. So:
5738 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5739 duplicate the last X_ELT_SIZE element, to extend the
5740 current sequence of ones or zeros.
5742 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5743 zero, so that the constant really does have X_ELT_SIZE and
5744 not a smaller size. */
5745 if (a_elt_size > permute_size)
5746 a_builder.quick_push (const0_rtx);
5747 else
5748 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5749 if (b_elt_size > permute_size)
5750 b_builder.quick_push (const0_rtx);
5751 else
5752 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5754 a_builder.finalize ();
5755 b_builder.finalize ();
5757 /* Try loading A into a register. */
5758 rtx_insn *last = get_last_insn ();
5759 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5760 if (!a)
5761 return NULL_RTX;
5763 /* Try loading B into a register. */
5764 rtx b = a;
5765 if (a_builder != b_builder)
5767 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5768 if (!b)
5770 delete_insns_since (last);
5771 return NULL_RTX;
5775 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5776 operands but permutes them as though they had mode MODE. */
5777 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5778 target = aarch64_target_reg (target, GET_MODE (a));
5779 rtx type_reg = CONST0_RTX (mode);
5780 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5781 return target;
5784 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5785 constant in BUILDER into an SVE predicate register. Return the register
5786 on success, otherwise return null. Use TARGET for the register if
5787 nonnull and convenient.
5789 ALLOW_RECURSE_P is true if we can use methods that would call this
5790 function recursively. */
5792 static rtx
5793 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5794 bool allow_recurse_p)
5796 if (builder.encoded_nelts () == 1)
5797 /* A PFALSE or a PTRUE .B ALL. */
5798 return aarch64_emit_set_immediate (target, builder);
5800 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5801 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5803 /* If we can load the constant using PTRUE, use it as-is. */
5804 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5805 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5806 return aarch64_emit_set_immediate (target, builder);
5808 /* Otherwise use WHILE to set the first VL bits. */
5809 return aarch64_sve_move_pred_via_while (target, mode, vl);
5812 if (!allow_recurse_p)
5813 return NULL_RTX;
5815 /* Try inverting the vector in element size ELT_SIZE and then EORing
5816 the result with an ELT_SIZE PTRUE. */
5817 if (INTVAL (builder.elt (0)) == 0)
5818 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5819 elt_size))
5820 return res;
5822 /* Try using TRN1 to permute two simpler constants. */
5823 for (unsigned int i = elt_size; i <= 8; i *= 2)
5824 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5825 elt_size, i))
5826 return res;
5828 return NULL_RTX;
5831 /* Return an SVE predicate register that contains the VNx16BImode
5832 constant in BUILDER, without going through the move expanders.
5834 The returned register can have whatever mode seems most natural
5835 given the contents of BUILDER. Use TARGET for the result if
5836 convenient. */
5838 static rtx
5839 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5841 /* Try loading the constant using pure predicate operations. */
5842 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5843 return res;
5845 /* Try forcing the constant to memory. */
5846 if (builder.full_nelts ().is_constant ())
5847 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5849 target = aarch64_target_reg (target, VNx16BImode);
5850 emit_move_insn (target, mem);
5851 return target;
5854 /* The last resort is to load the constant as an integer and then
5855 compare it against zero. Use -1 for set bits in order to increase
5856 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5857 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5858 builder.nelts_per_pattern ());
5859 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5860 int_builder.quick_push (INTVAL (builder.elt (i))
5861 ? constm1_rtx : const0_rtx);
5862 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5863 int_builder.build ());
5866 /* Set DEST to immediate IMM. */
5868 void
5869 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5871 machine_mode mode = GET_MODE (dest);
5873 /* Check on what type of symbol it is. */
5874 scalar_int_mode int_mode;
5875 if ((SYMBOL_REF_P (imm)
5876 || LABEL_REF_P (imm)
5877 || GET_CODE (imm) == CONST
5878 || GET_CODE (imm) == CONST_POLY_INT)
5879 && is_a <scalar_int_mode> (mode, &int_mode))
5881 rtx mem;
5882 poly_int64 offset;
5883 HOST_WIDE_INT const_offset;
5884 enum aarch64_symbol_type sty;
5886 /* If we have (const (plus symbol offset)), separate out the offset
5887 before we start classifying the symbol. */
5888 rtx base = strip_offset (imm, &offset);
5890 /* We must always add an offset involving VL separately, rather than
5891 folding it into the relocation. */
5892 if (!offset.is_constant (&const_offset))
5894 if (!TARGET_SVE)
5896 aarch64_report_sve_required ();
5897 return;
5899 if (base == const0_rtx
5900 && (aarch64_sve_cnt_immediate_p (offset)
5901 || aarch64_sve_rdvl_immediate_p (offset)))
5902 emit_insn (gen_rtx_SET (dest, imm));
5903 else
5905 /* Do arithmetic on 32-bit values if the result is smaller
5906 than that. */
5907 if (partial_subreg_p (int_mode, SImode))
5909 /* It is invalid to do symbol calculations in modes
5910 narrower than SImode. */
5911 gcc_assert (base == const0_rtx);
5912 dest = gen_lowpart (SImode, dest);
5913 int_mode = SImode;
5915 if (base != const0_rtx)
5917 base = aarch64_force_temporary (int_mode, dest, base);
5918 aarch64_add_offset (int_mode, dest, base, offset,
5919 NULL_RTX, NULL_RTX, 0, false);
5921 else
5922 aarch64_add_offset (int_mode, dest, base, offset,
5923 dest, NULL_RTX, 0, false);
5925 return;
5928 if (aarch64_rdsvl_immediate_p (base))
5930 /* We could handle non-constant offsets if they are ever
5931 generated. */
5932 gcc_assert (const_offset == 0);
5933 emit_insn (gen_rtx_SET (dest, imm));
5934 return;
5937 sty = aarch64_classify_symbol (base, const_offset);
5938 switch (sty)
5940 case SYMBOL_FORCE_TO_MEM:
5941 if (int_mode != ptr_mode)
5942 imm = convert_memory_address (ptr_mode, imm);
5944 if (const_offset != 0
5945 && targetm.cannot_force_const_mem (ptr_mode, imm))
5947 gcc_assert (can_create_pseudo_p ());
5948 base = aarch64_force_temporary (int_mode, dest, base);
5949 aarch64_add_offset (int_mode, dest, base, const_offset,
5950 NULL_RTX, NULL_RTX, 0, false);
5951 return;
5954 mem = force_const_mem (ptr_mode, imm);
5955 gcc_assert (mem);
5957 /* If we aren't generating PC relative literals, then
5958 we need to expand the literal pool access carefully.
5959 This is something that needs to be done in a number
5960 of places, so could well live as a separate function. */
5961 if (!aarch64_pcrelative_literal_loads)
5963 gcc_assert (can_create_pseudo_p ());
5964 base = gen_reg_rtx (ptr_mode);
5965 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5966 if (ptr_mode != Pmode)
5967 base = convert_memory_address (Pmode, base);
5968 mem = gen_rtx_MEM (ptr_mode, base);
5971 if (int_mode != ptr_mode)
5972 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5974 emit_insn (gen_rtx_SET (dest, mem));
5976 return;
5978 case SYMBOL_SMALL_TLSGD:
5979 case SYMBOL_SMALL_TLSDESC:
5980 case SYMBOL_SMALL_TLSIE:
5981 case SYMBOL_SMALL_GOT_28K:
5982 case SYMBOL_SMALL_GOT_4G:
5983 case SYMBOL_TINY_GOT:
5984 case SYMBOL_TINY_TLSIE:
5985 if (const_offset != 0)
5987 gcc_assert (can_create_pseudo_p ());
5988 base = aarch64_force_temporary (int_mode, dest, base);
5989 aarch64_add_offset (int_mode, dest, base, const_offset,
5990 NULL_RTX, NULL_RTX, 0, false);
5991 return;
5993 /* FALLTHRU */
5995 case SYMBOL_SMALL_ABSOLUTE:
5996 case SYMBOL_TINY_ABSOLUTE:
5997 case SYMBOL_TLSLE12:
5998 case SYMBOL_TLSLE24:
5999 case SYMBOL_TLSLE32:
6000 case SYMBOL_TLSLE48:
6001 aarch64_load_symref_appropriately (dest, imm, sty);
6002 return;
6004 default:
6005 gcc_unreachable ();
6009 if (!CONST_INT_P (imm))
6011 if (aarch64_sve_pred_mode_p (mode))
6013 /* Only the low bit of each .H, .S and .D element is defined,
6014 so we can set the upper bits to whatever we like. If the
6015 predicate is all-true in MODE, prefer to set all the undefined
6016 bits as well, so that we can share a single .B predicate for
6017 all modes. */
6018 if (imm == CONSTM1_RTX (mode))
6019 imm = CONSTM1_RTX (VNx16BImode);
6021 /* All methods for constructing predicate modes wider than VNx16BI
6022 will set the upper bits of each element to zero. Expose this
6023 by moving such constants as a VNx16BI, so that all bits are
6024 significant and so that constants for different modes can be
6025 shared. The wider constant will still be available as a
6026 REG_EQUAL note. */
6027 rtx_vector_builder builder;
6028 if (aarch64_get_sve_pred_bits (builder, imm))
6030 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6031 if (dest != res)
6032 emit_move_insn (dest, gen_lowpart (mode, res));
6033 return;
6037 if (GET_CODE (imm) == HIGH
6038 || aarch64_simd_valid_immediate (imm, NULL))
6040 emit_insn (gen_rtx_SET (dest, imm));
6041 return;
6044 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6045 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6047 if (dest != res)
6048 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6049 return;
6052 rtx mem = force_const_mem (mode, imm);
6053 gcc_assert (mem);
6054 emit_move_insn (dest, mem);
6055 return;
6058 aarch64_internal_mov_immediate (dest, imm, true, mode);
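/* Hedged example of the symbolic cases above (symbol name "foo" is
   purely illustrative): for a DImode move of a small-code-model absolute
   symbol, the SYMBOL_SMALL_ABSOLUTE path typically expands to

	adrp	x0, foo
	add	x0, x0, :lo12:foo

   whereas SYMBOL_FORCE_TO_MEM symbols are instead loaded with an LDR
   from the literal-pool entry built by force_const_mem.  */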
6061 /* Return the MEM rtx that provides the canary value that should be used
6062 for stack-smashing protection. MODE is the mode of the memory.
6063 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6064 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6065 indicates whether the caller is performing a SET or a TEST operation. */
6067 rtx
6068 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6069 aarch64_salt_type salt_type)
6071 rtx addr;
6072 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6074 gcc_assert (MEM_P (decl_rtl));
6075 addr = XEXP (decl_rtl, 0);
6076 poly_int64 offset;
6077 rtx base = strip_offset_and_salt (addr, &offset);
6078 if (!SYMBOL_REF_P (base))
6079 return decl_rtl;
6081 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6082 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6083 addr = gen_rtx_CONST (Pmode, addr);
6084 addr = plus_constant (Pmode, addr, offset);
6086 else
6088 /* Calculate the address from the system register. */
6089 rtx salt = GEN_INT (salt_type);
6090 addr = gen_reg_rtx (mode);
6091 if (mode == DImode)
6092 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6093 else
6095 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6096 addr = convert_memory_address (Pmode, addr);
6098 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6100 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
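/* Hedged illustration (assumed option values): with
   -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr_el0
   -mstack-protector-guard-offset=16, the non-SSP_GLOBAL path above reads
   the guard base from the system register and applies the offset, so the
   canary access becomes roughly

	mrs	x0, tpidr_el0
	ldr	x1, [x0, 16]

   where the MRS comes from the reg_stack_protect_address pattern.  */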
6103 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6104 that is known to contain PTRUE. */
6106 void
6107 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6109 expand_operand ops[3];
6110 machine_mode mode = GET_MODE (dest);
6111 create_output_operand (&ops[0], dest, mode);
6112 create_input_operand (&ops[1], pred, GET_MODE (pred));
6113 create_input_operand (&ops[2], src, mode);
6114 temporary_volatile_ok v (true);
6115 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6118 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6119 operand is in memory. In this case we need to use the predicated LD1
6120 and ST1 instead of LDR and STR, both for correctness on big-endian
6121 targets and because LD1 and ST1 support a wider range of addressing modes.
6122 PRED_MODE is the mode of the predicate.
6124 See the comment at the head of aarch64-sve.md for details about the
6125 big-endian handling. */
6127 void
6128 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6130 machine_mode mode = GET_MODE (dest);
6131 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6132 if (!register_operand (src, mode)
6133 && !register_operand (dest, mode))
6135 rtx tmp = gen_reg_rtx (mode);
6136 if (MEM_P (src))
6137 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6138 else
6139 emit_move_insn (tmp, src);
6140 src = tmp;
6142 aarch64_emit_sve_pred_move (dest, ptrue, src);
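/* Hedged example: a VNx16QI memory-to-memory move handled above would
   typically become

	ptrue	p0.b
	ld1b	{z0.b}, p0/z, [x1]
	st1b	{z0.b}, p0, [x0]

   rather than an LDR/STR pair, for the reasons given in the comment
   before the function.  */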
6145 /* Called only on big-endian targets. See whether an SVE vector move
6146 from SRC to DEST is effectively a REV[BHW] instruction, because at
6147 least one operand is a subreg of an SVE vector that has wider or
6148 narrower elements. Return true and emit the instruction if so.
6150 For example:
6152 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6154 represents a VIEW_CONVERT between the following vectors, viewed
6155 in memory order:
6157 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6158 R1: { [0], [1], [2], [3], ... }
6160 The high part of lane X in R2 should therefore correspond to lane X*2
6161 of R1, but the register representations are:
6163 msb lsb
6164 R2: ...... [1].high [1].low [0].high [0].low
6165 R1: ...... [3] [2] [1] [0]
6167 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6168 We therefore need a reverse operation to swap the high and low values
6169 around.
6171 This is purely an optimization. Without it we would spill the
6172 subreg operand to the stack in one mode and reload it in the
6173 other mode, which has the same effect as the REV. */
6175 bool
6176 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6178 gcc_assert (BYTES_BIG_ENDIAN);
6180 /* Do not try to optimize subregs that LRA has created for matched
6181 reloads. These subregs only exist as a temporary measure to make
6182 the RTL well-formed, but they are exempt from the usual
6183 TARGET_CAN_CHANGE_MODE_CLASS rules.
6185 For example, if we have:
6187 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6189 and the constraints require R1 and R2 to be in the same register,
6190 LRA may need to create RTL such as:
6192 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6193 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6194 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6196 which forces both the input and output of the original instruction
6197 to use the same hard register. But for this to work, the normal
6198 rules have to be suppressed on the subreg input, otherwise LRA
6199 would need to reload that input too, meaning that the process
6200 would never terminate. To compensate for this, the normal rules
6201 are also suppressed for the subreg output of the first move.
6202 Ignoring the special case and handling the first move normally
6203 would therefore generate wrong code: we would reverse the elements
6204 for the first subreg but not reverse them back for the second subreg. */
6205 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6206 dest = SUBREG_REG (dest);
6207 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6208 src = SUBREG_REG (src);
6210 /* The optimization handles two single SVE REGs with different element
6211 sizes. */
6212 if (!REG_P (dest)
6213 || !REG_P (src)
6214 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6215 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6216 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6217 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6218 return false;
6220 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6221 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6222 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6223 UNSPEC_REV_SUBREG);
6224 emit_insn (gen_rtx_SET (dest, unspec));
6225 return true;
6228 /* Return a copy of X with mode MODE, without changing its other
6229 attributes. Unlike gen_lowpart, this doesn't care whether the
6230 mode change is valid. */
6232 rtx
6233 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6235 if (GET_MODE (x) == mode)
6236 return x;
6238 x = shallow_copy_rtx (x);
6239 set_mode_and_regno (x, mode, REGNO (x));
6240 return x;
6243 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6244 stored in wider integer containers. */
6246 static unsigned int
6247 aarch64_sve_rev_unspec (machine_mode mode)
6249 switch (GET_MODE_UNIT_SIZE (mode))
6251 case 1: return UNSPEC_REVB;
6252 case 2: return UNSPEC_REVH;
6253 case 4: return UNSPEC_REVW;
6255 gcc_unreachable ();
6258 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6259 operands. */
6261 void
6262 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6264 /* Decide which REV operation we need. The mode with wider elements
6265 determines the mode of the operands and the mode with the narrower
6266 elements determines the reverse width. */
6267 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6268 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6269 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6270 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6271 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6273 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6274 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6276 /* Get the operands in the appropriate modes and emit the instruction. */
6277 ptrue = gen_lowpart (pred_mode, ptrue);
6278 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6279 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6280 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6281 dest, ptrue, src));
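/* Hedged example: splitting a big-endian subreg move between VNx8HI and
   VNx4SI values selects UNSPEC_REVH with .S containers, i.e. roughly

	revh	z0.s, p0/m, z1.s

   which swaps the two halfwords within each 32-bit container.  */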
6284 static bool
6285 aarch64_function_ok_for_sibcall (tree, tree exp)
6287 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6288 return false;
6290 tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
6291 if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
6292 return false;
6293 if (aarch64_fntype_pstate_za (fntype) != aarch64_cfun_incoming_pstate_za ())
6294 return false;
6295 return true;
6298 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6299 passed in SVE registers. */
6301 static bool
6302 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6303 const function_arg_info &arg)
6305 HOST_WIDE_INT size;
6306 machine_mode dummymode;
6307 int nregs;
6309 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6310 if (arg.mode == BLKmode && arg.type)
6311 size = int_size_in_bytes (arg.type);
6312 else
6313 /* No frontends can create types with variable-sized modes, so we
6314 shouldn't be asked to pass or return them. */
6315 size = GET_MODE_SIZE (arg.mode).to_constant ();
6317 /* Aggregates are passed by reference based on their size. */
6318 if (arg.aggregate_type_p ())
6319 size = int_size_in_bytes (arg.type);
6321 /* Variable-sized arguments are always passed by reference. */
6322 if (size < 0)
6323 return true;
6325 /* Can this be a candidate to be passed in fp/simd register(s)? */
6326 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6327 &dummymode, &nregs, NULL,
6328 !pcum || pcum->silent_p))
6329 return false;
6331 /* Arguments which are variable sized or larger than 2 registers are
6332 passed by reference unless they are a homogeneous floating-point
6333 aggregate. */
6334 return size > 2 * UNITS_PER_WORD;
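/* Hedged illustration of the size rule above (assumed-typical AAPCS64
   behaviour):

     struct big { long x[4]; };		// 32 bytes, not an HFA:
					//   passed by reference
     struct hfa { double a, b, c, d; };	// homogeneous FP aggregate:
					//   passed by value in d0-d3
					//   when registers are available
*/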
6337 /* Implement TARGET_PASS_BY_REFERENCE. */
6339 static bool
6340 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6341 const function_arg_info &arg)
6343 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6345 if (!arg.type)
6346 return aarch64_pass_by_reference_1 (pcum, arg);
6348 pure_scalable_type_info pst_info;
6349 switch (pst_info.analyze (arg.type))
6351 case pure_scalable_type_info::IS_PST:
6352 if (pcum && !pcum->silent_p && !TARGET_SVE)
6353 /* We can't gracefully recover at this point, so make this a
6354 fatal error. */
6355 fatal_error (input_location, "arguments of type %qT require"
6356 " the SVE ISA extension", arg.type);
6358 /* Variadic SVE types are passed by reference. Normal non-variadic
6359 arguments are too if we've run out of registers. */
6360 return (!arg.named
6361 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6362 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6364 case pure_scalable_type_info::DOESNT_MATTER:
6365 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6366 return true;
6368 case pure_scalable_type_info::NO_ABI_IDENTITY:
6369 case pure_scalable_type_info::ISNT_PST:
6370 return aarch64_pass_by_reference_1 (pcum, arg);
6372 gcc_unreachable ();
6375 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6376 static bool
6377 aarch64_return_in_msb (const_tree valtype)
6379 machine_mode dummy_mode;
6380 int dummy_int;
6382 /* Never happens in little-endian mode. */
6383 if (!BYTES_BIG_ENDIAN)
6384 return false;
6386 /* Only composite types no larger than 16 bytes can potentially
6387 be returned in registers. */
6388 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6389 || int_size_in_bytes (valtype) <= 0
6390 || int_size_in_bytes (valtype) > 16)
6391 return false;
6393 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6394 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6395 is always passed/returned in the least significant bits of fp/simd
6396 register(s). */
6397 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6398 &dummy_mode, &dummy_int, NULL,
6399 false))
6400 return false;
6402 /* Likewise pure scalable types for SVE vector and predicate registers. */
6403 pure_scalable_type_info pst_info;
6404 if (pst_info.analyze_registers (valtype))
6405 return false;
6407 return true;
6410 /* Implement TARGET_FUNCTION_VALUE.
6411 Define how to find the value returned by a function. */
6413 static rtx
6414 aarch64_function_value (const_tree type, const_tree func,
6415 bool outgoing ATTRIBUTE_UNUSED)
6417 machine_mode mode;
6418 int unsignedp;
6420 mode = TYPE_MODE (type);
6421 if (INTEGRAL_TYPE_P (type))
6422 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6424 pure_scalable_type_info pst_info;
6425 if (type && pst_info.analyze_registers (type))
6426 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6428 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6429 are returned in memory, not by value. */
6430 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6431 bool sve_p = (vec_flags & VEC_ANY_SVE);
6433 if (aarch64_return_in_msb (type))
6435 HOST_WIDE_INT size = int_size_in_bytes (type);
6437 if (size % UNITS_PER_WORD != 0)
6439 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6440 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6444 int count;
6445 machine_mode ag_mode;
6446 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6447 NULL, false))
6449 gcc_assert (!sve_p);
6450 if (!aarch64_composite_type_p (type, mode))
6452 gcc_assert (count == 1 && mode == ag_mode);
6453 return gen_rtx_REG (mode, V0_REGNUM);
6455 else if (aarch64_advsimd_full_struct_mode_p (mode)
6456 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6457 return gen_rtx_REG (mode, V0_REGNUM);
6458 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6459 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6460 return gen_rtx_REG (mode, V0_REGNUM);
6461 else
6463 int i;
6464 rtx par;
6466 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6467 for (i = 0; i < count; i++)
6469 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6470 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6471 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6472 XVECEXP (par, 0, i) = tmp;
6474 return par;
6477 else
6479 if (sve_p)
6481 /* Vector types can acquire a partial SVE mode using things like
6482 __attribute__((vector_size(N))), and this is potentially useful.
6483 However, the choice of mode doesn't affect the type's ABI
6484 identity, so we should treat the types as though they had
6485 the associated integer mode, just like they did before SVE
6486 was introduced.
6488 We know that the vector must be 128 bits or smaller,
6489 otherwise we'd have returned it in memory instead. */
6490 gcc_assert (type
6491 && (aarch64_some_values_include_pst_objects_p (type)
6492 || (vec_flags & VEC_PARTIAL)));
6494 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6495 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6496 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6497 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6499 return gen_rtx_REG (mode, R0_REGNUM);
6503 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
6504 Return true if REGNO is the number of a hard register in which the value
6505 of a called function may come back. */
6507 static bool
6508 aarch64_function_value_regno_p (const unsigned int regno)
6510 /* A maximum of 16 bytes can be returned in the general registers. Examples
6511 of 16-byte return values are: 128-bit integers and 16-byte small
6512 structures (excluding homogeneous floating-point aggregates). */
6513 if (regno == R0_REGNUM || regno == R1_REGNUM)
6514 return true;
6516 /* Up to four fp/simd registers can return a function value, e.g. a
6517 homogeneous floating-point aggregate having four members. */
6518 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6519 return TARGET_FLOAT;
6521 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
6522 return TARGET_SVE;
6524 return false;
6527 /* Subroutine for aarch64_return_in_memory for types that are not returned
6528 in SVE registers. */
6530 static bool
6531 aarch64_return_in_memory_1 (const_tree type)
6533 HOST_WIDE_INT size;
6534 machine_mode ag_mode;
6535 int count;
6537 if (!AGGREGATE_TYPE_P (type)
6538 && TREE_CODE (type) != COMPLEX_TYPE
6539 && TREE_CODE (type) != VECTOR_TYPE)
6540 /* Simple scalar types are always returned in registers. */
6541 return false;
6543 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6544 &ag_mode, &count, NULL, false))
6545 return false;
6547 /* Types larger than 2 registers are returned in memory. */
6548 size = int_size_in_bytes (type);
6549 return (size < 0 || size > 2 * UNITS_PER_WORD);
6552 /* Implement TARGET_RETURN_IN_MEMORY.
6554 If the type T of the result of a function is such that
6555 void func (T arg)
6556 would require that arg be passed as a value in a register (or set of
6557 registers) according to the parameter passing rules, then the result
6558 is returned in the same registers as would be used for such an
6559 argument. */
6561 static bool
6562 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6564 pure_scalable_type_info pst_info;
6565 switch (pst_info.analyze (type))
6567 case pure_scalable_type_info::IS_PST:
6568 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6569 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6571 case pure_scalable_type_info::DOESNT_MATTER:
6572 gcc_assert (aarch64_return_in_memory_1 (type));
6573 return true;
6575 case pure_scalable_type_info::NO_ABI_IDENTITY:
6576 case pure_scalable_type_info::ISNT_PST:
6577 return aarch64_return_in_memory_1 (type);
6579 gcc_unreachable ();
6582 static bool
6583 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6584 const_tree type, int *nregs)
6586 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6587 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6588 &pcum->aapcs_vfp_rmode,
6589 nregs, NULL, pcum->silent_p);
6592 /* Given MODE and TYPE of a function argument, return the alignment in
6593 bits. The idea is to suppress any stronger alignment requested by
6594 the user and opt for the natural alignment (specified in AAPCS64 \S
6595 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6596 was incorrectly calculated in versions of GCC prior to GCC 9.
6597 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6598 calculated in versions between GCC 9 and GCC 13. If the alignment
6599 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6600 is the old GCC 13 alignment, otherwise it is zero.
6602 This is a helper function for local use only. */
6604 static unsigned int
6605 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6606 unsigned int *abi_break_gcc_9,
6607 unsigned int *abi_break_gcc_13,
6608 unsigned int *abi_break_gcc_14)
6610 *abi_break_gcc_9 = 0;
6611 *abi_break_gcc_13 = 0;
6612 *abi_break_gcc_14 = 0;
6613 if (!type)
6614 return GET_MODE_ALIGNMENT (mode);
6616 if (integer_zerop (TYPE_SIZE (type)))
6617 return 0;
6619 gcc_assert (TYPE_MODE (type) == mode);
6621 if (!AGGREGATE_TYPE_P (type))
6623 /* The ABI alignment is the natural alignment of the type, without
6624 any attributes applied. Normally this is the alignment of the
6625 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6626 For now we just handle the known exceptions explicitly. */
6627 type = TYPE_MAIN_VARIANT (type);
6628 if (POINTER_TYPE_P (type))
6630 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
6631 return POINTER_SIZE;
6633 if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
6635 *abi_break_gcc_14 = TYPE_ALIGN (type);
6636 type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
6638 gcc_assert (!TYPE_USER_ALIGN (type));
6639 return TYPE_ALIGN (type);
6642 if (TREE_CODE (type) == ARRAY_TYPE)
6643 return TYPE_ALIGN (TREE_TYPE (type));
6645 unsigned int alignment = 0;
6646 unsigned int bitfield_alignment_with_packed = 0;
6647 unsigned int bitfield_alignment = 0;
6648 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6649 if (TREE_CODE (field) == FIELD_DECL)
6651 /* Note that we explicitly consider zero-sized fields here,
6652 even though they don't map to AAPCS64 machine types.
6653 For example, in:
6655 struct __attribute__((aligned(8))) empty {};
6657 struct s {
6658 [[no_unique_address]] empty e;
6659 int x;
6662 "s" contains only one Fundamental Data Type (the int field)
6663 but gains 8-byte alignment and size thanks to "e". */
6664 alignment = std::max (alignment, DECL_ALIGN (field));
6665 if (DECL_BIT_FIELD_TYPE (field))
6667 /* Take the bit-field type's alignment into account only
6668 if the user didn't reduce this field's alignment with
6669 the packed attribute. */
6670 if (!DECL_PACKED (field))
6671 bitfield_alignment
6672 = std::max (bitfield_alignment,
6673 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6675 /* Compute the alignment even if the bit-field is
6676 packed, so that we can emit a warning in case the
6677 alignment changed between GCC versions. */
6678 bitfield_alignment_with_packed
6679 = std::max (bitfield_alignment_with_packed,
6680 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6684 /* Emit a warning if the alignment is different when taking the
6685 'packed' attribute into account. */
6686 if (bitfield_alignment != bitfield_alignment_with_packed
6687 && bitfield_alignment_with_packed > alignment)
6688 *abi_break_gcc_13 = bitfield_alignment_with_packed;
6690 if (bitfield_alignment > alignment)
6692 *abi_break_gcc_9 = alignment;
6693 return bitfield_alignment;
6696 return alignment;
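/* Illustrative example (not from the original source): for

     struct bf { unsigned __int128 x : 1; };

   the bit-field's underlying type is 16-byte aligned, so the AAPCS64
   argument alignment of "bf" is 16 bytes even though the field itself is
   a single bit.  GCC releases before 9.1 ignored the underlying type and
   computed a smaller alignment, which is what *abi_break_gcc_9 reports;
   marking the field packed gives the GCC 9-to-12 over-alignment case
   reported through *abi_break_gcc_13.  */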
6699 /* Layout a function argument according to the AAPCS64 rules. The rule
6700 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6701 mode that was originally given to us by the target hook, whereas the
6702 mode in ARG might be the result of replacing partial SVE modes with
6703 the equivalent integer mode. */
6705 static void
6706 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6708 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6709 tree type = arg.type;
6710 machine_mode mode = arg.mode;
6711 int ncrn, nvrn, nregs;
6712 bool allocate_ncrn, allocate_nvrn;
6713 HOST_WIDE_INT size;
6714 unsigned int abi_break_gcc_9;
6715 unsigned int abi_break_gcc_13;
6716 unsigned int abi_break_gcc_14;
6718 /* We need to do this once per argument. */
6719 if (pcum->aapcs_arg_processed)
6720 return;
6722 bool warn_pcs_change
6723 = (warn_psabi
6724 && !pcum->silent_p
6725 && (currently_expanding_function_start
6726 || currently_expanding_gimple_stmt));
6728 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6730 typedef struct foo {
6731 __Int8x16_t foo[2] __attribute__((aligned(32)));
6732 } foo;
6734 is still a HVA despite its larger-than-normal alignment.
6735 However, such over-aligned HFAs and HVAs are guaranteed to have
6736 no padding.
6738 If we exclude HFAs and HVAs from the discussion below, then there
6739 are several things to note:
6741 - Both the C and AAPCS64 interpretations of a type's alignment should
6742 give a value that is no greater than the type's size.
6744 - Types bigger than 16 bytes are passed indirectly.
6746 - If an argument of type T is passed indirectly, TYPE and MODE describe
6747 a pointer to T rather than T itself.
6749 It follows that the AAPCS64 alignment of TYPE must be no greater
6750 than 16 bytes.
6752 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6753 and so could calculate an alignment that was too small. If this
6754 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6756 Although GCC 9.1 fixed that bug, it introduced a different one:
6757 it would consider the alignment of a bitfield's underlying type even
6758 if the field was packed (which should have the effect of overriding
6759 the alignment of the underlying type). This was fixed in GCC 13.1.
6761 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6762 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6763 this older, too-big alignment.
6765 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6766 alignments meant they could calculate type alignments that were
6767 bigger than the type's size, contrary to the assumption above.
6768 The handling of register arguments was nevertheless (and justifiably)
6769 written to follow the assumption that the alignment can never be
6770 greater than the size. The same was not true for stack arguments;
6771 their alignment was instead handled by MIN bounds in
6772 aarch64_function_arg_boundary.
6774 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6775 an alignment of more than 16 bytes for TYPE then:
6777 - If the argument was passed in registers, these GCC versions
6778 would treat the alignment as though it was *less than* 16 bytes.
6780 - If the argument was passed on the stack, these GCC versions
6781 would treat the alignment as though it was *equal to* 16 bytes.
6783 Both behaviors were wrong, but in different cases. */
6785 pcum->aapcs_arg_processed = true;
6787 pure_scalable_type_info pst_info;
6788 if (type && pst_info.analyze_registers (type))
6790 /* aarch64_function_arg_alignment has never had an effect on
6791 this case. */
6793 /* The PCS says that it is invalid to pass an SVE value to an
6794 unprototyped function. There is no ABI-defined location we
6795 can return in this case, so we have no real choice but to raise
6796 an error immediately, even though this is only a query function. */
6797 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6799 gcc_assert (!pcum->silent_p);
6800 error ("SVE type %qT cannot be passed to an unprototyped function",
6801 arg.type);
6802 /* Avoid repeating the message, and avoid tripping the assert
6803 below. */
6804 pcum->pcs_variant = ARM_PCS_SVE;
6807 /* We would have converted the argument into pass-by-reference
6808 form if it didn't fit in registers. */
6809 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6810 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6811 gcc_assert (arg.named
6812 && pcum->pcs_variant == ARM_PCS_SVE
6813 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6814 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6815 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6816 P0_REGNUM + pcum->aapcs_nprn);
6817 return;
6820 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6821 are passed by reference, not by value. */
6822 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6823 bool sve_p = (vec_flags & VEC_ANY_SVE);
6824 if (sve_p)
6825 /* Vector types can acquire a partial SVE mode using things like
6826 __attribute__((vector_size(N))), and this is potentially useful.
6827 However, the choice of mode doesn't affect the type's ABI
6828 identity, so we should treat the types as though they had
6829 the associated integer mode, just like they did before SVE
6830 was introduced.
6832 We know that the vector must be 128 bits or smaller,
6833 otherwise we'd have passed it in memory instead. */
6834 gcc_assert (type
6835 && (aarch64_some_values_include_pst_objects_p (type)
6836 || (vec_flags & VEC_PARTIAL)));
6838 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
6839 if (type)
6840 size = int_size_in_bytes (type);
6841 else
6842 /* No frontends can create types with variable-sized modes, so we
6843 shouldn't be asked to pass or return them. */
6844 size = GET_MODE_SIZE (mode).to_constant ();
6845 size = ROUND_UP (size, UNITS_PER_WORD);
6847 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6848 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6849 mode,
6850 type,
6851 &nregs);
6852 gcc_assert (!sve_p || !allocate_nvrn);
6854 unsigned int alignment
6855 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
6856 &abi_break_gcc_13, &abi_break_gcc_14);
6858 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
6859 && (!alignment || abi_break_gcc_9 < alignment)
6860 && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
6862 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6863 The following code thus handles passing by SIMD/FP registers first. */
6865 nvrn = pcum->aapcs_nvrn;
6867 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
6868 and homogeneous short-vector aggregates (HVA). */
6869 if (allocate_nvrn)
6871 /* aarch64_function_arg_alignment has never had an effect on
6872 this case. */
6873 if (!pcum->silent_p && !TARGET_FLOAT)
6874 aarch64_err_no_fpadvsimd (mode);
6876 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6878 pcum->aapcs_nextnvrn = nvrn + nregs;
6879 if (!aarch64_composite_type_p (type, mode))
6881 gcc_assert (nregs == 1);
6882 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6884 else if (aarch64_advsimd_full_struct_mode_p (mode)
6885 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
6886 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6887 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6888 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
6889 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6890 else
6892 rtx par;
6893 int i;
6894 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6895 for (i = 0; i < nregs; i++)
6897 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6898 V0_REGNUM + nvrn + i);
6899 rtx offset = gen_int_mode
6900 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6901 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6902 XVECEXP (par, 0, i) = tmp;
6904 pcum->aapcs_reg = par;
6906 return;
6908 else
6910 /* C.3 NSRN is set to 8. */
6911 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6912 goto on_stack;
6916 ncrn = pcum->aapcs_ncrn;
6917 nregs = size / UNITS_PER_WORD;
6919 /* C6 - C9, though the sign and zero extension semantics are
6920 handled elsewhere. This is the case where the argument fits
6921 entirely in general registers. */
6922 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6924 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6926 /* C.8 if the argument has an alignment of 16 then the NGRN is
6927 rounded up to the next even number. */
6928 if (nregs == 2
6929 && ncrn % 2)
6931 /* Emit a warning if the alignment changed when taking the
6932 'packed' attribute into account. */
6933 if (warn_pcs_change
6934 && abi_break_gcc_13
6935 && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
6936 != (alignment == 16 * BITS_PER_UNIT)))
6937 inform (input_location, "parameter passing for argument of type "
6938 "%qT changed in GCC 13.1", type);
6940 if (warn_pcs_change
6941 && abi_break_gcc_14
6942 && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
6943 != (alignment == 16 * BITS_PER_UNIT)))
6944 inform (input_location, "parameter passing for argument of type "
6945 "%qT changed in GCC 14.1", type);
6947 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
6948 comparison is there because for > 16 * BITS_PER_UNIT
6949 alignment nregs should be > 2 and therefore it should be
6950 passed by reference rather than value. */
6951 if (alignment == 16 * BITS_PER_UNIT)
6953 if (warn_pcs_change && abi_break_gcc_9)
6954 inform (input_location, "parameter passing for argument of type "
6955 "%qT changed in GCC 9.1", type);
6956 ++ncrn;
6957 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
6961 /* If an argument with an SVE mode needs to be shifted up to the
6962 high part of the register, treat it as though it had an integer mode.
6963 Using the normal (parallel [...]) would suppress the shifting. */
6964 if (sve_p
6965 && BYTES_BIG_ENDIAN
6966 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6967 && aarch64_pad_reg_upward (mode, type, false))
6969 mode = int_mode_for_mode (mode).require ();
6970 sve_p = false;
6973 /* NREGS can be 0 when e.g. an empty structure is to be passed.
6974 A reg is still generated for it, but the caller should be smart
6975 enough not to use it. */
6976 if (nregs == 0
6977 || (nregs == 1 && !sve_p)
6978 || GET_MODE_CLASS (mode) == MODE_INT)
6979 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
6980 else
6982 rtx par;
6983 int i;
6985 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6986 for (i = 0; i < nregs; i++)
6988 scalar_int_mode reg_mode = word_mode;
6989 if (nregs == 1)
6990 reg_mode = int_mode_for_mode (mode).require ();
6991 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
6992 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6993 GEN_INT (i * UNITS_PER_WORD));
6994 XVECEXP (par, 0, i) = tmp;
6996 pcum->aapcs_reg = par;
6999 pcum->aapcs_nextncrn = ncrn + nregs;
7000 return;
7003 /* C.11 */
7004 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7006 /* The argument is passed on stack; record the needed number of words for
7007 this argument and align the total size if necessary. */
7008 on_stack:
7009 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7011 if (warn_pcs_change
7012 && abi_break_gcc_13
7013 && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
7014 != (alignment >= 16 * BITS_PER_UNIT)))
7015 inform (input_location, "parameter passing for argument of type "
7016 "%qT changed in GCC 13.1", type);
7018 if (warn_pcs_change
7019 && abi_break_gcc_14
7020 && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
7021 != (alignment >= 16 * BITS_PER_UNIT)))
7022 inform (input_location, "parameter passing for argument of type "
7023 "%qT changed in GCC 14.1", type);
7025 if (alignment == 16 * BITS_PER_UNIT)
7027 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7028 if (pcum->aapcs_stack_size != new_size)
7030 if (warn_pcs_change && abi_break_gcc_9)
7031 inform (input_location, "parameter passing for argument of type "
7032 "%qT changed in GCC 9.1", type);
7033 pcum->aapcs_stack_size = new_size;
7036 return;
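/* Hedged example of the C.8 handling above: for a call such as

     void f (int a, __int128 b);

   "a" is passed in w0 and "b", whose AAPCS64 alignment is 16 bytes, must
   start at an even-numbered GPR, so it goes in the x2/x3 pair and x1 is
   left unused.  The same 16-byte rule rounds up the stack slot in the
   on_stack path.  */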
7039 /* Add the current argument register to the set of those that need
7040 to be saved and restored around a change to PSTATE.SM. */
7042 static void
7043 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7045 subrtx_var_iterator::array_type array;
7046 FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
7048 rtx x = *iter;
7049 if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
7051 unsigned int i = pcum->num_sme_mode_switch_args++;
7052 gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
7053 pcum->sme_mode_switch_args[i] = x;
7058 /* Return a parallel that contains all the registers that need to be
7059 saved around a change to PSTATE.SM. Return const0_rtx if there is
7060 no such mode switch, or if no registers need to be saved. */
7062 static rtx
7063 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7065 if (!pcum->num_sme_mode_switch_args)
7066 return const0_rtx;
7068 auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
7069 pcum->sme_mode_switch_args);
7070 return gen_rtx_PARALLEL (VOIDmode, argvec);
7073 /* Implement TARGET_FUNCTION_ARG. */
7075 static rtx
7076 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7078 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7079 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7080 || pcum->pcs_variant == ARM_PCS_SIMD
7081 || pcum->pcs_variant == ARM_PCS_SVE);
7083 if (arg.end_marker_p ())
7085 rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
7086 pcum->pcs_variant);
7087 rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
7088 rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
7089 rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
7090 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
7091 sme_mode_switch_args,
7092 shared_za_flags,
7093 shared_zt0_flags));
7096 aarch64_layout_arg (pcum_v, arg);
7097 return pcum->aapcs_reg;
7100 void
7101 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7102 const_tree fntype,
7103 rtx libname ATTRIBUTE_UNUSED,
7104 const_tree fndecl,
7105 unsigned n_named ATTRIBUTE_UNUSED,
7106 bool silent_p)
7108 pcum->aapcs_ncrn = 0;
7109 pcum->aapcs_nvrn = 0;
7110 pcum->aapcs_nprn = 0;
7111 pcum->aapcs_nextncrn = 0;
7112 pcum->aapcs_nextnvrn = 0;
7113 pcum->aapcs_nextnprn = 0;
7114 if (fntype)
7116 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7117 pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
7119 else
7121 pcum->pcs_variant = ARM_PCS_AAPCS64;
7122 pcum->isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
7124 pcum->aapcs_reg = NULL_RTX;
7125 pcum->aapcs_arg_processed = false;
7126 pcum->aapcs_stack_words = 0;
7127 pcum->aapcs_stack_size = 0;
7128 pcum->silent_p = silent_p;
7129 pcum->shared_za_flags
7130 = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
7131 pcum->shared_zt0_flags
7132 = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
7133 pcum->num_sme_mode_switch_args = 0;
7135 if (!silent_p
7136 && !TARGET_FLOAT
7137 && fntype && fntype != error_mark_node)
7139 const_tree type = TREE_TYPE (fntype);
7140 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7141 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7142 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7143 &mode, &nregs, NULL, false))
7144 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7147 if (!silent_p
7148 && !TARGET_SVE
7149 && pcum->pcs_variant == ARM_PCS_SVE)
7151 /* We can't gracefully recover at this point, so make this a
7152 fatal error. */
7153 if (fndecl)
7154 fatal_error (input_location, "%qE requires the SVE ISA extension",
7155 fndecl);
7156 else
7157 fatal_error (input_location, "calls to functions of type %qT require"
7158 " the SVE ISA extension", fntype);
7162 static void
7163 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7164 const function_arg_info &arg)
7166 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7167 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7168 || pcum->pcs_variant == ARM_PCS_SIMD
7169 || pcum->pcs_variant == ARM_PCS_SVE)
7171 aarch64_layout_arg (pcum_v, arg);
7172 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7173 != (pcum->aapcs_stack_words != 0));
7174 if (pcum->aapcs_reg
7175 && aarch64_call_switches_pstate_sm (pcum->isa_mode))
7176 aarch64_record_sme_mode_switch_args (pcum);
7178 pcum->aapcs_arg_processed = false;
7179 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7180 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7181 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7182 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7183 pcum->aapcs_stack_words = 0;
7184 pcum->aapcs_reg = NULL_RTX;
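/* Return true if REGNO is the number of a hard register in which function
   arguments can be passed, i.e. one of the GPR, FP/SIMD or SVE predicate
   argument registers.  */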
7188 bool
7189 aarch64_function_arg_regno_p (unsigned regno)
7191 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7192 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7193 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7196 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7197 PARM_BOUNDARY bits of alignment, but will be given anything up
7198 to STACK_BOUNDARY bits if the type requires it. This makes sure
7199 that both before and after the layout of each argument, the Next
7200 Stacked Argument Address (NSAA) will have a minimum alignment of
7201 8 bytes. */
7203 static unsigned int
7204 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7206 unsigned int abi_break_gcc_9;
7207 unsigned int abi_break_gcc_13;
7208 unsigned int abi_break_gcc_14;
7209 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7210 &abi_break_gcc_9,
7211 &abi_break_gcc_13,
7212 &abi_break_gcc_14);
7213 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7214 to emit warnings about ABI incompatibility. */
7215 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7216 return alignment;
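/* For example, given the clamping above and assuming the usual AArch64
   values PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128: a 4-byte-aligned
   int is promoted to 64-bit alignment, a 16-byte-aligned vector keeps its
   128-bit alignment, and any over-alignment beyond 16 bytes is clamped
   back down to 128 bits.  */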
7219 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7221 static fixed_size_mode
7222 aarch64_get_reg_raw_mode (int regno)
7224 if (TARGET_SVE && FP_REGNUM_P (regno))
7225 /* Don't use the SVE part of the register for __builtin_apply and
7226 __builtin_return. The SVE registers aren't used by the normal PCS,
7227 so using them there would be a waste of time. The PCS extensions
7228 for SVE types are fundamentally incompatible with the
7229 __builtin_return/__builtin_apply interface. */
7230 return as_a <fixed_size_mode> (V16QImode);
7231 if (PR_REGNUM_P (regno))
7232 /* For SVE PR regs, indicate that they should be ignored for
7233 __builtin_apply/__builtin_return. */
7234 return as_a <fixed_size_mode> (VOIDmode);
7235 return default_get_reg_raw_mode (regno);
7238 /* Implement TARGET_FUNCTION_ARG_PADDING.
7240 Small aggregate types are placed at the lowest memory address.
7242 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7244 static pad_direction
7245 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7247 /* On little-endian targets, the least significant byte of every stack
7248 argument is passed at the lowest byte address of the stack slot. */
7249 if (!BYTES_BIG_ENDIAN)
7250 return PAD_UPWARD;
7252 /* Otherwise, integral, floating-point and pointer types are padded downward:
7253 the least significant byte of a stack argument is passed at the highest
7254 byte address of the stack slot. */
7255 if (type
7256 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7257 || POINTER_TYPE_P (type))
7258 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7259 return PAD_DOWNWARD;
7261 /* Everything else is padded upward, i.e. the data starts at the first byte of the stack slot. */
7262 return PAD_UPWARD;
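/* For example, under the rules above, on a big-endian target a 3-byte
   structure passed on the stack occupies the lowest three bytes of its
   stack slot (PAD_UPWARD), whereas a short occupies the highest two
   bytes of its slot (PAD_DOWNWARD).  */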
7265 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7267 It specifies the padding for the last (possibly the only)
7268 element of a block move between registers and memory. Assuming
7269 the block is in memory, upward padding means that the last
7270 element is padded after its most significant byte, while with
7271 downward padding the last element is padded on its least
7272 significant byte side.
7274 Small aggregates and small complex types are always padded
7275 upwards.
7277 We don't need to worry about homogeneous floating-point or
7278 short-vector aggregates; their move is not affected by the
7279 padding direction determined here. Regardless of endianness,
7280 each element of such an aggregate is put in the least
7281 significant bits of a fp/simd register.
7283 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7284 register has useful data, and return the opposite if the most
7285 significant byte does. */
7287 bool
7288 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7289 bool first ATTRIBUTE_UNUSED)
7292 /* Aside from pure scalable types, small composite types are always
7293 padded upward. */
7294 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7296 HOST_WIDE_INT size;
7297 if (type)
7298 size = int_size_in_bytes (type);
7299 else
7300 /* No frontends can create types with variable-sized modes, so we
7301 shouldn't be asked to pass or return them. */
7302 size = GET_MODE_SIZE (mode).to_constant ();
7303 if (size < 2 * UNITS_PER_WORD)
7305 pure_scalable_type_info pst_info;
7306 if (pst_info.analyze_registers (type))
7307 return false;
7308 return true;
7312 /* Otherwise, use the default padding. */
7313 return !BYTES_BIG_ENDIAN;
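/* Return the mode used for the result of libgcc comparison routines
   (the TARGET_LIBGCC_CMP_RETURN_MODE hook): always SImode on AArch64.  */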
7316 static scalar_int_mode
7317 aarch64_libgcc_cmp_return_mode (void)
7319 return SImode;
7322 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7324 /* We use the 12-bit shifted immediate arithmetic instructions so values
7325 must be multiple of (1 << 12), i.e. 4096. */
7326 #define ARITH_FACTOR 4096
7328 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7329 #error Cannot use simple address calculation for stack probing
7330 #endif
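/* With the usual default of STACK_CHECK_PROBE_INTERVAL_EXP == 12,
   PROBE_INTERVAL is 4096 bytes and therefore a multiple of ARITH_FACTOR,
   so the check above is satisfied.  */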
7332 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7333 inclusive. These are offsets from the current stack pointer. */
7335 static void
7336 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7338 HOST_WIDE_INT size;
7339 if (!poly_size.is_constant (&size))
7341 sorry ("stack probes for SVE frames");
7342 return;
7345 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7347 /* See the same assertion on PROBE_INTERVAL above. */
7348 gcc_assert ((first % ARITH_FACTOR) == 0);
7350 /* See if we have a constant small number of probes to generate. If so,
7351 that's the easy case. */
7352 if (size <= PROBE_INTERVAL)
7354 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7356 emit_set_insn (reg1,
7357 plus_constant (Pmode,
7358 stack_pointer_rtx, -(first + base)));
7359 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7362 /* The run-time loop is made up of 8 insns in the generic case while the
7363 compile-time loop is made up of 4+2*(n-2) insns, where n is the number of intervals. */
7364 else if (size <= 4 * PROBE_INTERVAL)
7366 HOST_WIDE_INT i, rem;
7368 emit_set_insn (reg1,
7369 plus_constant (Pmode,
7370 stack_pointer_rtx,
7371 -(first + PROBE_INTERVAL)));
7372 emit_stack_probe (reg1);
7374 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7375 it exceeds SIZE. If only two probes are needed, this will not
7376 generate any code. Then probe at FIRST + SIZE. */
7377 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7379 emit_set_insn (reg1,
7380 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7381 emit_stack_probe (reg1);
7384 rem = size - (i - PROBE_INTERVAL);
7385 if (rem > 256)
7387 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7389 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7390 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7392 else
7393 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7396 /* Otherwise, do the same as above, but in a loop. Note that we must be
7397 extra careful with variables wrapping around because we might be at
7398 the very top (or the very bottom) of the address space and we have
7399 to be able to handle this case properly; in particular, we use an
7400 equality test for the loop condition. */
7401 else
7403 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7405 /* Step 1: round SIZE to the previous multiple of the interval. */
7407 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7410 /* Step 2: compute initial and final value of the loop counter. */
7412 /* TEST_ADDR = SP + FIRST. */
7413 emit_set_insn (reg1,
7414 plus_constant (Pmode, stack_pointer_rtx, -first));
7416 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7417 HOST_WIDE_INT adjustment = - (first + rounded_size);
7418 if (! aarch64_uimm12_shift (adjustment))
7420 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7421 true, Pmode);
7422 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7424 else
7425 emit_set_insn (reg2,
7426 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7428 /* Step 3: the loop
7432      do  {  TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7433             probe at TEST_ADDR  }
7435      while (TEST_ADDR != LAST_ADDR)
7437    probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7438    until it is equal to ROUNDED_SIZE. */
7440 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7443 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7444 that SIZE is equal to ROUNDED_SIZE. */
7446 if (size != rounded_size)
7448 HOST_WIDE_INT rem = size - rounded_size;
7450 if (rem > 256)
7452 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7454 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7455 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7457 else
7458 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7462 /* Make sure nothing is scheduled before we are done. */
7463 emit_insn (gen_blockage ());
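/* As a concrete example of the middle case in aarch64_emit_probe_stack_range
   above: probing 10000 bytes with FIRST == 0 and PROBE_INTERVAL == 4096
   emits probes at SP - 4096, SP - 8192 and finally, via the rounded
   residual, at SP - 10000.  */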
7466 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7467 absolute addresses. */
7469 const char *
7470 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7472 static int labelno = 0;
7473 char loop_lab[32];
7474 rtx xops[2];
7476 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7478 /* Loop. */
7479 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7481 HOST_WIDE_INT stack_clash_probe_interval
7482 = 1 << param_stack_clash_protection_guard_size;
7484 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7485 xops[0] = reg1;
7486 HOST_WIDE_INT interval;
7487 if (flag_stack_clash_protection)
7488 interval = stack_clash_probe_interval;
7489 else
7490 interval = PROBE_INTERVAL;
7492 gcc_assert (aarch64_uimm12_shift (interval));
7493 xops[1] = GEN_INT (interval);
7495 output_asm_insn ("sub\t%0, %0, %1", xops);
7497 /* If doing stack clash protection then we probe up by the ABI specified
7498 amount. We do this because we're dropping full pages at a time in the
7499 loop. But if we're doing non-stack clash probing, probe at SP 0. */
7500 if (flag_stack_clash_protection)
7501 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7502 else
7503 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7505 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7506 by this amount for each iteration. */
7507 output_asm_insn ("str\txzr, [%0, %1]", xops);
7509 /* Test if TEST_ADDR == LAST_ADDR. */
7510 xops[1] = reg2;
7511 output_asm_insn ("cmp\t%0, %1", xops);
7513 /* Branch. */
7514 fputs ("\tb.ne\t", asm_out_file);
7515 assemble_name_raw (asm_out_file, loop_lab);
7516 fputc ('\n', asm_out_file);
7518 return "";
7521 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7522 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7523 of GUARD_SIZE. When a probe is emitted it is done at most
7524 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7525 at most MIN_PROBE_THRESHOLD. By the end of this function
7526 BASE = BASE - ADJUSTMENT. */
7528 const char *
7529 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7530 rtx min_probe_threshold, rtx guard_size)
7532 /* This function is not allowed to use any instruction generation function
7533 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7534 so instead emit the code you want using output_asm_insn. */
7535 gcc_assert (flag_stack_clash_protection);
7536 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7537 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7539 /* The minimum required allocation before the residual requires probing. */
7540 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7542 /* Clamp the value down to the nearest value that can be used with a cmp. */
7543 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7544 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7546 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7547 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7549 static int labelno = 0;
7550 char loop_start_lab[32];
7551 char loop_end_lab[32];
7552 rtx xops[2];
7554 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7555 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7557 /* Emit loop start label. */
7558 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7560 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7561 xops[0] = adjustment;
7562 xops[1] = probe_offset_value_rtx;
7563 output_asm_insn ("cmp\t%0, %1", xops);
7565 /* Branch to end if not enough adjustment to probe. */
7566 fputs ("\tb.lt\t", asm_out_file);
7567 assemble_name_raw (asm_out_file, loop_end_lab);
7568 fputc ('\n', asm_out_file);
7570 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7571 xops[0] = base;
7572 xops[1] = probe_offset_value_rtx;
7573 output_asm_insn ("sub\t%0, %0, %1", xops);
7575 /* Probe at BASE. */
7576 xops[1] = const0_rtx;
7577 output_asm_insn ("str\txzr, [%0, %1]", xops);
7579 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7580 xops[0] = adjustment;
7581 xops[1] = probe_offset_value_rtx;
7582 output_asm_insn ("sub\t%0, %0, %1", xops);
7584 /* Branch to start if still more bytes to allocate. */
7585 fputs ("\tb\t", asm_out_file);
7586 assemble_name_raw (asm_out_file, loop_start_lab);
7587 fputc ('\n', asm_out_file);
7589 /* Too little is left to need a probe; emit the loop end label and fall through. */
7590 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7592 /* BASE = BASE - ADJUSTMENT. */
7593 xops[0] = base;
7594 xops[1] = adjustment;
7595 output_asm_insn ("sub\t%0, %0, %1", xops);
7596 return "";
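/* Schematically, the sequence emitted by aarch64_output_probe_sve_stack_clash
   above is:

     loop_start:
       cmp   adjustment, residual_probe_guard
       b.lt  loop_end
       sub   base, base, residual_probe_guard
       str   xzr, [base, 0]
       sub   adjustment, adjustment, residual_probe_guard
       b     loop_start
     loop_end:
       sub   base, base, adjustment  */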
7599 /* Determine whether a frame chain needs to be generated. */
7600 static bool
7601 aarch64_needs_frame_chain (void)
7603 if (frame_pointer_needed)
7604 return true;
7606 /* A leaf function cannot have calls or write LR. */
7607 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7609 /* Don't use a frame chain in leaf functions if leaf frame pointers
7610 are disabled. */
7611 if (flag_omit_leaf_frame_pointer && is_leaf)
7612 return false;
7614 return aarch64_use_frame_pointer;
7617 /* Return true if the current function should save registers above
7618 the locals area, rather than below it. */
7620 static bool
7621 aarch64_save_regs_above_locals_p ()
7623 /* When using stack smash protection, make sure that the canary slot
7624 comes between the locals and the saved registers. Otherwise,
7625 it would be possible for a carefully sized smash attack to change
7626 the saved registers (particularly LR and FP) without reaching the
7627 canary. */
7628 return crtl->stack_protect_guard;
7631 /* Return true if the current function needs to record the incoming
7632 value of PSTATE.SM. */
7633 static bool
7634 aarch64_need_old_pstate_sm ()
7636 /* Exit early if the incoming value of PSTATE.SM is known at
7637 compile time. */
7638 if (aarch64_cfun_incoming_pstate_sm () != 0)
7639 return false;
7641 if (aarch64_cfun_enables_pstate_sm ())
7642 return true;
7644 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
7645 but the function needs to return with PSTATE.SM unchanged. */
7646 if (nonlocal_goto_handler_labels)
7647 return true;
7649 /* Likewise for exception handlers. */
7650 eh_landing_pad lp;
7651 for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
7652 if (lp && lp->post_landing_pad)
7653 return true;
7655 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
7656 streaming-compatible functions without SME being available, so PSTATE.SM
7657 should only be changed if it is currently set to one. */
7658 if (crtl->has_nonlocal_goto)
7659 return true;
7661 if (cfun->machine->call_switches_pstate_sm)
7662 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
7663 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
7664 if (!SIBLING_CALL_P (call))
7666 /* Return true if there is a call to a non-streaming-compatible
7667 function. */
7668 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
7669 if (aarch64_call_switches_pstate_sm (callee_isa_mode))
7670 return true;
7672 return false;
7675 /* Mark the registers that need to be saved by the callee and calculate
7676 the size of the callee-saved registers area and frame record (both FP
7677 and LR may be omitted). */
7678 static void
7679 aarch64_layout_frame (void)
7681 unsigned regno, last_fp_reg = INVALID_REGNUM;
7682 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7683 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7684 bool frame_related_fp_reg_p = false;
7685 aarch64_frame &frame = cfun->machine->frame;
7686 poly_int64 top_of_locals = -1;
7687 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
7689 vec_safe_truncate (frame.saved_gprs, 0);
7690 vec_safe_truncate (frame.saved_fprs, 0);
7691 vec_safe_truncate (frame.saved_prs, 0);
7693 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7695 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7696 the mid-end is doing. */
7697 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7699 #define SLOT_NOT_REQUIRED (-2)
7700 #define SLOT_REQUIRED (-1)
7702 frame.wb_push_candidate1 = INVALID_REGNUM;
7703 frame.wb_push_candidate2 = INVALID_REGNUM;
7704 frame.spare_pred_reg = INVALID_REGNUM;
7706 /* First mark all the registers that really need to be saved... */
7707 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7708 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
7709 frame.old_svcr_offset = SLOT_NOT_REQUIRED;
7711 /* ... that includes the eh data registers (if needed)... */
7712 if (crtl->calls_eh_return)
7713 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
7714 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
7716 /* ... and any callee saved register that dataflow says is live. */
7717 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7718 if (df_regs_ever_live_p (regno)
7719 && !fixed_regs[regno]
7720 && (regno == R30_REGNUM
7721 || !crtl->abi->clobbers_full_reg_p (regno)))
7722 frame.reg_offset[regno] = SLOT_REQUIRED;
7724 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7725 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7726 && !fixed_regs[regno]
7727 && !crtl->abi->clobbers_full_reg_p (regno))
7729 frame.reg_offset[regno] = SLOT_REQUIRED;
7730 last_fp_reg = regno;
7731 if (aarch64_emit_cfi_for_reg_p (regno))
7732 frame_related_fp_reg_p = true;
7735 /* Big-endian SVE frames need a spare predicate register in order
7736 to save Z8-Z15. Decide which register they should use. Prefer
7737 an unused argument register if possible, so that we don't force P4
7738 to be saved unnecessarily. */
7739 if (frame_related_fp_reg_p
7740 && crtl->abi->id () == ARM_PCS_SVE
7741 && BYTES_BIG_ENDIAN)
7743 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7744 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7745 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7746 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7747 break;
7748 gcc_assert (regno <= P7_REGNUM);
7749 frame.spare_pred_reg = regno;
7750 df_set_regs_ever_live (regno, true);
7753 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7754 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7755 && !fixed_regs[regno]
7756 && !crtl->abi->clobbers_full_reg_p (regno))
7757 frame.reg_offset[regno] = SLOT_REQUIRED;
7759 bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
7761 poly_int64 offset = crtl->outgoing_args_size;
7762 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
7763 if (regs_at_top_p)
7765 offset += get_frame_size ();
7766 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7767 top_of_locals = offset;
7769 frame.bytes_below_saved_regs = offset;
7770 frame.sve_save_and_probe = INVALID_REGNUM;
7772 /* Now assign stack slots for the registers. Start with the predicate
7773 registers, since predicate LDR and STR have a relatively small
7774 offset range. These saves happen below the hard frame pointer. */
7775 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7776 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7778 vec_safe_push (frame.saved_prs, regno);
7779 if (frame.sve_save_and_probe == INVALID_REGNUM)
7780 frame.sve_save_and_probe = regno;
7781 frame.reg_offset[regno] = offset;
7782 offset += BYTES_PER_SVE_PRED;
7785 poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
7786 if (maybe_ne (saved_prs_size, 0))
7788 /* If we have any vector registers to save above the predicate registers,
7789 the offset of the vector register save slots need to be a multiple
7790 of the vector size. This lets us use the immediate forms of LDR/STR
7791 (or LD1/ST1 for big-endian).
7793 A vector register is 8 times the size of a predicate register,
7794 and we need to save a maximum of 12 predicate registers, so the
7795 first vector register will be at either #1, MUL VL or #2, MUL VL.
7797 If we don't have any vector registers to save, and we know how
7798 big the predicate save area is, we can just round it up to the
7799 next 16-byte boundary. */
7800 if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
7801 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7802 else
7804 if (known_le (saved_prs_size, vector_save_size))
7805 offset = frame.bytes_below_saved_regs + vector_save_size;
7806 else if (known_le (saved_prs_size, vector_save_size * 2))
7807 offset = frame.bytes_below_saved_regs + vector_save_size * 2;
7808 else
7809 gcc_unreachable ();
7813 /* If we need to save any SVE vector registers, add them next. */
7814 if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7815 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7816 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7818 vec_safe_push (frame.saved_fprs, regno);
7819 if (frame.sve_save_and_probe == INVALID_REGNUM)
7820 frame.sve_save_and_probe = regno;
7821 frame.reg_offset[regno] = offset;
7822 offset += vector_save_size;
7825 /* OFFSET is now the offset of the hard frame pointer from the bottom
7826 of the callee save area. */
7827 auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
7828 bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
7829 gcc_assert (!saves_below_hard_fp_p
7830 || (frame.sve_save_and_probe != INVALID_REGNUM
7831 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
7832 frame.bytes_below_saved_regs)));
7834 frame.bytes_below_hard_fp = offset;
7835 frame.hard_fp_save_and_probe = INVALID_REGNUM;
7837 auto allocate_gpr_slot = [&](unsigned int regno)
7839 vec_safe_push (frame.saved_gprs, regno);
7840 frame.reg_offset[regno] = offset;
7841 offset += UNITS_PER_WORD;
7844 if (frame.emit_frame_chain)
7846 /* FP and LR are placed in the linkage record. */
7847 allocate_gpr_slot (R29_REGNUM);
7848 allocate_gpr_slot (R30_REGNUM);
7850 else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
7851 && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
7852 /* Put the LR save slot first, since it makes a good choice of probe
7853 for stack clash purposes. The idea is that the link register usually
7854 has to be saved before a call anyway, and so we lose little by
7855 stopping it from being individually shrink-wrapped. */
7856 allocate_gpr_slot (R30_REGNUM);
7858 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7859 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7860 allocate_gpr_slot (regno);
7862 if (aarch64_need_old_pstate_sm ())
7864 frame.old_svcr_offset = offset;
7865 offset += UNITS_PER_WORD;
7868 /* If the current function changes the SVE vector length, ensure that the
7869 old value of the DWARF VG register is saved and available in the CFI,
7870 so that outer frames with VL-sized offsets can be processed correctly. */
7871 if (cfun->machine->call_switches_pstate_sm
7872 || aarch64_cfun_enables_pstate_sm ())
7874 frame.reg_offset[VG_REGNUM] = offset;
7875 offset += UNITS_PER_WORD;
7878 poly_int64 max_int_offset = offset;
7879 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7880 bool has_align_gap = maybe_ne (offset, max_int_offset);
7882 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7883 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7885 vec_safe_push (frame.saved_fprs, regno);
7886 /* If there is an alignment gap between integer and fp callee-saves,
7887 allocate the last fp register to it if possible. */
7888 if (regno == last_fp_reg
7889 && has_align_gap
7890 && known_eq (vector_save_size, 8)
7891 && multiple_p (offset, 16))
7893 frame.reg_offset[regno] = max_int_offset;
7894 break;
7897 frame.reg_offset[regno] = offset;
7898 offset += vector_save_size;
7901 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7902 auto saved_regs_size = offset - frame.bytes_below_saved_regs;
7904 array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
7905 ? frame.saved_gprs
7906 : frame.saved_fprs);
7907 if (!push_regs.empty ()
7908 && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
7910 frame.hard_fp_save_and_probe = push_regs[0];
7911 frame.wb_push_candidate1 = push_regs[0];
7912 if (push_regs.size () > 1)
7913 frame.wb_push_candidate2 = push_regs[1];
7916 /* With stack-clash, a register must be saved in non-leaf functions.
7917 The saving of the bottommost register counts as an implicit probe,
7918 which allows us to maintain the invariant described in the comment
7919 at expand_prologue. */
7920 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
7922 if (!regs_at_top_p)
7924 offset += get_frame_size ();
7925 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7926 top_of_locals = offset;
7928 offset += frame.saved_varargs_size;
7929 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
7930 frame.frame_size = offset;
7932 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
7933 gcc_assert (known_ge (top_of_locals, 0));
7934 frame.bytes_above_locals = frame.frame_size - top_of_locals;
7936 frame.initial_adjust = 0;
7937 frame.final_adjust = 0;
7938 frame.callee_adjust = 0;
7939 frame.sve_callee_adjust = 0;
7941 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
7942 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
7944 /* Shadow call stack is only used for functions that push LR onto
7945 the stack and that do not specify the "no_sanitize" attribute
7946 with the argument "shadow-call-stack". */
7947 frame.is_scs_enabled
7948 = (!crtl->calls_eh_return
7949 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
7950 && known_ge (frame.reg_offset[LR_REGNUM], 0));
7952 /* When shadow call stack is enabled, the scs_pop in the epilogue will
7953 restore x30, and we don't need to pop x30 again in the traditional
7954 way. Pop candidates record the registers that need to be popped
7955 eventually. */
7956 if (frame.is_scs_enabled)
7958 if (frame.wb_pop_candidate2 == R30_REGNUM)
7959 frame.wb_pop_candidate2 = INVALID_REGNUM;
7960 else if (frame.wb_pop_candidate1 == R30_REGNUM)
7961 frame.wb_pop_candidate1 = INVALID_REGNUM;
7964 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
7965 256 to ensure that the offset meets the requirements of emit_move_insn.
7966 Similarly, if candidate1 is INVALID_REGNUM, we need to set
7967 max_push_offset to 0, because no registers are popped at this time,
7968 so callee_adjust cannot be adjusted. */
7969 HOST_WIDE_INT max_push_offset = 0;
7970 if (frame.wb_pop_candidate1 != INVALID_REGNUM)
7972 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
7973 max_push_offset = 512;
7974 else
7975 max_push_offset = 256;
7978 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
7979 HOST_WIDE_INT const_saved_regs_size;
7980 if (known_eq (saved_regs_size, 0))
7981 frame.initial_adjust = frame.frame_size;
7982 else if (frame.frame_size.is_constant (&const_size)
7983 && const_size < max_push_offset
7984 && known_eq (frame.bytes_above_hard_fp, const_size))
7986 /* Simple, small frame with no data below the saved registers.
7988 stp reg1, reg2, [sp, -frame_size]!
7989 stp reg3, reg4, [sp, 16] */
7990 frame.callee_adjust = const_size;
7992 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
7993 && saved_regs_size.is_constant (&const_saved_regs_size)
7994 && const_below_saved_regs + const_saved_regs_size < 512
7995 /* We could handle this case even with data below the saved
7996 registers, provided that that data left us with valid offsets
7997 for all predicate and vector save slots. It's such a rare
7998 case that it hardly seems worth the effort though. */
7999 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8000 && !(cfun->calls_alloca
8001 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8002 && const_above_fp < max_push_offset))
8004 /* Frame with small area below the saved registers:
8006 sub sp, sp, frame_size
8007 stp reg1, reg2, [sp, bytes_below_saved_regs]
8008 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8009 frame.initial_adjust = frame.frame_size;
8011 else if (saves_below_hard_fp_p
8012 && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8014 /* Frame in which all saves are SVE saves:
8016 sub sp, sp, frame_size - bytes_below_saved_regs
8017 save SVE registers relative to SP
8018 sub sp, sp, bytes_below_saved_regs */
8019 frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8020 frame.final_adjust = frame.bytes_below_saved_regs;
8022 else if (frame.wb_push_candidate1 != INVALID_REGNUM
8023 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8024 && const_above_fp < max_push_offset)
8026 /* Frame with large area below the saved registers, or with SVE saves,
8027 but with a small area above:
8029 stp reg1, reg2, [sp, -hard_fp_offset]!
8030 stp reg3, reg4, [sp, 16]
8031 [sub sp, sp, below_hard_fp_saved_regs_size]
8032 [save SVE registers relative to SP]
8033 sub sp, sp, bytes_below_saved_regs */
8034 frame.callee_adjust = const_above_fp;
8035 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8036 frame.final_adjust = frame.bytes_below_saved_regs;
8038 else
8040 /* General case:
8042 sub sp, sp, hard_fp_offset
8043 stp x29, x30, [sp, 0]
8044 add x29, sp, 0
8045 stp reg3, reg4, [sp, 16]
8046 [sub sp, sp, below_hard_fp_saved_regs_size]
8047 [save SVE registers relative to SP]
8048 sub sp, sp, bytes_below_saved_regs */
8049 frame.initial_adjust = frame.bytes_above_hard_fp;
8050 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8051 frame.final_adjust = frame.bytes_below_saved_regs;
8054 /* The frame is allocated in pieces, with each non-final piece
8055 including a register save at offset 0 that acts as a probe for
8056 the following piece. In addition, the save of the bottommost register
8057 acts as a probe for callees and allocas. Roll back any probes that
8058 aren't needed.
8060 A probe isn't needed if it is associated with the final allocation
8061 (including callees and allocas) that happens before the epilogue is
8062 executed. */
8063 if (crtl->is_leaf
8064 && !cfun->calls_alloca
8065 && known_eq (frame.final_adjust, 0))
8067 if (maybe_ne (frame.sve_callee_adjust, 0))
8068 frame.sve_save_and_probe = INVALID_REGNUM;
8069 else
8070 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8073 /* Make sure the individual adjustments add up to the full frame size. */
8074 gcc_assert (known_eq (frame.initial_adjust
8075 + frame.callee_adjust
8076 + frame.sve_callee_adjust
8077 + frame.final_adjust, frame.frame_size));
8079 if (frame.callee_adjust == 0)
8081 /* We've decided not to do a "real" push and pop. However,
8082 setting up the frame chain is treated as being essentially
8083 a multi-instruction push. */
8084 frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
8085 if (!frame.emit_frame_chain)
8086 frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
8089 frame.laid_out = true;
8092 /* Return true if the register REGNO is saved on entry to
8093 the current function. */
8095 static bool
8096 aarch64_register_saved_on_entry (int regno)
8098 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8101 /* Push the register number REGNO of mode MODE to the stack with write-back
8102 adjusting the stack by ADJUSTMENT. */
8104 static void
8105 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8106 HOST_WIDE_INT adjustment)
8108 rtx base_rtx = stack_pointer_rtx;
8109 rtx insn, reg, mem;
8111 reg = gen_rtx_REG (mode, regno);
8112 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8113 plus_constant (Pmode, base_rtx, -adjustment));
8114 mem = gen_frame_mem (mode, mem);
8116 insn = emit_move_insn (mem, reg);
8117 RTX_FRAME_RELATED_P (insn) = 1;
8120 /* Generate and return an instruction to store the pair of registers
8121 REG and REG2 of mode MODE to location BASE with write-back adjusting
8122 the stack location BASE by ADJUSTMENT. */
8124 static rtx
8125 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8126 HOST_WIDE_INT adjustment)
8128 rtx new_base = plus_constant (Pmode, base, -adjustment);
8129 rtx mem = gen_frame_mem (mode, new_base);
8130 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8132 return gen_rtx_PARALLEL (VOIDmode,
8133 gen_rtvec (3,
8134 gen_rtx_SET (base, new_base),
8135 gen_rtx_SET (mem, reg),
8136 gen_rtx_SET (mem2, reg2)));
8139 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8140 stack pointer by ADJUSTMENT. */
8142 static void
8143 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8145 rtx_insn *insn;
8146 machine_mode mode = aarch64_reg_save_mode (regno1);
8148 if (regno2 == INVALID_REGNUM)
8149 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8151 rtx reg1 = gen_rtx_REG (mode, regno1);
8152 rtx reg2 = gen_rtx_REG (mode, regno2);
8154 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8155 reg2, adjustment));
8156 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8157 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8158 RTX_FRAME_RELATED_P (insn) = 1;
8161 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
8162 adjusting it by ADJUSTMENT afterwards. */
8164 static rtx
8165 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8166 HOST_WIDE_INT adjustment)
8168 rtx mem = gen_frame_mem (mode, base);
8169 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8170 rtx new_base = plus_constant (Pmode, base, adjustment);
8172 return gen_rtx_PARALLEL (VOIDmode,
8173 gen_rtvec (3,
8174 gen_rtx_SET (base, new_base),
8175 gen_rtx_SET (reg, mem),
8176 gen_rtx_SET (reg2, mem2)));
8179 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8180 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8181 into CFI_OPS. */
8183 static void
8184 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8185 rtx *cfi_ops)
8187 machine_mode mode = aarch64_reg_save_mode (regno1);
8188 rtx reg1 = gen_rtx_REG (mode, regno1);
8190 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8192 if (regno2 == INVALID_REGNUM)
8194 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8195 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8196 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8198 else
8200 rtx reg2 = gen_rtx_REG (mode, regno2);
8201 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8202 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8203 reg2, adjustment));
8207 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8208 for a mem rtx representing the entire pair. */
8210 static machine_mode
8211 aarch64_pair_mode_for_mode (machine_mode mode)
8213 if (known_eq (GET_MODE_SIZE (mode), 4))
8214 return V2x4QImode;
8215 else if (known_eq (GET_MODE_SIZE (mode), 8))
8216 return V2x8QImode;
8217 else if (known_eq (GET_MODE_SIZE (mode), 16))
8218 return V2x16QImode;
8219 else
8220 gcc_unreachable ();
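/* For example, a DImode (8-byte) register operand maps to V2x8QImode, so
   the pair mem built by aarch64_pair_mem_from_base below covers the full
   16 bytes accessed by the LDP/STP.  */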
8223 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8224 operand, return an rtx like MEM which instead represents the entire pair. */
8226 static rtx
8227 aarch64_pair_mem_from_base (rtx mem)
8229 auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
8230 mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
8231 gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
8232 return mem;
8235 /* Generate and return a store pair instruction to store REG1 and REG2
8236 into memory starting at BASE_MEM. All three rtxes should have modes of the
8237 same size. */
8239 rtx
8240 aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
8242 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8244 return gen_rtx_SET (pair_mem,
8245 gen_rtx_UNSPEC (GET_MODE (pair_mem),
8246 gen_rtvec (2, reg1, reg2),
8247 UNSPEC_STP));
8250 /* Generate and return a load pair instruction to load a pair of
8251 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8252 UNKNOWN, all three rtxes should have modes of the same size.
8253 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8254 and REG{1,2} should be in DImode. */
8256 rtx
8257 aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
8259 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8261 const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
8262 if (any_extend_p)
8263 gcc_checking_assert (GET_MODE (base_mem) == SImode
8264 && GET_MODE (reg1) == DImode
8265 && GET_MODE (reg2) == DImode);
8266 else
8267 gcc_assert (code == UNKNOWN);
8269 rtx unspecs[2] = {
8270 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
8271 gen_rtvec (1, pair_mem),
8272 UNSPEC_LDP_FST),
8273 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
8274 gen_rtvec (1, copy_rtx (pair_mem)),
8275 UNSPEC_LDP_SND)
8278 if (any_extend_p)
8279 for (int i = 0; i < 2; i++)
8280 unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
8282 return gen_rtx_PARALLEL (VOIDmode,
8283 gen_rtvec (2,
8284 gen_rtx_SET (reg1, unspecs[0]),
8285 gen_rtx_SET (reg2, unspecs[1])));
8288 /* Return TRUE if return address signing should be enabled for the current
8289 function, otherwise return FALSE. */
8291 bool
8292 aarch64_return_address_signing_enabled (void)
8294 /* This function should only be called after frame laid out. */
8295 gcc_assert (cfun->machine->frame.laid_out);
8297 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8298 if its LR is pushed onto stack. */
8299 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8300 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8301 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8304 /* Only used by the arm backend. */
8305 void aarch_bti_arch_check (void)
8308 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8309 bool
8310 aarch_bti_enabled (void)
8312 return (aarch_enable_bti == 1);
8315 /* Check if INSN is a BTI J insn. */
8316 bool
8317 aarch_bti_j_insn_p (rtx_insn *insn)
8319 if (!insn || !INSN_P (insn))
8320 return false;
8322 rtx pat = PATTERN (insn);
8323 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8326 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8327 bool
8328 aarch_pac_insn_p (rtx x)
8330 if (!INSN_P (x))
8331 return false;
8333 subrtx_var_iterator::array_type array;
8334 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8336 rtx sub = *iter;
8337 if (sub && GET_CODE (sub) == UNSPEC)
8339 int unspec_val = XINT (sub, 1);
8340 switch (unspec_val)
8342 case UNSPEC_PACIASP:
8343 case UNSPEC_PACIBSP:
8344 return true;
8346 default:
8347 return false;
8349 iter.skip_subrtxes ();
8352 return false;
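/* Return an insn pattern for a BTI C landing pad.  */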
8355 rtx aarch_gen_bti_c (void)
8357 return gen_bti_c ();
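/* Return an insn pattern for a BTI J landing pad.  */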
8360 rtx aarch_gen_bti_j (void)
8362 return gen_bti_j ();
8365 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8366 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8367 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8369 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8370 or LD1D address
8372 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
8373 if the variable isn't already nonnull
8375 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8376 Handle this case using a temporary base register that is suitable for
8377 all offsets in that range. Use ANCHOR_REG as this base register if it
8378 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8380 static inline void
8381 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8382 rtx &anchor_reg, poly_int64 &offset,
8383 rtx &ptrue)
8385 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8387 /* This is the maximum valid offset of the anchor from the base.
8388 Lower values would be valid too. */
8389 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8390 if (!anchor_reg)
8392 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8393 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8394 gen_int_mode (anchor_offset, Pmode)));
8396 base_rtx = anchor_reg;
8397 offset -= anchor_offset;
8399 if (!ptrue)
8401 int pred_reg = cfun->machine->frame.spare_pred_reg;
8402 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8403 CONSTM1_RTX (VNx16BImode));
8404 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
8408 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8409 is saved at BASE + OFFSET. */
8411 static void
8412 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8413 rtx base, poly_int64 offset)
8415 rtx mem = gen_frame_mem (GET_MODE (reg),
8416 plus_constant (Pmode, base, offset));
8417 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8420 /* Emit code to save the callee-saved registers in REGS. Skip any
8421 write-back candidates if SKIP_WB is true, otherwise consider only
8422 write-back candidates.
8424 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8425 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8426 has been set up. */
8428 static void
8429 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8430 array_slice<unsigned int> regs, bool skip_wb,
8431 bool hard_fp_valid_p)
8433 aarch64_frame &frame = cfun->machine->frame;
8434 rtx_insn *insn;
8435 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8437 auto skip_save_p = [&](unsigned int regno)
8439 if (cfun->machine->reg_is_wrapped_separately[regno])
8440 return true;
8442 if (skip_wb == (regno == frame.wb_push_candidate1
8443 || regno == frame.wb_push_candidate2))
8444 return true;
8446 return false;
8449 for (unsigned int i = 0; i < regs.size (); ++i)
8451 unsigned int regno = regs[i];
8452 poly_int64 offset;
8453 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8455 if (skip_save_p (regno))
8456 continue;
8458 machine_mode mode = aarch64_reg_save_mode (regno);
8459 rtx reg = gen_rtx_REG (mode, regno);
8460 rtx move_src = reg;
8461 offset = frame.reg_offset[regno] - bytes_below_sp;
8462 if (regno == VG_REGNUM)
8464 move_src = gen_rtx_REG (DImode, IP0_REGNUM);
8465 emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
8467 rtx base_rtx = stack_pointer_rtx;
8468 poly_int64 sp_offset = offset;
8470 HOST_WIDE_INT const_offset;
8471 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8472 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8473 offset, ptrue);
8474 else if (GP_REGNUM_P (REGNO (reg))
8475 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8477 poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8478 if (hard_fp_valid_p)
8479 base_rtx = hard_frame_pointer_rtx;
8480 else
8482 if (!anchor_reg)
8484 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8485 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8486 gen_int_mode (fp_offset, Pmode)));
8488 base_rtx = anchor_reg;
8490 offset -= fp_offset;
8492 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8493 rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
8494 stack_pointer_rtx,
8495 sp_offset));
8496 rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
8497 bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);
8499 unsigned int regno2;
8500 if (!aarch64_sve_mode_p (mode)
8501 && reg == move_src
8502 && i + 1 < regs.size ()
8503 && (regno2 = regs[i + 1], !skip_save_p (regno2))
8504 && known_eq (GET_MODE_SIZE (mode),
8505 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8507 rtx reg2 = gen_rtx_REG (mode, regno2);
8509 offset += GET_MODE_SIZE (mode);
8510 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8512 rtx cfi_mem2
8513 = gen_frame_mem (mode,
8514 plus_constant (Pmode,
8515 stack_pointer_rtx,
8516 sp_offset + GET_MODE_SIZE (mode)));
8517 rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);
8519 /* The first part of a frame-related parallel insn is always
8520 assumed to be relevant to the frame calculations;
8521 subsequent parts are only frame-related if
8522 explicitly marked. */
8523 if (aarch64_emit_cfi_for_reg_p (regno2))
8524 RTX_FRAME_RELATED_P (cfi_set2) = 1;
8526 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8527 representation of stp cannot be understood directly by
8528 dwarf2cfi. */
8529 rtx par = gen_rtx_PARALLEL (VOIDmode,
8530 gen_rtvec (2, cfi_set, cfi_set2));
8531 add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);
8533 regno = regno2;
8534 ++i;
8536 else
8538 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8540 insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
8541 ptrue, move_src));
8542 need_cfi_note_p = true;
8544 else if (aarch64_sve_mode_p (mode))
8545 insn = emit_insn (gen_rtx_SET (mem, move_src));
8546 else
8547 insn = emit_move_insn (mem, move_src);
8549 if (frame_related_p && (need_cfi_note_p || move_src != reg))
8550 add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
8553 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8555 /* Emit a fake instruction to indicate that the VG save slot has
8556 been initialized. */
8557 if (regno == VG_REGNUM)
8558 emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
8562 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8563 and any other registers that are handled separately. Write the appropriate
8564 REG_CFA_RESTORE notes into CFI_OPS.
8566 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8567 of the static frame. */
8569 static void
8570 aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
8571 array_slice<unsigned int> regs, rtx *cfi_ops)
8573 aarch64_frame &frame = cfun->machine->frame;
8574 poly_int64 offset;
8575 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8577 auto skip_restore_p = [&](unsigned int regno)
8579 if (cfun->machine->reg_is_wrapped_separately[regno])
8580 return true;
8582 if (regno == frame.wb_pop_candidate1
8583 || regno == frame.wb_pop_candidate2)
8584 return true;
8586 /* The shadow call stack code restores LR separately. */
8587 if (frame.is_scs_enabled && regno == LR_REGNUM)
8588 return true;
8590 return false;
8593 for (unsigned int i = 0; i < regs.size (); ++i)
8595 unsigned int regno = regs[i];
8596 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8597 if (skip_restore_p (regno))
8598 continue;
8600 machine_mode mode = aarch64_reg_save_mode (regno);
8601 rtx reg = gen_rtx_REG (mode, regno);
8602 offset = frame.reg_offset[regno] - bytes_below_sp;
8603 rtx base_rtx = stack_pointer_rtx;
8604 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8605 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8606 offset, ptrue);
8607 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8609 unsigned int regno2;
8610 if (!aarch64_sve_mode_p (mode)
8611 && i + 1 < regs.size ()
8612 && (regno2 = regs[i + 1], !skip_restore_p (regno2))
8613 && known_eq (GET_MODE_SIZE (mode),
8614 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8616 rtx reg2 = gen_rtx_REG (mode, regno2);
8618 offset += GET_MODE_SIZE (mode);
8619 emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
8621 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8622 regno = regno2;
8623 ++i;
8625 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8626 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8627 else if (aarch64_sve_mode_p (mode))
8628 emit_insn (gen_rtx_SET (reg, mem));
8629 else
8630 emit_move_insn (reg, mem);
8631 if (frame_related_p)
8632 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8636 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8637 of MODE. */
8639 static inline bool
8640 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8642 HOST_WIDE_INT multiple;
8643 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8644 && IN_RANGE (multiple, -8, 7));
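/* For example, with an 8-byte MODE the test above accepts byte offsets
   -64, -56, ..., 48, 56, i.e. multiples of 8 with a multiplier in [-8, 7].  */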
8647 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8648 of MODE. */
8650 static inline bool
8651 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8653 HOST_WIDE_INT multiple;
8654 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8655 && IN_RANGE (multiple, -32, 31));
8658 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8659 of MODE. */
8661 static inline bool
8662 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8664 HOST_WIDE_INT multiple;
8665 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8666 && IN_RANGE (multiple, 0, 63));
8669 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8670 of MODE. */
8672 bool
8673 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8675 HOST_WIDE_INT multiple;
8676 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8677 && IN_RANGE (multiple, -64, 63));
8680 /* Return true if OFFSET is a signed 9-bit value. */
8682 bool
8683 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8684 poly_int64 offset)
8686 HOST_WIDE_INT const_offset;
8687 return (offset.is_constant (&const_offset)
8688 && IN_RANGE (const_offset, -256, 255));
8691 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8692 of MODE. */
8694 static inline bool
8695 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8697 HOST_WIDE_INT multiple;
8698 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8699 && IN_RANGE (multiple, -256, 255));
8702 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8703 of MODE. */
8705 static inline bool
8706 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8708 HOST_WIDE_INT multiple;
8709 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8710 && IN_RANGE (multiple, 0, 4095));
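/* For example, with a 4-byte MODE the test above accepts byte offsets
   0, 4, ..., 16380, i.e. the unsigned scaled immediate range of a
   32-bit LDR/STR.  */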
8713 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8715 static sbitmap
8716 aarch64_get_separate_components (void)
8718 aarch64_frame &frame = cfun->machine->frame;
8719 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8720 bitmap_clear (components);
8722 /* The registers we need saved to the frame. */
8723 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
8724 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8725 if (aarch64_register_saved_on_entry (regno))
8727 /* Disallow shrink wrapping for registers that will be clobbered
8728 by an SMSTART SM in the prologue. */
8729 if (enables_pstate_sm
8730 && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
8731 continue;
8733 /* Punt on saves and restores that use ST1D and LD1D. We could
8734 try to be smarter, but it would involve making sure that the
8735 spare predicate register itself is safe to use at the save
8736 and restore points. Also, when a frame pointer is being used,
8737 the slots are often out of reach of ST1D and LD1D anyway. */
8738 machine_mode mode = aarch64_reg_save_mode (regno);
8739 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8740 continue;
8742 poly_int64 offset = frame.reg_offset[regno];
8744 /* Get the offset relative to the register we'll use. */
8745 if (frame_pointer_needed)
8746 offset -= frame.bytes_below_hard_fp;
8748 /* Check that we can access the stack slot of the register with one
8749 direct load with no adjustments needed. */
8750 if (aarch64_sve_mode_p (mode)
8751 ? offset_9bit_signed_scaled_p (mode, offset)
8752 : offset_12bit_unsigned_scaled_p (mode, offset))
8753 bitmap_set_bit (components, regno);
8756 /* Don't mess with the hard frame pointer. */
8757 if (frame_pointer_needed)
8758 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8760 /* If the spare predicate register used by big-endian SVE code
8761 is call-preserved, it must be saved in the main prologue
8762 before any saves that use it. */
8763 if (frame.spare_pred_reg != INVALID_REGNUM)
8764 bitmap_clear_bit (components, frame.spare_pred_reg);
8766 unsigned reg1 = frame.wb_push_candidate1;
8767 unsigned reg2 = frame.wb_push_candidate2;
8768 /* If registers have been chosen to be stored/restored with
8769 writeback don't interfere with them to avoid having to output explicit
8770 stack adjustment instructions. */
8771 if (reg2 != INVALID_REGNUM)
8772 bitmap_clear_bit (components, reg2);
8773 if (reg1 != INVALID_REGNUM)
8774 bitmap_clear_bit (components, reg1);
8776 bitmap_clear_bit (components, LR_REGNUM);
8777 bitmap_clear_bit (components, SP_REGNUM);
8778 if (flag_stack_clash_protection)
8780 if (frame.sve_save_and_probe != INVALID_REGNUM)
8781 bitmap_clear_bit (components, frame.sve_save_and_probe);
8782 if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
8783 bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
8786 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8787 to find one. */
8788 bitmap_clear_bit (components, VG_REGNUM);
8790 return components;
8793 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8795 static sbitmap
8796 aarch64_components_for_bb (basic_block bb)
8798 bitmap in = DF_LIVE_IN (bb);
8799 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8800 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8802 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8803 bitmap_clear (components);
8805 /* Clobbered registers don't generate values in any meaningful sense,
8806 since nothing after the clobber can rely on their value. And we can't
8807 say that partially-clobbered registers are unconditionally killed,
8808 because whether they're killed or not depends on the mode of the
8809 value they're holding. Thus partially call-clobbered registers
8810 appear in neither the kill set nor the gen set.
8812 Check manually for any calls that clobber more of a register than the
8813 current function can. */
8814 function_abi_aggregator callee_abis;
8815 rtx_insn *insn;
8816 FOR_BB_INSNS (bb, insn)
8817 if (CALL_P (insn))
8818 callee_abis.note_callee_abi (insn_callee_abi (insn));
8819 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8821 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8822 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8823 if (!fixed_regs[regno]
8824 && !crtl->abi->clobbers_full_reg_p (regno)
8825 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8826 || bitmap_bit_p (in, regno)
8827 || bitmap_bit_p (gen, regno)
8828 || bitmap_bit_p (kill, regno)))
8830 bitmap_set_bit (components, regno);
8832 /* If there is a callee-save at an adjacent offset, add it too
8833 to increase the use of LDP/STP. */
8834 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8835 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
8837 if (regno2 <= LAST_SAVED_REGNUM)
8839 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8840 if (regno < regno2
8841 ? known_eq (offset + 8, offset2)
8842 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
8843 bitmap_set_bit (components, regno2);
8847 return components;
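
/* Standalone sketch (not used by the compiler) of the pairing rule above:
   a slot whose offset is a multiple of 16 pairs with the next register
   8 bytes above it, otherwise it can only complete a pair that starts
   8 bytes below it.  Plain integers stand in for the poly_int64 offsets
   used in the real code.  */

static inline bool
example_ldp_stp_pair_p (long offset, long offset2, bool second_is_higher)
{
  if (second_is_higher)
    /* OFFSET starts the pair; OFFSET2 must be its upper half.  */
    return offset % 16 == 0 && offset2 == offset + 8;
  /* OFFSET2 starts the pair; OFFSET must be its upper half.  */
  return offset2 % 16 == 0 && offset == offset2 + 8;
}
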
8850 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8851 Nothing to do for aarch64. */
8853 static void
8854 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8858 /* Return the next set bit in BMP from START onwards. Return the total number
8859 of bits in BMP if no set bit is found at or after START. */
8861 static unsigned int
8862 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8864 unsigned int nbits = SBITMAP_SIZE (bmp);
8865 if (start == nbits)
8866 return start;
8868 gcc_assert (start < nbits);
8869 for (unsigned int i = start; i < nbits; i++)
8870 if (bitmap_bit_p (bmp, i))
8871 return i;
8873 return nbits;
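
/* Standalone sketch (not part of the compiler) of how a "next set bit"
   helper of this shape is typically consumed; compare the loop in
   aarch64_process_components below, where SBITMAP_SIZE acts as the
   end-of-iteration sentinel.  A plain bool array stands in for the
   sbitmap.  */

static unsigned int
example_count_set_bits (const bool *bits, unsigned int nbits)
{
  unsigned int count = 0;
  unsigned int i = 0;
  while (i < nbits)
    {
      /* Advance to the next set bit, or to NBITS if there is none.  */
      while (i < nbits && !bits[i])
	i++;
      if (i == nbits)
	break;
      count++;
      i++;
    }
  return count;
}
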
8876 /* Do the work for aarch64_emit_prologue_components and
8877 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8878 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
8879 for these components or the epilogue sequence. That is, it determines
8880 whether we should emit stores or loads and what kind of CFA notes to attach
8881 to the insns. Otherwise the logic for the two sequences is very
8882 similar. */
8884 static void
8885 aarch64_process_components (sbitmap components, bool prologue_p)
8887 aarch64_frame &frame = cfun->machine->frame;
8888 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8889 ? HARD_FRAME_POINTER_REGNUM
8890 : STACK_POINTER_REGNUM);
8892 unsigned last_regno = SBITMAP_SIZE (components);
8893 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
8894 rtx_insn *insn = NULL;
8896 while (regno != last_regno)
8898 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8899 machine_mode mode = aarch64_reg_save_mode (regno);
8901 rtx reg = gen_rtx_REG (mode, regno);
8902 poly_int64 offset = frame.reg_offset[regno];
8903 if (frame_pointer_needed)
8904 offset -= frame.bytes_below_hard_fp;
8906 rtx addr = plus_constant (Pmode, ptr_reg, offset);
8907 rtx mem = gen_frame_mem (mode, addr);
8909 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
8910 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
8911 /* No more registers to handle after REGNO.
8912 Emit a single save/restore and exit. */
8913 if (regno2 == last_regno)
8915 insn = emit_insn (set);
8916 if (frame_related_p)
8918 RTX_FRAME_RELATED_P (insn) = 1;
8919 if (prologue_p)
8920 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8921 else
8922 add_reg_note (insn, REG_CFA_RESTORE, reg);
8924 break;
8927 poly_int64 offset2 = frame.reg_offset[regno2];
8928 /* The next register is not of the same class or its offset is not
8929 mergeable with the current one into a pair. */
8930 if (aarch64_sve_mode_p (mode)
8931 || !satisfies_constraint_Ump (mem)
8932 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
8933 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
8934 || maybe_ne ((offset2 - frame.reg_offset[regno]),
8935 GET_MODE_SIZE (mode)))
8937 insn = emit_insn (set);
8938 if (frame_related_p)
8940 RTX_FRAME_RELATED_P (insn) = 1;
8941 if (prologue_p)
8942 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8943 else
8944 add_reg_note (insn, REG_CFA_RESTORE, reg);
8947 regno = regno2;
8948 continue;
8951 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
8953 /* REGNO2 can be saved/restored in a pair with REGNO. */
8954 rtx reg2 = gen_rtx_REG (mode, regno2);
8955 if (frame_pointer_needed)
8956 offset2 -= frame.bytes_below_hard_fp;
8957 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
8958 rtx mem2 = gen_frame_mem (mode, addr2);
8959 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
8960 : gen_rtx_SET (reg2, mem2);
8962 if (prologue_p)
8963 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8964 else
8965 insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
8967 if (frame_related_p || frame_related2_p)
8969 RTX_FRAME_RELATED_P (insn) = 1;
8970 if (prologue_p)
8972 if (frame_related_p)
8973 add_reg_note (insn, REG_CFA_OFFSET, set);
8974 if (frame_related2_p)
8975 add_reg_note (insn, REG_CFA_OFFSET, set2);
8977 else
8979 if (frame_related_p)
8980 add_reg_note (insn, REG_CFA_RESTORE, reg);
8981 if (frame_related2_p)
8982 add_reg_note (insn, REG_CFA_RESTORE, reg2);
8986 regno = aarch64_get_next_set_bit (components, regno2 + 1);
8990 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
8992 static void
8993 aarch64_emit_prologue_components (sbitmap components)
8995 aarch64_process_components (components, true);
8998 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9000 static void
9001 aarch64_emit_epilogue_components (sbitmap components)
9003 aarch64_process_components (components, false);
9006 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9008 static void
9009 aarch64_set_handled_components (sbitmap components)
9011 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9012 if (bitmap_bit_p (components, regno))
9013 cfun->machine->reg_is_wrapped_separately[regno] = true;
9016 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9017 determine the probe offset for alloca. */
9019 static HOST_WIDE_INT
9020 aarch64_stack_clash_protection_alloca_probe_range (void)
9022 return STACK_CLASH_CALLER_GUARD;
9025 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9026 subsequent memory accesses and that requires the stack pointer and REG
9027 to have their current values. REG can be stack_pointer_rtx if no
9028 other register's value needs to be fixed. */
9030 static void
9031 aarch64_emit_stack_tie (rtx reg)
9033 emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
9036 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9037 registers. If POLY_SIZE is not large enough to require a probe this function
9038 will only adjust the stack. When allocating the stack space
9039 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9040 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9041 the saved registers. If we are then we ensure that any allocation
9042 larger than the ABI defined buffer needs a probe so that the
9043 invariant of having a 1KB buffer is maintained.
9045 We emit barriers after each stack adjustment to prevent optimizations from
9046 breaking the invariant that we never drop the stack more than a page. This
9047 invariant is needed to make it easier to correctly handle asynchronous
9048 events, e.g. if we were to allow the stack to be dropped by more than a page
9049 and then set up multiple probes, and we took a signal somewhere in between,
9050 then the signal handler would not know the state of the stack and could make
9051 no assumptions about which pages have been probed.
9053 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
9054 is measured relative to the SME vector length instead of the current
9055 prevailing vector length. It is 0 otherwise. */
9057 static void
9058 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9059 poly_int64 poly_size,
9060 aarch64_feature_flags force_isa_mode,
9061 bool frame_related_p,
9062 bool final_adjustment_p)
9064 aarch64_frame &frame = cfun->machine->frame;
9065 HOST_WIDE_INT guard_size
9066 = 1 << param_stack_clash_protection_guard_size;
9067 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9068 HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9069 gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9070 HOST_WIDE_INT min_probe_threshold
9071 = (final_adjustment_p
9072 ? guard_used_by_caller + byte_sp_alignment
9073 : guard_size - guard_used_by_caller);
9074 poly_int64 frame_size = frame.frame_size;
9076 /* We should always have a positive probe threshold. */
9077 gcc_assert (min_probe_threshold > 0);
9079 if (flag_stack_clash_protection && !final_adjustment_p)
9081 poly_int64 initial_adjust = frame.initial_adjust;
9082 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9083 poly_int64 final_adjust = frame.final_adjust;
9085 if (known_eq (frame_size, 0))
9087 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9089 else if (known_lt (initial_adjust + sve_callee_adjust,
9090 guard_size - guard_used_by_caller)
9091 && known_lt (final_adjust, guard_used_by_caller))
9093 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9097 /* If SIZE is not large enough to require probing, just adjust the stack and
9098 exit. */
9099 if (known_lt (poly_size, min_probe_threshold)
9100 || !flag_stack_clash_protection)
9102 aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
9103 frame_related_p);
9104 return;
9107 HOST_WIDE_INT size;
9108 /* Handle the SVE non-constant case first. */
9109 if (!poly_size.is_constant (&size))
9111 if (dump_file)
9113 fprintf (dump_file, "Stack clash SVE prologue: ");
9114 print_dec (poly_size, dump_file);
9115 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9118 /* First calculate the amount of bytes we're actually spilling. */
9119 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9120 poly_size, temp1, temp2, force_isa_mode,
9121 false, true);
9123 rtx_insn *insn = get_last_insn ();
9125 if (frame_related_p)
9127 /* This is done to provide unwinding information for the stack
9128 adjustments we're about to do. However, to prevent the optimizers
9129 from removing the R11 move and leaving the CFA note (which would be
9130 very wrong), we tie the old and new stack pointers together.
9131 The tie will expand to nothing but the optimizers will not touch
9132 the instruction. */
9133 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9134 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9135 aarch64_emit_stack_tie (stack_ptr_copy);
9137 /* We want the CFA independent of the stack pointer for the
9138 duration of the loop. */
9139 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9140 RTX_FRAME_RELATED_P (insn) = 1;
9143 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9144 rtx guard_const = gen_int_mode (guard_size, Pmode);
9146 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9147 stack_pointer_rtx, temp1,
9148 probe_const, guard_const));
9150 /* Now reset the CFA register if needed. */
9151 if (frame_related_p)
9153 add_reg_note (insn, REG_CFA_DEF_CFA,
9154 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9155 gen_int_mode (poly_size, Pmode)));
9156 RTX_FRAME_RELATED_P (insn) = 1;
9159 return;
9162 if (dump_file)
9163 fprintf (dump_file,
9164 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9165 " bytes, probing will be required.\n", size);
9167 /* Round size down to a multiple of guard_size, and calculate the
9168 residual as the difference between the original size and the rounded
9169 size. */
9170 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9171 HOST_WIDE_INT residual = size - rounded_size;
9173 /* We can handle a small number of allocations/probes inline. Otherwise
9174 punt to a loop. */
9175 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9177 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9179 aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
9180 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9181 guard_used_by_caller));
9182 emit_insn (gen_blockage ());
9184 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9186 else
9188 /* Compute the ending address. */
9189 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9190 temp1, NULL, force_isa_mode, false, true);
9191 rtx_insn *insn = get_last_insn ();
9193 /* For the initial allocation, we don't have a frame pointer
9194 set up, so we always need CFI notes. If we're doing the
9195 final allocation, then we may have a frame pointer, in which
9196 case it is the CFA, otherwise we need CFI notes.
9198 We can determine which allocation we are doing by looking at
9199 the value of FRAME_RELATED_P since the final allocations are not
9200 frame related. */
9201 if (frame_related_p)
9203 /* We want the CFA independent of the stack pointer for the
9204 duration of the loop. */
9205 add_reg_note (insn, REG_CFA_DEF_CFA,
9206 plus_constant (Pmode, temp1, rounded_size));
9207 RTX_FRAME_RELATED_P (insn) = 1;
9210 /* This allocates and probes the stack. Note that this re-uses some of
9211 the existing Ada stack protection code. However we are guaranteed not
9212 to enter the non-loop or residual branches of that code.
9214 The non-loop part won't be entered because if our allocation amount
9215 doesn't require a loop, the case above would handle it.
9217 The residual amount won't be entered because TEMP1 is a multiple of
9218 the allocation size. The residual will always be 0. As such, the only
9219 part we are actually using from that code is the loop setup. The
9220 actual probing is done in aarch64_output_probe_stack_range. */
9221 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9222 stack_pointer_rtx, temp1));
9224 /* Now reset the CFA register if needed. */
9225 if (frame_related_p)
9227 add_reg_note (insn, REG_CFA_DEF_CFA,
9228 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9229 RTX_FRAME_RELATED_P (insn) = 1;
9232 emit_insn (gen_blockage ());
9233 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9236 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9237 be probed. This maintains the requirement that each page is probed at
9238 least once. For initial probing we probe only if the allocation is
9239 more than GUARD_SIZE - buffer, and below the saved registers we probe
9240 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9241 GUARD_SIZE. This ensures that any allocation large enough to trigger a
9242 probe here gets at least one probe, and any allocation too small for this
9243 code to emit anything will have had its page probed already by the save of
9244 FP/LR, either by this function or by any callees. If
9245 we don't have any callees then we won't have more stack adjustments and so
9246 are still safe. */
9247 if (residual)
9249 gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9251 /* If we're doing final adjustments, and we've done any full page
9252 allocations then any residual needs to be probed. */
9253 if (final_adjustment_p && rounded_size != 0)
9254 min_probe_threshold = 0;
9256 aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
9257 if (residual >= min_probe_threshold)
9259 if (dump_file)
9260 fprintf (dump_file,
9261 "Stack clash AArch64 prologue residuals: "
9262 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9263 "\n", residual);
9265 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9266 guard_used_by_caller));
9267 emit_insn (gen_blockage ());
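
/* Standalone sketch (not used by the compiler) of the constant-size
   bookkeeping above: the allocation splits into a rounded part that is
   allocated and probed one guard-sized page at a time (inline for a
   small number of pages, otherwise in a loop) and a residual that may
   need one final probe.  GUARD_SIZE is assumed to be a power of two, as
   the surrounding code requires.  */

struct example_probe_plan
{
  long rounded_size;	/* Allocated and probed GUARD_SIZE bytes at a time.  */
  long residual;	/* Allocated afterwards; probed only if large enough.  */
  long full_pages;	/* Number of guard-sized steps.  */
};

static inline struct example_probe_plan
example_plan_probes (long size, long guard_size)
{
  struct example_probe_plan plan;
  plan.rounded_size = size & -guard_size;	/* ROUND_DOWN (size, guard_size).  */
  plan.residual = size - plan.rounded_size;
  plan.full_pages = plan.rounded_size / guard_size;
  return plan;
}
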
9272 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9274 void
9275 aarch64_extra_live_on_entry (bitmap regs)
9277 if (TARGET_ZA)
9279 bitmap_set_bit (regs, LOWERING_REGNUM);
9280 bitmap_set_bit (regs, SME_STATE_REGNUM);
9281 bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
9282 bitmap_set_bit (regs, ZA_FREE_REGNUM);
9283 bitmap_set_bit (regs, ZA_SAVED_REGNUM);
9285 /* The only time ZA can't have live contents on entry is when
9286 the function explicitly treats it as a pure output. */
9287 auto za_flags = aarch64_cfun_shared_flags ("za");
9288 if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9289 bitmap_set_bit (regs, ZA_REGNUM);
9291 /* Since ZT0 is call-clobbered, it is only live on input if
9292 it is explicitly shared, and is not a pure output. */
9293 auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
9294 if (zt0_flags != 0
9295 && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9296 bitmap_set_bit (regs, ZT0_REGNUM);
9300 /* Return 1 if the register is used by the epilogue. We need to say the
9301 return register is used, but only after epilogue generation is complete.
9302 Note that in the case of sibcalls, the values "used by the epilogue" are
9303 considered live at the start of the called function. */
9306 aarch64_epilogue_uses (int regno)
9308 if (epilogue_completed)
9310 if (regno == LR_REGNUM)
9311 return 1;
9313 if (regno == LOWERING_REGNUM && TARGET_ZA)
9314 return 1;
9315 if (regno == SME_STATE_REGNUM && TARGET_ZA)
9316 return 1;
9317 if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
9318 return 1;
9319 /* If the function shares SME state with its caller, ensure that that
9320 data is not in the lazy save buffer on exit. */
9321 if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
9322 return 1;
9323 if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
9324 return 1;
9325 if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
9326 return 1;
9327 return 0;
9330 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9332 static bool
9333 aarch64_use_late_prologue_epilogue ()
9335 return aarch64_cfun_enables_pstate_sm ();
9338 /* The current function's frame has a save slot for the incoming state
9339 of SVCR. Return a legitimate memory for the slot, based on the hard
9340 frame pointer. */
9342 static rtx
9343 aarch64_old_svcr_mem ()
9345 gcc_assert (frame_pointer_needed
9346 && known_ge (cfun->machine->frame.old_svcr_offset, 0));
9347 rtx base = hard_frame_pointer_rtx;
9348 poly_int64 offset = (0
9349 /* hard fp -> bottom of frame. */
9350 - cfun->machine->frame.bytes_below_hard_fp
9351 /* bottom of frame -> save slot. */
9352 + cfun->machine->frame.old_svcr_offset);
9353 return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
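
/* Standalone sketch (not used by the compiler) of the address arithmetic
   above: the slot lies OLD_SVCR_OFFSET bytes above the bottom of the
   frame, while the hard frame pointer lies BYTES_BELOW_HARD_FP bytes
   above it, so the slot's offset from the hard frame pointer is the
   difference of the two.  */

static inline long
example_old_svcr_offset_from_hard_fp (long bytes_below_hard_fp,
				      long old_svcr_offset)
{
  return old_svcr_offset - bytes_below_hard_fp;
}
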
9356 /* The current function's frame has a save slot for the incoming state
9357 of SVCR. Load the slot into register REGNO and return the register. */
9359 static rtx
9360 aarch64_read_old_svcr (unsigned int regno)
9362 rtx svcr = gen_rtx_REG (DImode, regno);
9363 emit_move_insn (svcr, aarch64_old_svcr_mem ());
9364 return svcr;
9367 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9368 load the incoming value of SVCR from its save slot into temporary
9369 register REGNO. */
9371 static rtx_insn *
9372 aarch64_guard_switch_pstate_sm (unsigned int regno,
9373 aarch64_feature_flags local_mode)
9375 rtx old_svcr = aarch64_read_old_svcr (regno);
9376 return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
9379 /* AArch64 stack frames generated by this compiler look like:
9381 +-------------------------------+
9383 | incoming stack arguments |
9385 +-------------------------------+
9386 | | <-- incoming stack pointer (aligned)
9387 | callee-allocated save area |
9388 | for register varargs |
9390 +-------------------------------+
9391 | local variables (1) | <-- frame_pointer_rtx
9393 +-------------------------------+
9394 | padding (1) |
9395 +-------------------------------+
9396 | callee-saved registers |
9397 +-------------------------------+
9398 | LR' |
9399 +-------------------------------+
9400 | FP' |
9401 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9402 | SVE vector registers |
9403 +-------------------------------+
9404 | SVE predicate registers |
9405 +-------------------------------+
9406 | local variables (2) |
9407 +-------------------------------+
9408 | padding (2) |
9409 +-------------------------------+
9410 | dynamic allocation |
9411 +-------------------------------+
9412 | padding |
9413 +-------------------------------+
9414 | outgoing stack arguments | <-- arg_pointer
9416 +-------------------------------+
9417 | | <-- stack_pointer_rtx (aligned)
9419 The regions marked (1) and (2) are mutually exclusive. (2) is used
9420 when aarch64_save_regs_above_locals_p is true.
9422 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9423 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9424 unchanged.
9426 By default for stack-clash we assume the guard is at least 64KB, but this
9427 value is configurable to either 4KB or 64KB. We also force the guard size to
9428 be the same as the probing interval and both values are kept in sync.
9430 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9431 on the guard size) of stack space without probing.
9433 When probing is needed, we emit a probe at the start of the prologue
9434 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9436 We can also use register saves as probes. These are stored in
9437 sve_save_and_probe and hard_fp_save_and_probe.
9439 For outgoing arguments we probe if the size is larger than 1KB, such that
9440 the ABI specified buffer is maintained for the next callee.
9442 The following registers are reserved during frame layout and should not be
9443 used for any other purpose:
9445 - r11: Used by stack clash protection when SVE is enabled, and also
9446 as an anchor register when saving and restoring registers
9447 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9448 - r14 and r15: Used for speculation tracking.
9449 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9450 - r30(LR), r29(FP): Used by standard frame layout.
9452 These registers must be avoided in frame layout related code unless the
9453 explicit intention is to interact with one of the features listed above. */
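
/* A worked instance (not part of the original commentary) of the probing
   budget described above, assuming the quoted guard sizes and the 1KB
   STACK_CLASH_CALLER_GUARD: with a 64KB guard the callee may allocate
   64KB - 1KB = 63KB without probing, and with a 4KB guard it may
   allocate 3KB, matching the figures given above.  */

static inline long
example_unprobed_allocation_limit (long guard_size, long caller_guard)
{
  /* Anything up to this amount is covered by the pages the caller has
     already probed; a larger allocation needs probes of its own.  */
  return guard_size - caller_guard;
}
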
9455 /* Generate the prologue instructions for entry into a function.
9456 Establish the stack frame by decreasing the stack pointer with a
9457 properly calculated size and, if necessary, create a frame record
9458 filled with the values of LR and previous frame pointer. The
9459 current FP is also set up if it is in use. */
9461 void
9462 aarch64_expand_prologue (void)
9464 aarch64_frame &frame = cfun->machine->frame;
9465 poly_int64 frame_size = frame.frame_size;
9466 poly_int64 initial_adjust = frame.initial_adjust;
9467 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9468 poly_int64 final_adjust = frame.final_adjust;
9469 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9470 unsigned reg1 = frame.wb_push_candidate1;
9471 unsigned reg2 = frame.wb_push_candidate2;
9472 bool emit_frame_chain = frame.emit_frame_chain;
9473 rtx_insn *insn;
9474 aarch64_feature_flags force_isa_mode = 0;
9475 if (aarch64_cfun_enables_pstate_sm ())
9476 force_isa_mode = AARCH64_FL_SM_ON;
9478 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
9480 /* Fold the SVE allocation into the initial allocation.
9481 We don't do this in aarch64_layout_arg to avoid pessimizing
9482 the epilogue code. */
9483 initial_adjust += sve_callee_adjust;
9484 sve_callee_adjust = 0;
9487 /* Sign return address for functions. */
9488 if (aarch64_return_address_signing_enabled ())
9490 switch (aarch_ra_sign_key)
9492 case AARCH_KEY_A:
9493 insn = emit_insn (gen_paciasp ());
9494 break;
9495 case AARCH_KEY_B:
9496 insn = emit_insn (gen_pacibsp ());
9497 break;
9498 default:
9499 gcc_unreachable ();
9501 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9502 RTX_FRAME_RELATED_P (insn) = 1;
9505 /* Push return address to shadow call stack. */
9506 if (frame.is_scs_enabled)
9507 emit_insn (gen_scs_push ());
9509 if (flag_stack_usage_info)
9510 current_function_static_stack_size = constant_lower_bound (frame_size);
9512 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9514 if (crtl->is_leaf && !cfun->calls_alloca)
9516 if (maybe_gt (frame_size, PROBE_INTERVAL)
9517 && maybe_gt (frame_size, get_stack_check_protect ()))
9518 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9519 (frame_size
9520 - get_stack_check_protect ()));
9522 else if (maybe_gt (frame_size, 0))
9523 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9526 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9527 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9529 /* In theory we should never have both an initial adjustment
9530 and a callee save adjustment. Verify that is the case since the
9531 code below does not handle it for -fstack-clash-protection. */
9532 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9534 /* Will only probe if the initial adjustment is larger than the guard
9535 less the amount of the guard reserved for use by the caller's
9536 outgoing args. */
9537 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9538 force_isa_mode, true, false);
9540 if (callee_adjust != 0)
9541 aarch64_push_regs (reg1, reg2, callee_adjust);
9543 /* The offset of the current SP from the bottom of the static frame. */
9544 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9546 if (emit_frame_chain)
9548 /* The offset of the frame chain record (if any) from the current SP. */
9549 poly_int64 chain_offset = (initial_adjust + callee_adjust
9550 - frame.bytes_above_hard_fp);
9551 gcc_assert (known_ge (chain_offset, 0));
9553 gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
9554 if (callee_adjust == 0)
9555 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
9556 false, false);
9557 else
9558 gcc_assert (known_eq (chain_offset, 0));
9559 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9560 stack_pointer_rtx, chain_offset,
9561 tmp1_rtx, tmp0_rtx, force_isa_mode,
9562 frame_pointer_needed);
9563 if (frame_pointer_needed && !frame_size.is_constant ())
9565 /* Variable-sized frames need to describe the save slot
9566 address using DW_CFA_expression rather than DW_CFA_offset.
9567 This means that, without taking further action, the
9568 locations of the registers that we've already saved would
9569 remain based on the stack pointer even after we redefine
9570 the CFA based on the frame pointer. We therefore need new
9571 DW_CFA_expressions to re-express the save slots with addresses
9572 based on the frame pointer. */
9573 rtx_insn *insn = get_last_insn ();
9574 gcc_assert (RTX_FRAME_RELATED_P (insn));
9576 /* Add an explicit CFA definition if this was previously
9577 implicit. */
9578 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9580 rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9581 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9582 gen_rtx_SET (hard_frame_pointer_rtx, src));
9585 /* Change the save slot expressions for the registers that
9586 we've already saved. */
9587 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9588 hard_frame_pointer_rtx, UNITS_PER_WORD);
9589 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9590 hard_frame_pointer_rtx, 0);
9592 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9595 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
9596 emit_frame_chain);
9597 if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
9599 unsigned int saved_regs[] = { VG_REGNUM };
9600 aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
9601 emit_frame_chain);
9603 if (maybe_ne (sve_callee_adjust, 0))
9605 gcc_assert (!flag_stack_clash_protection
9606 || known_eq (initial_adjust, 0));
9607 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9608 sve_callee_adjust,
9609 force_isa_mode,
9610 !frame_pointer_needed, false);
9611 bytes_below_sp -= sve_callee_adjust;
9613 aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
9614 emit_frame_chain);
9615 aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
9616 emit_frame_chain);
9618 /* We may need to probe the final adjustment if it is larger than the guard
9619 that is assumed by the callee. */
9620 gcc_assert (known_eq (bytes_below_sp, final_adjust));
9621 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9622 force_isa_mode,
9623 !frame_pointer_needed, true);
9624 if (emit_frame_chain && maybe_ne (final_adjust, 0))
9625 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9627 /* Save the incoming value of PSTATE.SM, if required. Code further
9628 down does this for locally-streaming functions. */
9629 if (known_ge (frame.old_svcr_offset, 0)
9630 && !aarch64_cfun_enables_pstate_sm ())
9632 rtx mem = aarch64_old_svcr_mem ();
9633 MEM_VOLATILE_P (mem) = 1;
9634 if (TARGET_SME)
9636 rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
9637 emit_insn (gen_aarch64_read_svcr (reg));
9638 emit_move_insn (mem, reg);
9640 else
9642 rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
9643 auto &args = crtl->args.info;
9644 if (args.aapcs_ncrn > 0)
9646 old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
9647 emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
9649 if (args.aapcs_ncrn > 1)
9651 old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
9652 emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
9654 emit_insn (gen_aarch64_get_sme_state ());
9655 emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
9656 if (old_r0)
9657 emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
9658 if (old_r1)
9659 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
9663 /* Enable PSTATE.SM, if required. */
9664 if (aarch64_cfun_enables_pstate_sm ())
9666 rtx_insn *guard_label = nullptr;
9667 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9669 /* The current function is streaming-compatible. Save the
9670 original state of PSTATE.SM. */
9671 rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
9672 emit_insn (gen_aarch64_read_svcr (svcr));
9673 emit_move_insn (aarch64_old_svcr_mem (), svcr);
9674 guard_label = aarch64_guard_switch_pstate_sm (svcr,
9675 aarch64_isa_flags);
9677 aarch64_sme_mode_switch_regs args_switch;
9678 auto &args = crtl->args.info;
9679 for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
9681 rtx x = args.sme_mode_switch_args[i];
9682 args_switch.add_reg (GET_MODE (x), REGNO (x));
9684 args_switch.emit_prologue ();
9685 emit_insn (gen_aarch64_smstart_sm ());
9686 args_switch.emit_epilogue ();
9687 if (guard_label)
9688 emit_label (guard_label);
9692 /* Return TRUE if we can use a simple_return insn.
9694 This function checks whether the callee saved stack is empty, which
9695 means no restore actions are needed. The pro_and_epilogue pass will use
9696 this to check whether shrink-wrapping opt is feasible. */
9698 bool
9699 aarch64_use_return_insn_p (void)
9701 if (!reload_completed)
9702 return false;
9704 if (crtl->profile)
9705 return false;
9707 return known_eq (cfun->machine->frame.frame_size, 0);
9710 /* Generate the epilogue instructions for returning from a function.
9711 This is almost exactly the reverse of the prologue sequence, except
9712 that we need to insert barriers to avoid scheduling loads that read
9713 from a deallocated stack, and we optimize the unwind records by
9714 emitting them all together if possible. */
9715 void
9716 aarch64_expand_epilogue (rtx_call_insn *sibcall)
9718 aarch64_frame &frame = cfun->machine->frame;
9719 poly_int64 initial_adjust = frame.initial_adjust;
9720 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9721 poly_int64 final_adjust = frame.final_adjust;
9722 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9723 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
9724 unsigned reg1 = frame.wb_pop_candidate1;
9725 unsigned reg2 = frame.wb_pop_candidate2;
9726 rtx cfi_ops = NULL;
9727 rtx_insn *insn;
9728 /* A stack clash protection prologue may not have left EP0_REGNUM or
9729 EP1_REGNUM in a usable state. The same is true for allocations
9730 with an SVE component, since we then need both temporary registers
9731 for each allocation. For stack clash we are in a usable state if
9732 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9733 HOST_WIDE_INT guard_size
9734 = 1 << param_stack_clash_protection_guard_size;
9735 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9736 aarch64_feature_flags force_isa_mode = 0;
9737 if (aarch64_cfun_enables_pstate_sm ())
9738 force_isa_mode = AARCH64_FL_SM_ON;
9740 /* We can re-use the registers when:
9742 (a) the deallocation amount is the same as the corresponding
9743 allocation amount (which is false if we combine the initial
9744 and SVE callee save allocations in the prologue); and
9746 (b) the allocation amount doesn't need a probe (which is false
9747 if the amount is guard_size - guard_used_by_caller or greater).
9749 In such situations the register should remain live with the correct
9750 value. */
9751 bool can_inherit_p = (initial_adjust.is_constant ()
9752 && final_adjust.is_constant ()
9753 && (!flag_stack_clash_protection
9754 || (known_lt (initial_adjust,
9755 guard_size - guard_used_by_caller)
9756 && known_eq (sve_callee_adjust, 0))));
9758 /* We need to add memory barrier to prevent read from deallocated stack. */
9759 bool need_barrier_p
9760 = maybe_ne (get_frame_size ()
9761 + frame.saved_varargs_size, 0);
9763 /* Reset PSTATE.SM, if required. */
9764 if (aarch64_cfun_enables_pstate_sm ())
9766 rtx_insn *guard_label = nullptr;
9767 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9768 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
9769 aarch64_isa_flags);
9770 aarch64_sme_mode_switch_regs return_switch;
9771 if (sibcall)
9772 return_switch.add_call_args (sibcall);
9773 else if (crtl->return_rtx && REG_P (crtl->return_rtx))
9774 return_switch.add_reg (GET_MODE (crtl->return_rtx),
9775 REGNO (crtl->return_rtx));
9776 return_switch.emit_prologue ();
9777 emit_insn (gen_aarch64_smstop_sm ());
9778 return_switch.emit_epilogue ();
9779 if (guard_label)
9780 emit_label (guard_label);
9783 /* Emit a barrier to prevent loads from a deallocated stack. */
9784 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9785 || cfun->calls_alloca
9786 || crtl->calls_eh_return)
9788 aarch64_emit_stack_tie (stack_pointer_rtx);
9789 need_barrier_p = false;
9792 /* Restore the stack pointer from the frame pointer if it may not
9793 be the same as the stack pointer. */
9794 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9795 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9796 if (frame_pointer_needed
9797 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9798 /* If writeback is used when restoring callee-saves, the CFA
9799 is restored on the instruction doing the writeback. */
9800 aarch64_add_offset (Pmode, stack_pointer_rtx,
9801 hard_frame_pointer_rtx,
9802 -bytes_below_hard_fp + final_adjust,
9803 tmp1_rtx, tmp0_rtx, force_isa_mode,
9804 callee_adjust == 0);
9805 else
9806 /* The case where we need to re-use the register here is very rare, so
9807 avoid the complicated condition and just always emit a move if the
9808 immediate doesn't fit. */
9809 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
9811 /* Restore the vector registers before the predicate registers,
9812 so that we can use P4 as a temporary for big-endian SVE frames. */
9813 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
9814 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
9815 if (maybe_ne (sve_callee_adjust, 0))
9816 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
9817 force_isa_mode, true);
9819 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9820 restore x30, so we don't need to restore x30 again in the traditional
9821 way. */
9822 aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
9823 frame.saved_gprs, &cfi_ops);
9825 if (need_barrier_p)
9826 aarch64_emit_stack_tie (stack_pointer_rtx);
9828 if (callee_adjust != 0)
9829 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9831 /* If we have no register restore information, the CFA must have been
9832 defined in terms of the stack pointer since the end of the prologue. */
9833 gcc_assert (cfi_ops || !frame_pointer_needed);
9835 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9837 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9838 insn = get_last_insn ();
9839 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9840 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9841 RTX_FRAME_RELATED_P (insn) = 1;
9842 cfi_ops = NULL;
9845 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
9846 we restrict the emit_move optimization to leaf functions. */
9847 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
9848 (!can_inherit_p || !crtl->is_leaf
9849 || df_regs_ever_live_p (EP0_REGNUM)));
9851 if (cfi_ops)
9853 /* Emit delayed restores and reset the CFA to be SP. */
9854 insn = get_last_insn ();
9855 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9856 REG_NOTES (insn) = cfi_ops;
9857 RTX_FRAME_RELATED_P (insn) = 1;
9860 /* Pop return address from shadow call stack. */
9861 if (frame.is_scs_enabled)
9863 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
9864 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
9866 insn = emit_insn (gen_scs_pop ());
9867 add_reg_note (insn, REG_CFA_RESTORE, reg);
9868 RTX_FRAME_RELATED_P (insn) = 1;
9871 /* Stack adjustment for exception handler. */
9872 if (crtl->calls_eh_return && !sibcall)
9874 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
9875 to unwind the stack and jump to the handler, otherwise
9876 skip this eh_return logic and continue with normal
9877 return after the label. We have already reset the CFA
9878 to be SP; letting the CFA move during this adjustment
9879 is just as correct as retaining the CFA from the body
9880 of the function. Therefore, do nothing special. */
9881 rtx label = gen_label_rtx ();
9882 rtx x = gen_rtx_EQ (VOIDmode, EH_RETURN_TAKEN_RTX, const0_rtx);
9883 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9884 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9885 rtx jump = emit_jump_insn (gen_rtx_SET (pc_rtx, x));
9886 JUMP_LABEL (jump) = label;
9887 LABEL_NUSES (label)++;
9888 emit_insn (gen_add2_insn (stack_pointer_rtx,
9889 EH_RETURN_STACKADJ_RTX));
9890 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
9891 emit_barrier ();
9892 emit_label (label);
9895 /* We prefer to emit the combined return/authenticate instruction RETAA,
9896 however there are two cases in which we must instead emit an explicit
9897 authentication instruction.
9899 1) Sibcalls don't return in a normal way, so if we're about to call one
9900 we must authenticate.
9902 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
9903 generating code for !TARGET_ARMV8_3 we can't use it and must
9904 explicitly authenticate. */
9906 if (aarch64_return_address_signing_enabled ()
9907 && (sibcall || !TARGET_ARMV8_3))
9909 switch (aarch_ra_sign_key)
9911 case AARCH_KEY_A:
9912 insn = emit_insn (gen_autiasp ());
9913 break;
9914 case AARCH_KEY_B:
9915 insn = emit_insn (gen_autibsp ());
9916 break;
9917 default:
9918 gcc_unreachable ();
9920 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9921 RTX_FRAME_RELATED_P (insn) = 1;
9924 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
9925 if (!sibcall)
9926 emit_jump_insn (ret_rtx);
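
/* Standalone sketch (not used by the compiler) of the can_inherit_p test
   near the top of aarch64_expand_epilogue: the temporaries left by the
   prologue can only be reused when both adjustments are compile-time
   constants and, under stack clash protection, the initial adjustment
   was small enough not to need a probe and no SVE allocation was folded
   into it.  Plain integers stand in for the poly_int64 values.  */

static inline bool
example_can_reuse_prologue_temps (bool initial_is_constant,
				  bool final_is_constant,
				  bool stack_clash_protection,
				  long initial_adjust, long sve_callee_adjust,
				  long guard_size, long caller_guard)
{
  return (initial_is_constant
	  && final_is_constant
	  && (!stack_clash_protection
	      || (initial_adjust < guard_size - caller_guard
		  && sve_callee_adjust == 0)));
}
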
9929 /* Output code to add DELTA to the first argument, and then jump
9930 to FUNCTION. Used for C++ multiple inheritance. */
9931 static void
9932 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
9933 HOST_WIDE_INT delta,
9934 HOST_WIDE_INT vcall_offset,
9935 tree function)
9937 /* The this pointer is always in x0. Note that this differs from
9938 Arm where the this pointer may be bumped to r1 if r0 is required
9939 to return a pointer to an aggregate. On AArch64 a result value
9940 pointer will be in x8. */
9941 int this_regno = R0_REGNUM;
9942 rtx this_rtx, temp0, temp1, addr, funexp;
9943 rtx_insn *insn;
9944 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
9946 if (aarch_bti_enabled ())
9947 emit_insn (gen_bti_c ());
9949 reload_completed = 1;
9950 emit_note (NOTE_INSN_PROLOGUE_END);
9952 this_rtx = gen_rtx_REG (Pmode, this_regno);
9953 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
9954 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
9956 if (vcall_offset == 0)
9957 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
9958 0, false);
9959 else
9961 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
9963 addr = this_rtx;
9964 if (delta != 0)
9966 if (delta >= -256 && delta < 256)
9967 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
9968 plus_constant (Pmode, this_rtx, delta));
9969 else
9970 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
9971 temp1, temp0, 0, false);
9974 if (Pmode == ptr_mode)
9975 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
9976 else
9977 aarch64_emit_move (temp0,
9978 gen_rtx_ZERO_EXTEND (Pmode,
9979 gen_rtx_MEM (ptr_mode, addr)));
9981 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
9982 addr = plus_constant (Pmode, temp0, vcall_offset);
9983 else
9985 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
9986 Pmode);
9987 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
9990 if (Pmode == ptr_mode)
9991 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
9992 else
9993 aarch64_emit_move (temp1,
9994 gen_rtx_SIGN_EXTEND (Pmode,
9995 gen_rtx_MEM (ptr_mode, addr)));
9997 emit_insn (gen_add2_insn (this_rtx, temp1));
10000 /* Generate a tail call to the target function. */
10001 if (!TREE_USED (function))
10003 assemble_external (function);
10004 TREE_USED (function) = 1;
10006 funexp = XEXP (DECL_RTL (function), 0);
10007 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10008 auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
10009 auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
10010 rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant);
10011 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10012 SIBLING_CALL_P (insn) = 1;
10014 insn = get_insns ();
10015 shorten_branches (insn);
10017 assemble_start_function (thunk, fnname);
10018 final_start_function (insn, file, 1);
10019 final (insn, file, 1);
10020 final_end_function ();
10021 assemble_end_function (thunk, fnname);
10023 /* Stop pretending to be a post-reload pass. */
10024 reload_completed = 0;
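
/* Standalone sketch (not used by the compiler) of the pointer adjustment
   that the emitted thunk performs at run time: the incoming this pointer
   is bumped by DELTA, and when VCALL_OFFSET is nonzero a further
   adjustment is loaded from the adjusted object's vtable at that byte
   offset and added as well.  */

static inline char *
example_thunk_adjust_this (char *this_ptr, long delta, long vcall_offset)
{
  this_ptr += delta;
  if (vcall_offset != 0)
    {
      /* *this_ptr holds the vtable pointer after the DELTA adjustment.  */
      char *vtable = *(char **) this_ptr;
      this_ptr += *(long *) (vtable + vcall_offset);
    }
  return this_ptr;
}
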
10027 static bool
10028 aarch64_tls_referenced_p (rtx x)
10030 if (!TARGET_HAVE_TLS)
10031 return false;
10032 subrtx_iterator::array_type array;
10033 FOR_EACH_SUBRTX (iter, array, x, ALL)
10035 const_rtx x = *iter;
10036 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10037 return true;
10038 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10039 TLS offsets, not real symbol references. */
10040 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10041 iter.skip_subrtxes ();
10043 return false;
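
/* Standalone sketch (not part of the compiler) of the shape of the scan
   above: walk an expression tree looking for nodes with a given
   property, but refuse to look inside nodes marked opaque, the analogue
   of calling skip_subrtxes for UNSPEC_TLS.  The toy node type is purely
   illustrative.  */

struct example_expr
{
  bool interesting;			/* The property being searched for.  */
  bool opaque;				/* Do not inspect the operands.  */
  struct example_expr *op0, *op1;	/* Subexpressions, possibly null.  */
};

static bool
example_expr_referenced_p (const struct example_expr *x)
{
  if (!x)
    return false;
  if (x->interesting)
    return true;
  if (x->opaque)
    return false;
  return (example_expr_referenced_p (x->op0)
	  || example_expr_referenced_p (x->op1));
}
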
10047 static bool
10048 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10050 if (GET_CODE (x) == HIGH)
10051 return true;
10053 /* There's no way to calculate VL-based values using relocations. */
10054 subrtx_iterator::array_type array;
10055 HOST_WIDE_INT factor;
10056 FOR_EACH_SUBRTX (iter, array, x, ALL)
10057 if (GET_CODE (*iter) == CONST_POLY_INT
10058 || aarch64_sme_vq_unspec_p (x, &factor))
10059 return true;
10061 poly_int64 offset;
10062 rtx base = strip_offset_and_salt (x, &offset);
10063 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10065 /* We checked for POLY_INT_CST offsets above. */
10066 if (aarch64_classify_symbol (base, offset.to_constant ())
10067 != SYMBOL_FORCE_TO_MEM)
10068 return true;
10069 else
10070 /* Avoid generating a 64-bit relocation in ILP32; leave
10071 to aarch64_expand_mov_immediate to handle it properly. */
10072 return mode != ptr_mode;
10075 return aarch64_tls_referenced_p (x);
10078 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10079 The expansion for a table switch is quite expensive due to the number
10080 of instructions, the table lookup and the hard-to-predict indirect jump.
10081 When optimizing for speed, and -O3 enabled, use the per-core tuning if
10082 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10083 performance. When optimizing for size, use 8 for smallest codesize. */
10085 static unsigned int
10086 aarch64_case_values_threshold (void)
10088 /* Use the specified limit for the number of cases before using jump
10089 tables at higher optimization levels. */
10090 if (optimize > 2
10091 && aarch64_tune_params.max_case_values != 0)
10092 return aarch64_tune_params.max_case_values;
10093 else
10094 return optimize_size ? 8 : 11;
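
/* Standalone restatement (not used by the compiler) of the policy above,
   with the inputs made explicit: honour a per-core limit at -O3 and
   above when one is set, otherwise require 11 cases when optimizing for
   speed and 8 when optimizing for size before a jump table is used.  */

static inline unsigned int
example_case_values_threshold (int optimize_level, bool optimize_size_p,
			       unsigned int tuned_max_case_values)
{
  if (optimize_level > 2 && tuned_max_case_values != 0)
    return tuned_max_case_values;
  return optimize_size_p ? 8 : 11;
}
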
10097 /* Return true if register REGNO is a valid index register.
10098 STRICT_P is true if REG_OK_STRICT is in effect. */
10100 bool
10101 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10103 if (!HARD_REGISTER_NUM_P (regno))
10105 if (!strict_p)
10106 return true;
10108 if (!reg_renumber)
10109 return false;
10111 regno = reg_renumber[regno];
10113 return GP_REGNUM_P (regno);
10116 /* Return true if register REGNO is a valid base register for mode MODE.
10117 STRICT_P is true if REG_OK_STRICT is in effect. */
10119 bool
10120 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10122 if (!HARD_REGISTER_NUM_P (regno))
10124 if (!strict_p)
10125 return true;
10127 if (!reg_renumber)
10128 return false;
10130 regno = reg_renumber[regno];
10133 /* The fake registers will be eliminated to either the stack or
10134 hard frame pointer, both of which are usually valid base registers.
10135 Reload deals with the cases where the eliminated form isn't valid. */
10136 return (GP_REGNUM_P (regno)
10137 || regno == SP_REGNUM
10138 || regno == FRAME_POINTER_REGNUM
10139 || regno == ARG_POINTER_REGNUM);
10142 /* Return true if X is a valid base register for mode MODE.
10143 STRICT_P is true if REG_OK_STRICT is in effect. */
10145 static bool
10146 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10148 if (!strict_p
10149 && SUBREG_P (x)
10150 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10151 x = SUBREG_REG (x);
10153 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10156 /* Return true if address offset is a valid index. If it is, fill in INFO
10157 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10159 static bool
10160 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10161 machine_mode mode, bool strict_p)
10163 enum aarch64_address_type type;
10164 rtx index;
10165 int shift;
10167 /* (reg:P) */
10168 if ((REG_P (x) || SUBREG_P (x))
10169 && GET_MODE (x) == Pmode)
10171 type = ADDRESS_REG_REG;
10172 index = x;
10173 shift = 0;
10175 /* (sign_extend:DI (reg:SI)) */
10176 else if ((GET_CODE (x) == SIGN_EXTEND
10177 || GET_CODE (x) == ZERO_EXTEND)
10178 && GET_MODE (x) == DImode
10179 && GET_MODE (XEXP (x, 0)) == SImode)
10181 type = (GET_CODE (x) == SIGN_EXTEND)
10182 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10183 index = XEXP (x, 0);
10184 shift = 0;
10186 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10187 else if (GET_CODE (x) == MULT
10188 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10189 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10190 && GET_MODE (XEXP (x, 0)) == DImode
10191 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10192 && CONST_INT_P (XEXP (x, 1)))
10194 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10195 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10196 index = XEXP (XEXP (x, 0), 0);
10197 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10199 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10200 else if (GET_CODE (x) == ASHIFT
10201 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10202 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10203 && GET_MODE (XEXP (x, 0)) == DImode
10204 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10205 && CONST_INT_P (XEXP (x, 1)))
10207 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10208 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10209 index = XEXP (XEXP (x, 0), 0);
10210 shift = INTVAL (XEXP (x, 1));
10212 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10213 (const_int 0xffffffff<<shift)) */
10214 else if (GET_CODE (x) == AND
10215 && GET_MODE (x) == DImode
10216 && GET_CODE (XEXP (x, 0)) == MULT
10217 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10218 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10219 && CONST_INT_P (XEXP (x, 1)))
10221 type = ADDRESS_REG_UXTW;
10222 index = XEXP (XEXP (x, 0), 0);
10223 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10224 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10225 shift = -1;
10227 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10228 (const_int 0xffffffff<<shift)) */
10229 else if (GET_CODE (x) == AND
10230 && GET_MODE (x) == DImode
10231 && GET_CODE (XEXP (x, 0)) == ASHIFT
10232 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10233 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10234 && CONST_INT_P (XEXP (x, 1)))
10236 type = ADDRESS_REG_UXTW;
10237 index = XEXP (XEXP (x, 0), 0);
10238 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10239 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10240 shift = -1;
10242 /* (mult:P (reg:P) (const_int scale)) */
10243 else if (GET_CODE (x) == MULT
10244 && GET_MODE (x) == Pmode
10245 && GET_MODE (XEXP (x, 0)) == Pmode
10246 && CONST_INT_P (XEXP (x, 1)))
10248 type = ADDRESS_REG_REG;
10249 index = XEXP (x, 0);
10250 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10252 /* (ashift:P (reg:P) (const_int shift)) */
10253 else if (GET_CODE (x) == ASHIFT
10254 && GET_MODE (x) == Pmode
10255 && GET_MODE (XEXP (x, 0)) == Pmode
10256 && CONST_INT_P (XEXP (x, 1)))
10258 type = ADDRESS_REG_REG;
10259 index = XEXP (x, 0);
10260 shift = INTVAL (XEXP (x, 1));
10262 else
10263 return false;
10265 if (!strict_p
10266 && SUBREG_P (index)
10267 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10268 index = SUBREG_REG (index);
10270 if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
10272 if (type != ADDRESS_REG_REG
10273 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10274 return false;
10276 else
10278 if (shift != 0
10279 && !(IN_RANGE (shift, 1, 3)
10280 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10281 return false;
10284 if (REG_P (index)
10285 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10287 info->type = type;
10288 info->offset = index;
10289 info->shift = shift;
10290 return true;
10293 return false;
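
/* Standalone sketch (not used by the compiler) of the shift check applied
   above for non-SVE modes: an index may be unscaled, or scaled left by
   1 to 3 bits provided the scale matches the access size exactly.  */

static inline bool
example_valid_index_shift_p (int shift, long access_size)
{
  if (shift == 0)
    return true;
  return shift >= 1 && shift <= 3 && (1L << shift) == access_size;
}
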
10296 /* Return true if MODE is one of the modes for which we
10297 support LDP/STP operations. */
10299 static bool
10300 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10302 return mode == SImode || mode == DImode
10303 || mode == SFmode || mode == DFmode
10304 || mode == SDmode || mode == DDmode
10305 || (aarch64_vector_mode_supported_p (mode)
10306 && (known_eq (GET_MODE_SIZE (mode), 8)
10307 || (known_eq (GET_MODE_SIZE (mode), 16)
10308 && (aarch64_tune_params.extra_tuning_flags
10309 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10312 /* Return true if REGNO is a virtual pointer register, or an eliminable
10313 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10314 include stack_pointer or hard_frame_pointer. */
10315 static bool
10316 virt_or_elim_regno_p (unsigned regno)
10318 return ((regno >= FIRST_VIRTUAL_REGISTER
10319 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10320 || regno == FRAME_POINTER_REGNUM
10321 || regno == ARG_POINTER_REGNUM);
10324 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10325 If it is, fill in INFO appropriately. STRICT_P is true if
10326 REG_OK_STRICT is in effect. */
10328 bool
10329 aarch64_classify_address (struct aarch64_address_info *info,
10330 rtx x, machine_mode mode, bool strict_p,
10331 aarch64_addr_query_type type)
10333 enum rtx_code code = GET_CODE (x);
10334 rtx op0, op1;
10335 poly_int64 offset;
10337 HOST_WIDE_INT const_size;
10339 /* Whether a vector mode is partial doesn't affect address legitimacy.
10340 Partial vectors like VNx8QImode allow the same indexed addressing
10341 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10342 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10343 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10344 vec_flags &= ~VEC_PARTIAL;
10346 /* On BE, we use load/store pair for all large int mode load/stores.
10347 TI/TF/TDmode may also use a load/store pair. */
10348 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10349 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10350 || type == ADDR_QUERY_LDP_STP_N
10351 || mode == TImode
10352 || mode == TFmode
10353 || mode == TDmode
10354 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10355 && advsimd_struct_p));
10356 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
10357 corresponds to the actual size of the memory being loaded/stored and the
10358 mode of the corresponding addressing mode is half of that. */
10359 if (type == ADDR_QUERY_LDP_STP_N)
10361 if (known_eq (GET_MODE_SIZE (mode), 32))
10362 mode = V16QImode;
10363 else if (known_eq (GET_MODE_SIZE (mode), 16))
10364 mode = DFmode;
10365 else if (known_eq (GET_MODE_SIZE (mode), 8))
10366 mode = SFmode;
10367 else
10368 return false;
10370 /* This isn't really an Advanced SIMD struct mode, but a mode
10371 used to represent the complete mem in a load/store pair. */
10372 advsimd_struct_p = false;
10375 bool allow_reg_index_p = (!load_store_pair_p
10376 && ((vec_flags == 0
10377 && known_lt (GET_MODE_SIZE (mode), 16))
10378 || vec_flags == VEC_ADVSIMD
10379 || vec_flags & VEC_SVE_DATA
10380 || mode == VNx1TImode));
10382 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10383 The latter is not valid for SVE predicates, and that's rejected through
10384 allow_reg_index_p above. */
10385 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10386 && (code != REG && code != PLUS))
10387 return false;
10389 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10390 REG addressing. */
10391 if (advsimd_struct_p
10392 && TARGET_SIMD
10393 && !BYTES_BIG_ENDIAN
10394 && (code != POST_INC && code != REG))
10395 return false;
10397 gcc_checking_assert (GET_MODE (x) == VOIDmode
10398 || SCALAR_INT_MODE_P (GET_MODE (x)));
10400 switch (code)
10402 case REG:
10403 case SUBREG:
10404 info->type = ADDRESS_REG_IMM;
10405 info->base = x;
10406 info->offset = const0_rtx;
10407 info->const_offset = 0;
10408 return aarch64_base_register_rtx_p (x, strict_p);
10410 case PLUS:
10411 op0 = XEXP (x, 0);
10412 op1 = XEXP (x, 1);
10414 if (! strict_p
10415 && REG_P (op0)
10416 && virt_or_elim_regno_p (REGNO (op0))
10417 && poly_int_rtx_p (op1, &offset))
10419 info->type = ADDRESS_REG_IMM;
10420 info->base = op0;
10421 info->offset = op1;
10422 info->const_offset = offset;
10424 return true;
10427 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10428 && aarch64_base_register_rtx_p (op0, strict_p)
10429 && poly_int_rtx_p (op1, &offset))
10431 info->type = ADDRESS_REG_IMM;
10432 info->base = op0;
10433 info->offset = op1;
10434 info->const_offset = offset;
10436 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10437 registers and individual Q registers. The available
10438 address modes are:
10439 X,X: 7-bit signed scaled offset
10440 Q: 9-bit signed offset
10441 We conservatively require an offset representable in either mode.
10442 When performing the check for pairs of X registers i.e. LDP/STP
10443 pass down DImode since that is the natural size of the LDP/STP
10444 instruction memory accesses. */
10445 if (mode == TImode || mode == TFmode || mode == TDmode)
10446 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10447 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10448 || offset_12bit_unsigned_scaled_p (mode, offset)));
10450 if (mode == V8DImode)
10451 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10452 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10454 /* A 7bit offset check because OImode will emit a ldp/stp
10455 instruction (only !TARGET_SIMD or big endian will get here).
10456 For ldp/stp instructions, the offset is scaled for the size of a
10457 single element of the pair. */
10458 if (aarch64_advsimd_partial_struct_mode_p (mode)
10459 && known_eq (GET_MODE_SIZE (mode), 16))
10460 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10461 if (aarch64_advsimd_full_struct_mode_p (mode)
10462 && known_eq (GET_MODE_SIZE (mode), 32))
10463 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10465 /* Three 9/12-bit offset checks because CImode will emit three
10466 ldr/str instructions (only !TARGET_SIMD or big endian will
10467 get here). */
10468 if (aarch64_advsimd_partial_struct_mode_p (mode)
10469 && known_eq (GET_MODE_SIZE (mode), 24))
10470 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10471 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10472 offset + 16)
10473 || offset_12bit_unsigned_scaled_p (DImode,
10474 offset + 16)));
10475 if (aarch64_advsimd_full_struct_mode_p (mode)
10476 && known_eq (GET_MODE_SIZE (mode), 48))
10477 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10478 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10479 offset + 32)
10480 || offset_12bit_unsigned_scaled_p (TImode,
10481 offset + 32)));
10483 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10484 instructions (only big endian will get here). */
10485 if (aarch64_advsimd_partial_struct_mode_p (mode)
10486 && known_eq (GET_MODE_SIZE (mode), 32))
10487 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10488 && aarch64_offset_7bit_signed_scaled_p (DImode,
10489 offset + 16));
10490 if (aarch64_advsimd_full_struct_mode_p (mode)
10491 && known_eq (GET_MODE_SIZE (mode), 64))
10492 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10493 && aarch64_offset_7bit_signed_scaled_p (TImode,
10494 offset + 32));
10496 /* Make "m" use the LD1 offset range for SVE data modes, so
10497 that pre-RTL optimizers like ivopts optimize for that range
10498 instead of the wider LDR/STR range. */
10499 if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
10500 return (type == ADDR_QUERY_M
10501 ? offset_4bit_signed_scaled_p (mode, offset)
10502 : offset_9bit_signed_scaled_p (mode, offset));
10504 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10506 poly_int64 end_offset = (offset
10507 + GET_MODE_SIZE (mode)
10508 - BYTES_PER_SVE_VECTOR);
10509 return (type == ADDR_QUERY_M
10510 ? offset_4bit_signed_scaled_p (mode, offset)
10511 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10512 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10513 end_offset)));
10516 if (vec_flags == VEC_SVE_PRED)
10517 return offset_9bit_signed_scaled_p (mode, offset);
10519 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
10521 poly_int64 end_offset = (offset
10522 + GET_MODE_SIZE (mode)
10523 - BYTES_PER_SVE_PRED);
10524 return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
10525 && offset_9bit_signed_scaled_p (VNx16BImode, offset));
10528 if (load_store_pair_p)
10529 return ((known_eq (GET_MODE_SIZE (mode), 4)
10530 || known_eq (GET_MODE_SIZE (mode), 8)
10531 || known_eq (GET_MODE_SIZE (mode), 16))
10532 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10533 else
10534 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10535 || offset_12bit_unsigned_scaled_p (mode, offset));
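/* Concrete examples of the load/store-pair range above (illustrative):
   for a pair of W registers the offset is a signed 7-bit multiple of 4,
   i.e. -256 .. 252, and for X registers a multiple of 8 in -512 .. 504:

     ldp w0, w1, [sp, #252]    // accepted
     ldp x0, x1, [sp, #-512]   // accepted
     ldp x0, x1, [sp, #512]    // rejected: outside the imm7 range  */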
10538 if (allow_reg_index_p)
10540 /* Look for base + (scaled/extended) index register. */
10541 if (aarch64_base_register_rtx_p (op0, strict_p)
10542 && aarch64_classify_index (info, op1, mode, strict_p))
10544 info->base = op0;
10545 return true;
10547 if (aarch64_base_register_rtx_p (op1, strict_p)
10548 && aarch64_classify_index (info, op0, mode, strict_p))
10550 info->base = op1;
10551 return true;
10555 return false;
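/* A base plus scaled or extended index accepted by the code above
   corresponds to addresses such as (illustrative):

     ldr w0, [x1, x2, lsl #2]    // 64-bit index scaled by the access size
     ldr x0, [x1, w2, sxtw #3]   // sign-extended 32-bit index, scaled
     ldrb w0, [x1, x2]           // plain register index  */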
10557 case POST_INC:
10558 case POST_DEC:
10559 case PRE_INC:
10560 case PRE_DEC:
10561 info->type = ADDRESS_REG_WB;
10562 info->base = XEXP (x, 0);
10563 info->offset = NULL_RTX;
10564 return aarch64_base_register_rtx_p (info->base, strict_p);
10566 case POST_MODIFY:
10567 case PRE_MODIFY:
10568 info->type = ADDRESS_REG_WB;
10569 info->base = XEXP (x, 0);
10570 if (GET_CODE (XEXP (x, 1)) == PLUS
10571 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10572 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10573 && aarch64_base_register_rtx_p (info->base, strict_p))
10575 info->offset = XEXP (XEXP (x, 1), 1);
10576 info->const_offset = offset;
10578 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10579 registers and individual Q registers. The available
10580 address modes are:
10581 X,X: 7-bit signed scaled offset
10582 Q: 9-bit signed offset
10583 We conservatively require an offset representable in either mode. */
10585 if (mode == TImode || mode == TFmode || mode == TDmode)
10586 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10587 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10589 if (load_store_pair_p)
10590 return ((known_eq (GET_MODE_SIZE (mode), 4)
10591 || known_eq (GET_MODE_SIZE (mode), 8)
10592 || known_eq (GET_MODE_SIZE (mode), 16))
10593 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10594 else
10595 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10597 return false;
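/* The writeback forms classified above correspond to addresses like
   (illustrative):

     ldr x0, [x1], #16          // post-index, 9-bit signed immediate
     str x0, [x1, #-16]!        // pre-index
     stp x29, x30, [sp, #-32]!  // pre-index pair, 7-bit signed scaled  */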
10599 case CONST:
10600 case SYMBOL_REF:
10601 case LABEL_REF:
10602 /* load literal: pc-relative constant pool entry. Only supported
10603 for SI mode or larger. */
10604 info->type = ADDRESS_SYMBOLIC;
10606 if (!load_store_pair_p
10607 && GET_MODE_SIZE (mode).is_constant (&const_size)
10608 && const_size >= 4)
10610 poly_int64 offset;
10611 rtx sym = strip_offset_and_salt (x, &offset);
10612 return ((LABEL_REF_P (sym)
10613 || (SYMBOL_REF_P (sym)
10614 && CONSTANT_POOL_ADDRESS_P (sym)
10615 && aarch64_pcrelative_literal_loads)));
10617 return false;
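/* A PC-relative literal load accepted here typically looks like
   (illustrative):

     ldr w0, .LC0     // 4-byte constant from the literal pool
     ldr q0, .LC1     // 16-byte constant, still a single literal load

   and requires aarch64_pcrelative_literal_loads plus a pool entry within
   the +/-1MiB LDR (literal) range.  */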
10619 case LO_SUM:
10620 info->type = ADDRESS_LO_SUM;
10621 info->base = XEXP (x, 0);
10622 info->offset = XEXP (x, 1);
10623 if (allow_reg_index_p
10624 && aarch64_base_register_rtx_p (info->base, strict_p))
10626 poly_int64 offset;
10627 HOST_WIDE_INT const_offset;
10628 rtx sym = strip_offset_and_salt (info->offset, &offset);
10629 if (SYMBOL_REF_P (sym)
10630 && offset.is_constant (&const_offset)
10631 && (aarch64_classify_symbol (sym, const_offset)
10632 == SYMBOL_SMALL_ABSOLUTE))
10634 /* The symbol and offset must be aligned to the access size. */
10635 unsigned int align;
10637 if (CONSTANT_POOL_ADDRESS_P (sym))
10638 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10639 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10641 tree exp = SYMBOL_REF_DECL (sym);
10642 align = TYPE_ALIGN (TREE_TYPE (exp));
10643 align = aarch64_constant_alignment (exp, align);
10645 else if (SYMBOL_REF_DECL (sym))
10646 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10647 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10648 && SYMBOL_REF_BLOCK (sym) != NULL)
10649 align = SYMBOL_REF_BLOCK (sym)->alignment;
10650 else
10651 align = BITS_PER_UNIT;
10653 poly_int64 ref_size = GET_MODE_SIZE (mode);
10654 if (known_eq (ref_size, 0))
10655 ref_size = GET_MODE_SIZE (DImode);
10657 return (multiple_p (const_offset, ref_size)
10658 && multiple_p (align / BITS_PER_UNIT, ref_size));
10661 return false;
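/* The LO_SUM form is the second half of the small-model ADRP sequence,
   e.g. (illustrative, "some_global" being a placeholder symbol):

     adrp x0, some_global
     ldr  w1, [x0, #:lo12:some_global]

   The alignment check above guarantees that the :lo12: part is a multiple
   of the access size, as the scaled immediate encoding requires.  */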
10663 default:
10664 return false;
10668 /* Return true if the address X is valid for a PRFM instruction.
10669 STRICT_P is true if we should do strict checking with
10670 aarch64_classify_address. */
10672 bool
10673 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10675 struct aarch64_address_info addr;
10677 /* PRFM accepts the same addresses as DImode... */
10678 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10679 if (!res)
10680 return false;
10682 /* ... except writeback forms. */
10683 return addr.type != ADDRESS_REG_WB;
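/* For instance (illustrative), "prfm pldl1keep, [x0, #256]" is accepted,
   whereas post-indexed or pre-indexed addresses are rejected here because
   PRFM has no writeback forms.  */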
10686 bool
10687 aarch64_symbolic_address_p (rtx x)
10689 poly_int64 offset;
10690 x = strip_offset_and_salt (x, &offset);
10691 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10694 /* Classify the base of symbolic expression X. */
10696 enum aarch64_symbol_type
10697 aarch64_classify_symbolic_expression (rtx x)
10699 rtx offset;
10701 split_const (x, &x, &offset);
10702 return aarch64_classify_symbol (x, INTVAL (offset));
10706 /* Return TRUE if X is a legitimate address for accessing memory in
10707 mode MODE. */
10708 static bool
10709 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
10710 code_helper = ERROR_MARK)
10712 struct aarch64_address_info addr;
10714 return aarch64_classify_address (&addr, x, mode, strict_p);
10717 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10718 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10719 bool
10720 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10721 aarch64_addr_query_type type)
10723 struct aarch64_address_info addr;
10725 return aarch64_classify_address (&addr, x, mode, strict_p, type);
10728 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10730 static bool
10731 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10732 poly_int64 orig_offset,
10733 machine_mode mode)
10735 HOST_WIDE_INT size;
10736 if (GET_MODE_SIZE (mode).is_constant (&size))
10738 HOST_WIDE_INT const_offset, second_offset;
10740 /* A general SVE offset is A * VQ + B. Remove the A component from
10741 coefficient 0 in order to get the constant B. */
10742 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10744 /* Split an out-of-range address displacement into a base and
10745 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10746 range otherwise to increase opportunities for sharing the base
10747 address between accesses of different sizes. Unaligned accesses use the signed
10748 9-bit range, TImode/TFmode/TDmode use the intersection of signed
10749 scaled 7-bit and signed 9-bit offset. */
10750 if (mode == TImode || mode == TFmode || mode == TDmode)
10751 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10752 else if ((const_offset & (size - 1)) != 0)
10753 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10754 else
10755 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10757 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10758 return false;
10760 /* Split the offset into second_offset and the rest. */
10761 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10762 *offset2 = gen_int_mode (second_offset, Pmode);
10763 return true;
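/* Worked example for the constant-size case (illustrative): an SImode
   access at base + 0x10008 is split into an anchor of base + 0x10000 and
   a residual offset of 8, roughly

     add  x1, x0, #0x10, lsl #12   // anchor = base + 0x10000
     ldr  w2, [x1, #8]

   so that neighbouring accesses can reuse the same anchor register.  */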
10765 else
10767 /* Get the mode we should use as the basis of the range. For structure
10768 modes this is the mode of one vector. */
10769 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10770 machine_mode step_mode
10771 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10773 /* Get the "mul vl" multiplier we'd like to use. */
10774 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10775 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10776 if (vec_flags & VEC_SVE_DATA)
10777 /* LDR supports a 9-bit range, but the move patterns for
10778 structure modes require all vectors to be in range of the
10779 same base. The simplest way of accommodating that while still
10780 promoting reuse of anchor points between different modes is
10781 to use an 8-bit range unconditionally. */
10782 vnum = ((vnum + 128) & 255) - 128;
10783 else
10784 /* Predicates are only handled singly, so we might as well use
10785 the full range. */
10786 vnum = ((vnum + 256) & 511) - 256;
10787 if (vnum == 0)
10788 return false;
10790 /* Convert the "mul vl" multiplier into a byte offset. */
10791 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10792 if (known_eq (second_offset, orig_offset))
10793 return false;
10795 /* Split the offset into second_offset and the rest. */
10796 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10797 *offset2 = gen_int_mode (second_offset, Pmode);
10798 return true;
10802 /* Return the binary representation of floating point constant VALUE in INTVAL.
10803 If the value cannot be converted, return false without setting INTVAL.
10804 The conversion is done in the mode of VALUE. */
10805 bool
10806 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10809 /* We make a general exception for 0. */
10810 if (aarch64_float_const_zero_rtx_p (value))
10812 *intval = 0;
10813 return true;
10816 scalar_float_mode mode;
10817 if (!CONST_DOUBLE_P (value)
10818 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
10819 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10820 /* Only support up to DF mode. */
10821 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10822 return false;
10824 unsigned HOST_WIDE_INT ival = 0;
10826 long res[2];
10827 real_to_target (res,
10828 CONST_DOUBLE_REAL_VALUE (value),
10829 REAL_MODE_FORMAT (mode));
10831 if (mode == DFmode || mode == DDmode)
10833 int order = BYTES_BIG_ENDIAN ? 1 : 0;
10834 ival = zext_hwi (res[order], 32);
10835 ival |= (zext_hwi (res[1 - order], 32) << 32);
10837 else
10838 ival = zext_hwi (res[0], 32);
10840 *intval = ival;
10841 return true;
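/* For example: the DFmode constant 1.0 yields 0x3ff0000000000000 and the
   SFmode constant 1.0f yields 0x3f800000, i.e. the raw IEEE-754 encodings
   of those values.  */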
10844 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10845 single MOV(+MOVK) followed by an FMOV. */
10846 bool
10847 aarch64_float_const_rtx_p (rtx x)
10849 machine_mode mode = GET_MODE (x);
10850 if (mode == VOIDmode)
10851 return false;
10853 /* Determine whether it's cheaper to write float constants as
10854 mov/movk pairs rather than as ldr/adrp pairs. */
10855 unsigned HOST_WIDE_INT ival;
10857 if (CONST_DOUBLE_P (x)
10858 && SCALAR_FLOAT_MODE_P (mode)
10859 && aarch64_reinterpret_float_as_int (x, &ival))
10861 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
10862 int num_instr = aarch64_internal_mov_immediate
10863 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10864 return num_instr < 3;
10867 return false;
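/* For example (illustrative): the DFmode constant 4294967296.0 (2^32) has
   the bit pattern 0x41f0000000000000, which a single MOVZ can build, so it
   can be materialised as

     movz x1, #0x41f0, lsl #48
     fmov d0, x1

   rather than through a literal-pool ADRP/LDR pair.  */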
10870 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
10871 Floating Point). */
10872 bool
10873 aarch64_float_const_zero_rtx_p (rtx x)
10875 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
10876 zr as our callers expect, so no need to check the actual
10877 value if X is of Decimal Floating Point type. */
10878 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
10879 return false;
10881 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
10882 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
10883 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
10886 /* Return true if X is any kind of constant zero rtx. */
10888 bool
10889 aarch64_const_zero_rtx_p (rtx x)
10891 return (x == CONST0_RTX (GET_MODE (x))
10892 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
10895 /* Return TRUE if rtx X is an immediate constant that fits in a single
10896 MOVI immediate operation. */
10897 bool
10898 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
10900 if (!TARGET_SIMD)
10901 return false;
10903 machine_mode vmode;
10904 scalar_int_mode imode;
10905 unsigned HOST_WIDE_INT ival;
10907 if (CONST_DOUBLE_P (x)
10908 && SCALAR_FLOAT_MODE_P (mode))
10910 if (!aarch64_reinterpret_float_as_int (x, &ival))
10911 return false;
10913 /* We make a general exception for 0. */
10914 if (aarch64_float_const_zero_rtx_p (x))
10915 return true;
10917 imode = int_mode_for_mode (mode).require ();
10919 else if (CONST_INT_P (x)
10920 && is_a <scalar_int_mode> (mode, &imode))
10921 ival = INTVAL (x);
10922 else
10923 return false;
10925 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we use
10926 a 128-bit vector mode. */
10927 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
10929 vmode = aarch64_simd_container_mode (imode, width);
10930 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
10932 return aarch64_simd_valid_immediate (v_op, NULL);
10936 /* Return the fixed registers used for condition codes. */
10938 static bool
10939 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
10941 *p1 = CC_REGNUM;
10942 *p2 = INVALID_REGNUM;
10943 return true;
10946 /* Return a fresh memory reference to the current function's TPIDR2 block,
10947 creating a block if necessary. */
10949 static rtx
10950 aarch64_get_tpidr2_block ()
10952 if (!cfun->machine->tpidr2_block)
10953 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
10954 boundary. */
10955 cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
10956 return copy_rtx (cfun->machine->tpidr2_block);
10959 /* Return a fresh register that points to the current function's
10960 TPIDR2 block, creating a block if necessary. */
10962 static rtx
10963 aarch64_get_tpidr2_ptr ()
10965 rtx block = aarch64_get_tpidr2_block ();
10966 return force_reg (Pmode, XEXP (block, 0));
10969 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
10970 current function's TPIDR2 block. */
10972 static void
10973 aarch64_init_tpidr2_block ()
10975 rtx block = aarch64_get_tpidr2_block ();
10977 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
10978 rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
10979 rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
10980 rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
10981 svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
10982 rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
10983 BITS_PER_UNIT, -1, true);
10984 za_save_buffer = force_reg (Pmode, za_save_buffer);
10985 cfun->machine->za_save_buffer = za_save_buffer;
10987 /* The first word of the block points to the save buffer and the second
10988 word is the number of ZA slices to save. */
10989 rtx block_0 = adjust_address (block, DImode, 0);
10990 emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
10992 if (!memory_operand (block, V16QImode))
10993 block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
10994 emit_insn (gen_aarch64_setup_local_tpidr2 (block));
10997 /* Restore the contents of ZA from the lazy save buffer, given that
10998 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
10999 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11001 void
11002 aarch64_restore_za (rtx tpidr2_block)
11004 emit_insn (gen_aarch64_smstart_za ());
11005 if (REGNO (tpidr2_block) != R0_REGNUM)
11006 emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
11007 emit_insn (gen_aarch64_tpidr2_restore ());
11010 /* Return the ZT0 save buffer, creating one if necessary. */
11012 static rtx
11013 aarch64_get_zt0_save_buffer ()
11015 if (!cfun->machine->zt0_save_buffer)
11016 cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
11017 return cfun->machine->zt0_save_buffer;
11020 /* Save ZT0 to the current function's save buffer. */
11022 static void
11023 aarch64_save_zt0 ()
11025 rtx mem = aarch64_get_zt0_save_buffer ();
11026 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11027 emit_insn (gen_aarch64_sme_str_zt0 (mem));
11030 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11031 is true if the load is happening after a call to a private-ZA function,
11032 false if it can be treated as a normal load. */
11034 static void
11035 aarch64_restore_zt0 (bool from_lazy_save_p)
11037 rtx mem = aarch64_get_zt0_save_buffer ();
11038 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11039 emit_insn (from_lazy_save_p
11040 ? gen_aarch64_restore_zt0 (mem)
11041 : gen_aarch64_sme_ldr_zt0 (mem));
11044 /* Implement TARGET_START_CALL_ARGS. */
11046 static void
11047 aarch64_start_call_args (cumulative_args_t ca_v)
11049 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11051 if (!TARGET_SME && (ca->isa_mode & AARCH64_FL_SM_ON))
11053 error ("calling a streaming function requires the ISA extension %qs",
11054 "sme");
11055 inform (input_location, "you can enable %qs using the command-line"
11056 " option %<-march%>, or by using the %<target%>"
11057 " attribute or pragma", "sme");
11060 if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11061 && !aarch64_cfun_has_state ("za"))
11062 error ("call to a function that shares %qs state from a function"
11063 " that has no %qs state", "za", "za");
11064 else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11065 && !aarch64_cfun_has_state ("zt0"))
11066 error ("call to a function that shares %qs state from a function"
11067 " that has no %qs state", "zt0", "zt0");
11068 else if (!TARGET_ZA && (ca->isa_mode & AARCH64_FL_ZA_ON))
11069 error ("call to a function that shares SME state from a function"
11070 " that has no SME state");
11072 /* If this is a call to a private ZA function, emit a marker to
11073 indicate where any necessary set-up code could be inserted.
11074 The code itself is inserted by the mode-switching pass. */
11075 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11076 emit_insn (gen_aarch64_start_private_za_call ());
11078 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11079 save and restore ZT0 around the call. */
11080 if (aarch64_cfun_has_state ("zt0")
11081 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11082 && ca->shared_zt0_flags == 0)
11083 aarch64_save_zt0 ();
11086 /* This function is used by the call expanders of the machine description.
11087 RESULT is the register in which the result is returned. It's NULL for
11088 "call" and "sibcall".
11089 MEM is the location of the function call.
11090 COOKIE is either:
11091 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11092 - a PARALLEL that contains such a const_int as its first element.
11093 The second element is a PARALLEL that lists all the argument
11094 registers that need to be saved and restored around a change
11095 in PSTATE.SM, or const0_rtx if no such switch is needed.
11096 The third and fourth elements are const_ints that contain the
11097 sharing flags for ZA and ZT0 respectively.
11098 SIBCALL indicates whether this function call is a normal call or a sibling call.
11099 It will generate a different pattern accordingly. */
11101 void
11102 aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
11104 rtx call, callee, tmp;
11105 rtvec vec;
11106 machine_mode mode;
11108 rtx callee_abi = cookie;
11109 rtx sme_mode_switch_args = const0_rtx;
11110 unsigned int shared_za_flags = 0;
11111 unsigned int shared_zt0_flags = 0;
11112 if (GET_CODE (cookie) == PARALLEL)
11114 callee_abi = XVECEXP (cookie, 0, 0);
11115 sme_mode_switch_args = XVECEXP (cookie, 0, 1);
11116 shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
11117 shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
11120 gcc_assert (CONST_INT_P (callee_abi));
11121 auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
11123 if (aarch64_cfun_has_state ("za")
11124 && (callee_isa_mode & AARCH64_FL_ZA_ON)
11125 && !shared_za_flags)
11127 sorry ("call to a function that shares state other than %qs"
11128 " from a function that has %qs state", "za", "za");
11129 inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
11130 " callee preserves ZA");
11133 gcc_assert (MEM_P (mem));
11134 callee = XEXP (mem, 0);
11135 mode = GET_MODE (callee);
11136 gcc_assert (mode == Pmode);
11138 /* Decide if we should generate indirect calls by loading the
11139 address of the callee into a register before performing
11140 the branch-and-link. */
11141 if (SYMBOL_REF_P (callee)
11142 ? (aarch64_is_long_call_p (callee)
11143 || aarch64_is_noplt_call_p (callee))
11144 : !REG_P (callee))
11145 XEXP (mem, 0) = force_reg (mode, callee);
11147 /* Accumulate the return values, including state that is shared via
11148 attributes. */
11149 auto_vec<rtx, 8> return_values;
11150 if (result)
11152 if (GET_CODE (result) == PARALLEL)
11153 for (int i = 0; i < XVECLEN (result, 0); ++i)
11154 return_values.safe_push (XVECEXP (result, 0, i));
11155 else
11156 return_values.safe_push (result);
11158 unsigned int orig_num_return_values = return_values.length ();
11159 if (shared_za_flags & AARCH64_STATE_OUT)
11160 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11161 /* When calling private-ZA functions from functions with ZA state,
11162 we want to know whether the call committed a lazy save. */
11163 if (TARGET_ZA && !shared_za_flags)
11164 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11165 if (shared_zt0_flags & AARCH64_STATE_OUT)
11166 return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
11168 /* Create the new return value, if necessary. */
11169 if (orig_num_return_values != return_values.length ())
11171 if (return_values.length () == 1)
11172 result = return_values[0];
11173 else
11175 for (rtx &x : return_values)
11176 if (GET_CODE (x) != EXPR_LIST)
11177 x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
11178 rtvec v = gen_rtvec_v (return_values.length (),
11179 return_values.address ());
11180 result = gen_rtx_PARALLEL (VOIDmode, v);
11184 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11186 if (result != NULL_RTX)
11187 call = gen_rtx_SET (result, call);
11189 if (sibcall)
11190 tmp = ret_rtx;
11191 else
11192 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11194 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11195 UNSPEC_CALLEE_ABI);
11197 vec = gen_rtvec (3, call, callee_abi, tmp);
11198 call = gen_rtx_PARALLEL (VOIDmode, vec);
11200 auto call_insn = aarch64_emit_call_insn (call);
11202 /* Check whether the call requires a change to PSTATE.SM. We can't
11203 emit the instructions to change PSTATE.SM yet, since they involve
11204 a change in vector length and a change in instruction set, which
11205 cannot be represented in RTL.
11207 For now, just record which registers will be clobbered and used
11208 by the changes to PSTATE.SM. */
11209 if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
11211 aarch64_sme_mode_switch_regs args_switch;
11212 if (sme_mode_switch_args != const0_rtx)
11214 unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
11215 for (unsigned int i = 0; i < num_args; ++i)
11217 rtx x = XVECEXP (sme_mode_switch_args, 0, i);
11218 args_switch.add_reg (GET_MODE (x), REGNO (x));
11222 aarch64_sme_mode_switch_regs result_switch;
11223 if (result)
11224 result_switch.add_call_result (call_insn);
11226 unsigned int num_gprs = MAX (args_switch.num_gprs (),
11227 result_switch.num_gprs ());
11228 for (unsigned int i = 0; i < num_gprs; ++i)
11229 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11230 gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
11232 for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
11233 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11234 gen_rtx_REG (V4x16QImode, regno));
11236 for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
11237 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11238 gen_rtx_REG (VNx16BImode, regno));
11240 /* Ensure that the VG save slot has been initialized. Also emit
11241 an instruction to model the effect of the temporary clobber
11242 of VG, so that the prologue/epilogue pass sees the need to
11243 save the old value. */
11244 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11245 gen_rtx_REG (DImode, VG_REGNUM));
11246 emit_insn_before (gen_aarch64_update_vg (), call_insn);
11248 cfun->machine->call_switches_pstate_sm = true;
11251 /* Add any ZA-related information.
11253 ZA_REGNUM represents the current function's ZA state, rather than
11254 the contents of the ZA register itself. We ensure that the function's
11255 ZA state is preserved by private-ZA call sequences, so the call itself
11256 does not use or clobber ZA_REGNUM. The same thing applies to
11257 ZT0_REGNUM. */
11258 if (TARGET_ZA)
11260 /* The callee requires ZA to be active if the callee is shared-ZA,
11261 otherwise it requires ZA to be dormant or off. The state of ZA is
11262 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11263 and ZA_SAVED_REGNUM. */
11264 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11265 gen_rtx_REG (DImode, SME_STATE_REGNUM));
11266 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11267 gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
11268 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11269 gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11271 /* Keep the aarch64_start/end_private_za_call markers live. */
11272 if (!(callee_isa_mode & AARCH64_FL_ZA_ON))
11273 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11274 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
11276 /* If the callee is a shared-ZA function, record whether it uses the
11277 current value of ZA and ZT0. */
11278 if (shared_za_flags & AARCH64_STATE_IN)
11279 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11280 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11282 if (shared_zt0_flags & AARCH64_STATE_IN)
11283 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11284 gen_rtx_REG (V8DImode, ZT0_REGNUM));
11288 /* Implement TARGET_END_CALL_ARGS. */
11290 static void
11291 aarch64_end_call_args (cumulative_args_t ca_v)
11293 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11295 /* If this is a call to a private ZA function, emit a marker to
11296 indicate where any necessary restoration code could be inserted.
11297 The code itself is inserted by the mode-switching pass. */
11298 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11299 emit_insn (gen_aarch64_end_private_za_call ());
11301 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11302 save and restore ZT0 around the call. */
11303 if (aarch64_cfun_has_state ("zt0")
11304 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11305 && ca->shared_zt0_flags == 0)
11306 aarch64_restore_zt0 (false);
11309 /* Emit call insn with PAT and do aarch64-specific handling. */
11311 rtx_call_insn *
11312 aarch64_emit_call_insn (rtx pat)
11314 auto insn = emit_call_insn (pat);
11316 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11317 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11318 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11319 return as_a<rtx_call_insn *> (insn);
11322 machine_mode
11323 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11325 machine_mode mode_x = GET_MODE (x);
11326 rtx_code code_x = GET_CODE (x);
11328 /* All floating point compares return CCFP if it is an equality
11329 comparison, and CCFPE otherwise. */
11330 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11332 switch (code)
11334 case EQ:
11335 case NE:
11336 case UNORDERED:
11337 case ORDERED:
11338 case UNLT:
11339 case UNLE:
11340 case UNGT:
11341 case UNGE:
11342 case UNEQ:
11343 return CCFPmode;
11345 case LT:
11346 case LE:
11347 case GT:
11348 case GE:
11349 case LTGT:
11350 return CCFPEmode;
11352 default:
11353 gcc_unreachable ();
11357 /* Equality comparisons of short modes against zero can be performed
11358 using the TST instruction with the appropriate bitmask. */
11359 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11360 && (code == EQ || code == NE)
11361 && (mode_x == HImode || mode_x == QImode))
11362 return CC_Zmode;
11364 /* Similarly, comparisons of zero_extends from shorter modes can
11365 be performed using an ANDS with an immediate mask. */
11366 if (y == const0_rtx && code_x == ZERO_EXTEND
11367 && (mode_x == SImode || mode_x == DImode)
11368 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11369 && (code == EQ || code == NE))
11370 return CC_Zmode;
11372 /* Zero extracts support equality comparisons. */
11373 if ((mode_x == SImode || mode_x == DImode)
11374 && y == const0_rtx
11375 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11376 && CONST_INT_P (XEXP (x, 2)))
11377 && (code == EQ || code == NE))
11378 return CC_Zmode;
11380 /* ANDS/BICS/TST support equality and all signed comparisons. */
11381 if ((mode_x == SImode || mode_x == DImode)
11382 && y == const0_rtx
11383 && (code_x == AND)
11384 && (code == EQ || code == NE || code == LT || code == GE
11385 || code == GT || code == LE))
11386 return CC_NZVmode;
11388 /* ADDS/SUBS correctly set N and Z flags. */
11389 if ((mode_x == SImode || mode_x == DImode)
11390 && y == const0_rtx
11391 && (code == EQ || code == NE || code == LT || code == GE)
11392 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11393 return CC_NZmode;
11395 /* A compare with a shifted operand. Because of canonicalization,
11396 the comparison will have to be swapped when we emit the assembly
11397 code. */
11398 if ((mode_x == SImode || mode_x == DImode)
11399 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11400 && (code_x == ASHIFT || code_x == ASHIFTRT
11401 || code_x == LSHIFTRT
11402 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11403 return CC_SWPmode;
11405 /* Similarly for a negated operand, but we can only do this for
11406 equalities. */
11407 if ((mode_x == SImode || mode_x == DImode)
11408 && (REG_P (y) || SUBREG_P (y))
11409 && (code == EQ || code == NE)
11410 && code_x == NEG)
11411 return CC_Zmode;
11413 /* A test for unsigned overflow from an addition. */
11414 if ((mode_x == DImode || mode_x == TImode)
11415 && (code == LTU || code == GEU)
11416 && code_x == PLUS
11417 && rtx_equal_p (XEXP (x, 0), y))
11418 return CC_Cmode;
11420 /* A test for unsigned overflow from an add with carry. */
11421 if ((mode_x == DImode || mode_x == TImode)
11422 && (code == LTU || code == GEU)
11423 && code_x == PLUS
11424 && CONST_SCALAR_INT_P (y)
11425 && (rtx_mode_t (y, mode_x)
11426 == (wi::shwi (1, mode_x)
11427 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11428 return CC_ADCmode;
11430 /* A test for signed overflow. */
11431 if ((mode_x == DImode || mode_x == TImode)
11432 && code == NE
11433 && code_x == PLUS
11434 && GET_CODE (y) == SIGN_EXTEND)
11435 return CC_Vmode;
11437 /* For everything else, return CCmode. */
11438 return CCmode;
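/* For instance (illustrative), an equality test of a sum against zero such
   as "if (a + b == 0)" on DImode operands selects CC_NZmode above, which
   lets the comparison be implemented as

     cmn  x0, x1
     beq  .L1

   instead of computing the sum into a scratch register first.  */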
11441 static int
11442 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11444 int
11445 aarch64_get_condition_code (rtx x)
11447 machine_mode mode = GET_MODE (XEXP (x, 0));
11448 enum rtx_code comp_code = GET_CODE (x);
11450 if (GET_MODE_CLASS (mode) != MODE_CC)
11451 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11452 return aarch64_get_condition_code_1 (mode, comp_code);
11455 static int
11456 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11458 switch (mode)
11460 case E_CCFPmode:
11461 case E_CCFPEmode:
11462 switch (comp_code)
11464 case GE: return AARCH64_GE;
11465 case GT: return AARCH64_GT;
11466 case LE: return AARCH64_LS;
11467 case LT: return AARCH64_MI;
11468 case NE: return AARCH64_NE;
11469 case EQ: return AARCH64_EQ;
11470 case ORDERED: return AARCH64_VC;
11471 case UNORDERED: return AARCH64_VS;
11472 case UNLT: return AARCH64_LT;
11473 case UNLE: return AARCH64_LE;
11474 case UNGT: return AARCH64_HI;
11475 case UNGE: return AARCH64_PL;
11476 default: return -1;
11478 break;
11480 case E_CCmode:
11481 switch (comp_code)
11483 case NE: return AARCH64_NE;
11484 case EQ: return AARCH64_EQ;
11485 case GE: return AARCH64_GE;
11486 case GT: return AARCH64_GT;
11487 case LE: return AARCH64_LE;
11488 case LT: return AARCH64_LT;
11489 case GEU: return AARCH64_CS;
11490 case GTU: return AARCH64_HI;
11491 case LEU: return AARCH64_LS;
11492 case LTU: return AARCH64_CC;
11493 default: return -1;
11495 break;
11497 case E_CC_SWPmode:
11498 switch (comp_code)
11500 case NE: return AARCH64_NE;
11501 case EQ: return AARCH64_EQ;
11502 case GE: return AARCH64_LE;
11503 case GT: return AARCH64_LT;
11504 case LE: return AARCH64_GE;
11505 case LT: return AARCH64_GT;
11506 case GEU: return AARCH64_LS;
11507 case GTU: return AARCH64_CC;
11508 case LEU: return AARCH64_CS;
11509 case LTU: return AARCH64_HI;
11510 default: return -1;
11512 break;
11514 case E_CC_NZCmode:
11515 switch (comp_code)
11517 case NE: return AARCH64_NE; /* = any */
11518 case EQ: return AARCH64_EQ; /* = none */
11519 case GE: return AARCH64_PL; /* = nfrst */
11520 case LT: return AARCH64_MI; /* = first */
11521 case GEU: return AARCH64_CS; /* = nlast */
11522 case GTU: return AARCH64_HI; /* = pmore */
11523 case LEU: return AARCH64_LS; /* = plast */
11524 case LTU: return AARCH64_CC; /* = last */
11525 default: return -1;
11527 break;
11529 case E_CC_NZVmode:
11530 switch (comp_code)
11532 case NE: return AARCH64_NE;
11533 case EQ: return AARCH64_EQ;
11534 case GE: return AARCH64_PL;
11535 case LT: return AARCH64_MI;
11536 case GT: return AARCH64_GT;
11537 case LE: return AARCH64_LE;
11538 default: return -1;
11540 break;
11542 case E_CC_NZmode:
11543 switch (comp_code)
11545 case NE: return AARCH64_NE;
11546 case EQ: return AARCH64_EQ;
11547 case GE: return AARCH64_PL;
11548 case LT: return AARCH64_MI;
11549 default: return -1;
11551 break;
11553 case E_CC_Zmode:
11554 switch (comp_code)
11556 case NE: return AARCH64_NE;
11557 case EQ: return AARCH64_EQ;
11558 default: return -1;
11560 break;
11562 case E_CC_Cmode:
11563 switch (comp_code)
11565 case LTU: return AARCH64_CS;
11566 case GEU: return AARCH64_CC;
11567 default: return -1;
11569 break;
11571 case E_CC_ADCmode:
11572 switch (comp_code)
11574 case GEU: return AARCH64_CS;
11575 case LTU: return AARCH64_CC;
11576 default: return -1;
11578 break;
11580 case E_CC_Vmode:
11581 switch (comp_code)
11583 case NE: return AARCH64_VS;
11584 case EQ: return AARCH64_VC;
11585 default: return -1;
11587 break;
11589 default:
11590 return -1;
11593 return -1;
11596 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
11597 duplicate of such constants. If so, store in RET_WI the wide_int
11598 representation of the constant paired with the inner mode of the vector mode
11599 or MODE for scalar X constants. If MODE is not provided then TImode is
11600 used. */
11602 static bool
11603 aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
11604 scalar_mode mode = TImode)
11606 rtx elt = unwrap_const_vec_duplicate (x);
11607 if (!CONST_SCALAR_INT_P (elt))
11608 return false;
11609 scalar_mode smode
11610 = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
11611 *ret_wi = rtx_mode_t (elt, smode);
11612 return true;
11615 /* Return true if X is a scalar or a constant vector of integer
11616 immediates that represent the rounding constant used in the fixed-point
11617 arithmetic instructions.
11618 The accepted form of the constant is (1 << (C - 1)) where C is in the range
11619 [1, MODE_WIDTH/2]. */
11621 bool
11622 aarch64_rnd_imm_p (rtx x)
11624 wide_int rnd_cst;
11625 if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
11626 return false;
11627 int log2 = wi::exact_log2 (rnd_cst);
11628 if (log2 < 0)
11629 return false;
11630 return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
11633 /* Return true if RND is a constant vector of integer rounding constants
11634 corresponding to a constant vector of shifts, SHIFT.
11635 The relationship should be RND == (1 << (SHIFT - 1)). */
11637 bool
11638 aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
11640 wide_int rnd_cst, shft_cst;
11641 if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
11642 || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
11643 return false;
11645 return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
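/* For example: a rounding right shift by 3 pairs with the rounding
   constant 1 << 2 == 4, i.e. the operation computes (x + 4) >> 3, which
   rounds the result instead of truncating it.  */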
11648 bool
11649 aarch64_const_vec_all_same_in_range_p (rtx x,
11650 HOST_WIDE_INT minval,
11651 HOST_WIDE_INT maxval)
11653 rtx elt;
11654 return (const_vec_duplicate_p (x, &elt)
11655 && CONST_INT_P (elt)
11656 && IN_RANGE (INTVAL (elt), minval, maxval));
11659 /* Some constants can't be made using normal mov instructions in Advanced SIMD
11660 but we can still create them in various ways. If the constant in VAL can be
11661 created using such an alternative method, return true and additionally
11662 set TARGET to the rtx for the sequence if TARGET is not NULL.
11663 Otherwise return false. */
11665 bool
11666 aarch64_maybe_generate_simd_constant (rtx target, rtx val, machine_mode mode)
11668 wide_int wval;
11669 auto smode = GET_MODE_INNER (mode);
11670 if (!aarch64_extract_vec_duplicate_wide_int (val, &wval, smode))
11671 return false;
11673 /* For Advanced SIMD we can create an integer with only the top bit set
11674 using fneg (0.0f). */
11675 if (TARGET_SIMD
11676 && !TARGET_SVE
11677 && smode == DImode
11678 && wi::only_sign_bit_p (wval))
11680 if (!target)
11681 return true;
11683 /* Use the same base type as aarch64_gen_shareable_zero. */
11684 rtx zero = CONST0_RTX (V4SImode);
11685 emit_move_insn (lowpart_subreg (V4SImode, target, mode), zero);
11686 rtx neg = lowpart_subreg (V2DFmode, target, mode);
11687 emit_insn (gen_negv2df2 (neg, copy_rtx (neg)));
11688 return true;
11691 return false;
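/* The special sequence above builds the DImode value with only the sign
   bit set (0x8000000000000000) roughly as (illustrative):

     movi v0.4s, #0        // zero the vector register
     fneg v0.2d, v0.2d     // negating +0.0 sets only bit 63 of each lane  */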
11694 /* Check if the value in VAL with mode MODE can be created using special
11695 instruction sequences. */
11697 bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
11699 return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
11702 bool
11703 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11705 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11708 /* Return true if VEC is a constant in which every element is in the range
11709 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11711 static bool
11712 aarch64_const_vec_all_in_range_p (rtx vec,
11713 HOST_WIDE_INT minval,
11714 HOST_WIDE_INT maxval)
11716 if (!CONST_VECTOR_P (vec)
11717 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11718 return false;
11720 int nunits;
11721 if (!CONST_VECTOR_STEPPED_P (vec))
11722 nunits = const_vector_encoded_nelts (vec);
11723 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11724 return false;
11726 for (int i = 0; i < nunits; i++)
11728 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11729 if (!CONST_INT_P (vec_elem)
11730 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11731 return false;
11733 return true;
11736 /* N Z C V. */
11737 #define AARCH64_CC_V 1
11738 #define AARCH64_CC_C (1 << 1)
11739 #define AARCH64_CC_Z (1 << 2)
11740 #define AARCH64_CC_N (1 << 3)
11742 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11743 static const int aarch64_nzcv_codes[] =
11745 0, /* EQ, Z == 1. */
11746 AARCH64_CC_Z, /* NE, Z == 0. */
11747 0, /* CS, C == 1. */
11748 AARCH64_CC_C, /* CC, C == 0. */
11749 0, /* MI, N == 1. */
11750 AARCH64_CC_N, /* PL, N == 0. */
11751 0, /* VS, V == 1. */
11752 AARCH64_CC_V, /* VC, V == 0. */
11753 0, /* HI, C == 1 && Z == 0. */
11754 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11755 AARCH64_CC_V, /* GE, N == V. */
11756 0, /* LT, N != V. */
11757 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11758 0, /* LE, !(Z == 0 && N == V). */
11759 0, /* AL, Any. */
11760 0 /* NV, Any. */
11763 /* Print floating-point vector immediate operand X to F, negating it
11764 first if NEGATE is true. Return true on success, false if it isn't
11765 a constant we can handle. */
11767 static bool
11768 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11770 rtx elt;
11772 if (!const_vec_duplicate_p (x, &elt))
11773 return false;
11775 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11776 if (negate)
11777 r = real_value_negate (&r);
11779 /* Handle the SVE single-bit immediates specially, since they have a
11780 fixed form in the assembly syntax. */
11781 if (real_equal (&r, &dconst0))
11782 asm_fprintf (f, "0.0");
11783 else if (real_equal (&r, &dconst2))
11784 asm_fprintf (f, "2.0");
11785 else if (real_equal (&r, &dconst1))
11786 asm_fprintf (f, "1.0");
11787 else if (real_equal (&r, &dconsthalf))
11788 asm_fprintf (f, "0.5");
11789 else
11791 const int buf_size = 20;
11792 char float_buf[buf_size] = {'\0'};
11793 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11794 1, GET_MODE (elt));
11795 asm_fprintf (f, "%s", float_buf);
11798 return true;
11801 /* Return the equivalent letter for size. */
11802 static char
11803 sizetochar (int size)
11805 switch (size)
11807 case 64: return 'd';
11808 case 32: return 's';
11809 case 16: return 'h';
11810 case 8 : return 'b';
11811 default: gcc_unreachable ();
11815 /* Print operand X to file F in a target specific manner according to CODE.
11816 The acceptable formatting commands given by CODE are:
11817 'c': An integer or symbol address without a preceding #
11818 sign.
11819 'C': Take the duplicated element in a vector constant
11820 and print it in hex.
11821 'D': Take the duplicated element in a vector constant
11822 and print it as an unsigned integer, in decimal.
11823 'e': Print the sign/zero-extend size as a character 8->b,
11824 16->h, 32->w. Can also be used for masks:
11825 0xff->b, 0xffff->h, 0xffffffff->w.
11826 'I': If the operand is a duplicated vector constant,
11827 replace it with the duplicated scalar. If the
11828 operand is then a floating-point constant, replace
11829 it with the integer bit representation. Print the
11830 transformed constant as a signed decimal number.
11831 'p': Prints N such that 2^N == X (X must be a power of 2 and
11832 a const_int).
11833 'P': Print the number of non-zero bits in X (a const_int).
11834 'H': Print the higher numbered register of a pair (TImode)
11835 of regs.
11836 'm': Print a condition (eq, ne, etc).
11837 'M': Same as 'm', but invert condition.
11838 'N': Take the duplicated element in a vector constant
11839 and print the negative of it in decimal.
11840 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11841 'Z': Same for SVE registers. ('z' was already taken.)
11842 Note that it is not necessary to use %Z for operands
11843 that have SVE modes. The convention is to use %Z
11844 only for non-SVE (or potentially non-SVE) modes.
11845 'S/T/U/V': Print a FP/SIMD register name for a register list.
11846 The register printed is the FP/SIMD register name
11847 of X + 0/1/2/3 for S/T/U/V.
11848 'R': Print a scalar Integer/FP/SIMD register name + 1.
11849 'X': Print bottom 16 bits of integer constant in hex.
11850 'w/x': Print a general register name or the zero register
11851 (32-bit or 64-bit).
11852 '0': Print a normal operand, if it's a general register,
11853 then we assume DImode.
11854 'k': Print NZCV for conditional compare instructions.
11855 'K': Print a predicate register as pn<N> rather than p<N>.
11856 'A': Output address constant representing the first
11857 argument of X, specifying a relocation offset
11858 if appropriate.
11859 'L': Output constant address specified by X
11860 with a relocation offset if appropriate.
11861 'G': Prints address of X, specifying a PC relative
11862 relocation mode if appropriate.
11863 'y': Output address of LDP or STP - this is used for
11864 some LDP/STPs which don't use a PARALLEL in their
11865 pattern (so the mode needs to be adjusted).
11866 'z': Output address of a typical LDP or STP. */
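/* Illustration (not exhaustive): with operand 0 in general register x3,
   "%w0" in an output template prints "w3", "%x0" prints "x3", and a plain
   "%0" also prints "x3", since general registers are printed with their
   DImode names by default.  */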
11868 static void
11869 aarch64_print_operand (FILE *f, rtx x, int code)
11871 rtx elt;
11872 switch (code)
11874 case 'c':
11875 if (CONST_INT_P (x))
11876 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11877 else
11879 poly_int64 offset;
11880 rtx base = strip_offset_and_salt (x, &offset);
11881 if (SYMBOL_REF_P (base))
11882 output_addr_const (f, x);
11883 else
11884 output_operand_lossage ("unsupported operand for code '%c'", code);
11886 break;
11888 case 'e':
11890 x = unwrap_const_vec_duplicate (x);
11891 if (!CONST_INT_P (x))
11893 output_operand_lossage ("invalid operand for '%%%c'", code);
11894 return;
11897 HOST_WIDE_INT val = INTVAL (x);
11898 if ((val & ~7) == 8 || val == 0xff)
11899 fputc ('b', f);
11900 else if ((val & ~7) == 16 || val == 0xffff)
11901 fputc ('h', f);
11902 else if ((val & ~7) == 32 || val == 0xffffffff)
11903 fputc ('w', f);
11904 else
11906 output_operand_lossage ("invalid operand for '%%%c'", code);
11907 return;
11910 break;
11912 case 'p':
11914 int n;
11916 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11918 output_operand_lossage ("invalid operand for '%%%c'", code);
11919 return;
11922 asm_fprintf (f, "%d", n);
11924 break;
11926 case 'P':
11927 if (!CONST_INT_P (x))
11929 output_operand_lossage ("invalid operand for '%%%c'", code);
11930 return;
11933 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11934 break;
11936 case 'H':
11937 if (x == const0_rtx)
11939 asm_fprintf (f, "xzr");
11940 break;
11943 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11945 output_operand_lossage ("invalid operand for '%%%c'", code);
11946 return;
11949 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11950 break;
11952 case 'I':
11954 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11955 if (CONST_INT_P (x))
11956 asm_fprintf (f, "%wd", INTVAL (x));
11957 else
11959 output_operand_lossage ("invalid operand for '%%%c'", code);
11960 return;
11962 break;
11965 case 'M':
11966 case 'm':
11968 int cond_code;
11969 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11970 if (x == const_true_rtx)
11972 if (code == 'M')
11973 fputs ("nv", f);
11974 return;
11977 if (!COMPARISON_P (x))
11979 output_operand_lossage ("invalid operand for '%%%c'", code);
11980 return;
11983 cond_code = aarch64_get_condition_code (x);
11984 gcc_assert (cond_code >= 0);
11985 if (code == 'M')
11986 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11987 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11988 fputs (aarch64_sve_condition_codes[cond_code], f);
11989 else
11990 fputs (aarch64_condition_codes[cond_code], f);
11992 break;
11994 case 'N':
11995 if (!const_vec_duplicate_p (x, &elt))
11997 output_operand_lossage ("invalid vector constant");
11998 return;
12001 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12002 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12003 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12004 && aarch64_print_vector_float_operand (f, x, true))
12006 else
12008 output_operand_lossage ("invalid vector constant");
12009 return;
12011 break;
12013 case 'b':
12014 case 'h':
12015 case 's':
12016 case 'd':
12017 case 'q':
12018 case 'Z':
12019 code = TOLOWER (code);
12020 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12022 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12023 return;
12025 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12026 break;
12028 case 'S':
12029 case 'T':
12030 case 'U':
12031 case 'V':
12032 if (!REG_P (x) || (!FP_REGNUM_P (REGNO (x)) && !PR_REGNUM_P (REGNO (x))))
12034 output_operand_lossage ("incompatible operand for '%%%c'", code);
12035 return;
12037 if (PR_REGNUM_P (REGNO (x)))
12038 asm_fprintf (f, "p%d", REGNO (x) - P0_REGNUM + (code - 'S'));
12039 else
12040 asm_fprintf (f, "%c%d",
12041 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12042 REGNO (x) - V0_REGNUM + (code - 'S'));
12043 break;
12045 case 'R':
12046 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12047 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12048 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12049 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12050 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12051 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12052 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12053 else
12054 output_operand_lossage ("incompatible register operand for '%%%c'",
12055 code);
12056 break;
12058 case 'X':
12059 if (!CONST_INT_P (x))
12061 output_operand_lossage ("invalid operand for '%%%c'", code);
12062 return;
12064 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12065 break;
12067 case 'C':
12069 /* Print a replicated constant in hex. */
12070 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12072 output_operand_lossage ("invalid operand for '%%%c'", code);
12073 return;
12075 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12076 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12078 break;
12080 case 'D':
12082 /* Print a replicated constant in decimal, treating it as
12083 unsigned. */
12084 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12086 output_operand_lossage ("invalid operand for '%%%c'", code);
12087 return;
12089 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12090 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12092 break;
12094 case 'w':
12095 case 'x':
12096 if (aarch64_const_zero_rtx_p (x))
12098 asm_fprintf (f, "%czr", code);
12099 break;
12102 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12104 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12105 break;
12108 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12110 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12111 break;
12114 /* Fall through */
12116 case 0:
12117 if (x == NULL)
12119 output_operand_lossage ("missing operand");
12120 return;
12123 switch (GET_CODE (x))
12125 case CONST_STRING:
12127 asm_fprintf (f, "%s", XSTR (x, 0));
12128 break;
12130 case REG:
12131 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12133 if (REG_NREGS (x) == 1)
12134 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12135 else
12137 char suffix
12138 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12139 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12140 REGNO (x) - V0_REGNUM, suffix,
12141 END_REGNO (x) - V0_REGNUM - 1, suffix);
12144 else
12145 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12146 break;
12148 case MEM:
12149 output_address (GET_MODE (x), XEXP (x, 0));
12150 break;
12152 case LABEL_REF:
12153 case SYMBOL_REF:
12154 output_addr_const (asm_out_file, x);
12155 break;
12157 case CONST_INT:
12158 asm_fprintf (f, "%wd", INTVAL (x));
12159 break;
12161 case CONST:
12162 if (!VECTOR_MODE_P (GET_MODE (x)))
12164 output_addr_const (asm_out_file, x);
12165 break;
12167 /* fall through */
12169 case CONST_VECTOR:
12170 if (!const_vec_duplicate_p (x, &elt))
12172 output_operand_lossage ("invalid vector constant");
12173 return;
12176 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12177 asm_fprintf (f, "%wd", INTVAL (elt));
12178 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12179 && aarch64_print_vector_float_operand (f, x, false))
12181 else
12183 output_operand_lossage ("invalid vector constant");
12184 return;
12186 break;
12188 case CONST_DOUBLE:
12189 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12190 be getting CONST_DOUBLEs holding integers. */
12191 gcc_assert (GET_MODE (x) != VOIDmode);
12192 if (aarch64_float_const_zero_rtx_p (x))
12194 fputc ('0', f);
12195 break;
12197 else if (aarch64_float_const_representable_p (x))
12199 #define buf_size 20
12200 char float_buf[buf_size] = {'\0'};
12201 real_to_decimal_for_mode (float_buf,
12202 CONST_DOUBLE_REAL_VALUE (x),
12203 buf_size, buf_size,
12204 1, GET_MODE (x));
12205 asm_fprintf (asm_out_file, "%s", float_buf);
12206 break;
12207 #undef buf_size
12209 output_operand_lossage ("invalid constant");
12210 return;
12211 default:
12212 output_operand_lossage ("invalid operand");
12213 return;
12215 break;
12217 case 'A':
12218 if (GET_CODE (x) == HIGH)
12219 x = XEXP (x, 0);
12221 switch (aarch64_classify_symbolic_expression (x))
12223 case SYMBOL_SMALL_GOT_4G:
12224 asm_fprintf (asm_out_file, ":got:");
12225 break;
12227 case SYMBOL_SMALL_TLSGD:
12228 asm_fprintf (asm_out_file, ":tlsgd:");
12229 break;
12231 case SYMBOL_SMALL_TLSDESC:
12232 asm_fprintf (asm_out_file, ":tlsdesc:");
12233 break;
12235 case SYMBOL_SMALL_TLSIE:
12236 asm_fprintf (asm_out_file, ":gottprel:");
12237 break;
12239 case SYMBOL_TLSLE24:
12240 asm_fprintf (asm_out_file, ":tprel:");
12241 break;
12243 case SYMBOL_TINY_GOT:
12244 gcc_unreachable ();
12245 break;
12247 default:
12248 break;
12250 output_addr_const (asm_out_file, x);
12251 break;
12253 case 'L':
12254 switch (aarch64_classify_symbolic_expression (x))
12256 case SYMBOL_SMALL_GOT_4G:
12257 asm_fprintf (asm_out_file, ":got_lo12:");
12258 break;
12260 case SYMBOL_SMALL_TLSGD:
12261 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12262 break;
12264 case SYMBOL_SMALL_TLSDESC:
12265 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12266 break;
12268 case SYMBOL_SMALL_TLSIE:
12269 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12270 break;
12272 case SYMBOL_TLSLE12:
12273 asm_fprintf (asm_out_file, ":tprel_lo12:");
12274 break;
12276 case SYMBOL_TLSLE24:
12277 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12278 break;
12280 case SYMBOL_TINY_GOT:
12281 asm_fprintf (asm_out_file, ":got:");
12282 break;
12284 case SYMBOL_TINY_TLSIE:
12285 asm_fprintf (asm_out_file, ":gottprel:");
12286 break;
12288 default:
12289 break;
12291 output_addr_const (asm_out_file, x);
12292 break;
12294 case 'G':
12295 switch (aarch64_classify_symbolic_expression (x))
12297 case SYMBOL_TLSLE24:
12298 asm_fprintf (asm_out_file, ":tprel_hi12:");
12299 break;
12300 default:
12301 break;
12303 output_addr_const (asm_out_file, x);
12304 break;
12306 case 'k':
12308 HOST_WIDE_INT cond_code;
12310 if (!CONST_INT_P (x))
12312 output_operand_lossage ("invalid operand for '%%%c'", code);
12313 return;
12316 cond_code = INTVAL (x);
12317 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12318 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12320 break;
12322 case 'K':
12323 if (!REG_P (x) || !PR_REGNUM_P (REGNO (x)))
12325 output_operand_lossage ("invalid operand for '%%%c'", code);
12326 return;
12328 asm_fprintf (f, "pn%d", REGNO (x) - P0_REGNUM);
12329 break;
12331 case 'y':
12332 case 'z':
12334 machine_mode mode = GET_MODE (x);
12336 if (!MEM_P (x)
12337 || (code == 'y'
12338 && maybe_ne (GET_MODE_SIZE (mode), 8)
12339 && maybe_ne (GET_MODE_SIZE (mode), 16)
12340 && maybe_ne (GET_MODE_SIZE (mode), 32)))
12342 output_operand_lossage ("invalid operand for '%%%c'", code);
12343 return;
12346 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12347 code == 'y'
12348 ? ADDR_QUERY_LDP_STP_N
12349 : ADDR_QUERY_LDP_STP))
12350 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12352 break;
12354 default:
12355 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12356 return;
12360 /* Print address 'x' of a memory access with mode 'mode'.
12361 TYPE is the context required by aarch64_classify_address (e.g. ADDR_QUERY_M
12362 for a normal memory access, or ADDR_QUERY_LDP_STP{,_N} for LDP/STP). */
12363 static bool
12364 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12365 aarch64_addr_query_type type)
12367 struct aarch64_address_info addr;
12368 unsigned int size, vec_flags;
12370 /* Check all addresses are Pmode - including ILP32. */
12371 if (GET_MODE (x) != Pmode
12372 && (!CONST_INT_P (x)
12373 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12375 output_operand_lossage ("invalid address mode");
12376 return false;
12379 const bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
12380 || type == ADDR_QUERY_LDP_STP_N);
12382 if (aarch64_classify_address (&addr, x, mode, true, type))
12383 switch (addr.type)
12385 case ADDRESS_REG_IMM:
12386 if (known_eq (addr.const_offset, 0))
12388 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12389 return true;
12392 vec_flags = aarch64_classify_vector_mode (mode);
12393 if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
12395 HOST_WIDE_INT vnum
12396 = exact_div (addr.const_offset,
12397 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12398 asm_fprintf (f, "[%s, #%wd, mul vl]",
12399 reg_names[REGNO (addr.base)], vnum);
12400 return true;
12403 if (!CONST_INT_P (addr.offset))
12404 return false;
12406 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12407 INTVAL (addr.offset));
12408 return true;
12410 case ADDRESS_REG_REG:
12411 if (addr.shift == 0)
12412 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12413 reg_names [REGNO (addr.offset)]);
12414 else
12415 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12416 reg_names [REGNO (addr.offset)], addr.shift);
12417 return true;
12419 case ADDRESS_REG_UXTW:
12420 if (addr.shift == 0)
12421 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12422 REGNO (addr.offset) - R0_REGNUM);
12423 else
12424 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12425 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12426 return true;
12428 case ADDRESS_REG_SXTW:
12429 if (addr.shift == 0)
12430 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12431 REGNO (addr.offset) - R0_REGNUM);
12432 else
12433 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12434 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12435 return true;
12437 case ADDRESS_REG_WB:
12438 /* Writeback is only supported for fixed-width modes. */
12439 size = GET_MODE_SIZE (mode).to_constant ();
12440 switch (GET_CODE (x))
12442 case PRE_INC:
12443 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12444 return true;
12445 case POST_INC:
12446 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12447 return true;
12448 case PRE_DEC:
12449 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12450 return true;
12451 case POST_DEC:
12452 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12453 return true;
12454 case PRE_MODIFY:
12455 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12456 INTVAL (addr.offset));
12457 return true;
12458 case POST_MODIFY:
12459 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12460 INTVAL (addr.offset));
12461 return true;
12462 default:
12463 break;
12465 break;
12467 case ADDRESS_LO_SUM:
12468 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12469 output_addr_const (f, addr.offset);
12470 asm_fprintf (f, "]");
12471 return true;
12473 case ADDRESS_SYMBOLIC:
12474 output_addr_const (f, x);
12475 return true;
12478 return false;
12481 /* Print address 'x' of a memory access with mode 'mode'. */
12482 static void
12483 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12485 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12486 output_addr_const (f, x);
12489 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12491 static bool
12492 aarch64_output_addr_const_extra (FILE *file, rtx x)
12494 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12496 output_addr_const (file, XVECEXP (x, 0, 0));
12497 return true;
12499 return false;
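/* Return true if X mentions a label, i.e. if a LABEL_REF (other than the
   one inside a TLS UNSPEC) appears anywhere in X.  */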
12502 bool
12503 aarch64_label_mentioned_p (rtx x)
12505 const char *fmt;
12506 int i;
12508 if (LABEL_REF_P (x))
12509 return true;
12511 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12512 referencing instruction, but they are constant offsets, not
12513 symbols. */
12514 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12515 return false;
12517 fmt = GET_RTX_FORMAT (GET_CODE (x));
12518 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12520 if (fmt[i] == 'E')
12522 int j;
12524 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12525 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12526 return 1;
12528 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12529 return 1;
12532 return 0;
12535 /* Implement REGNO_REG_CLASS. */
12537 enum reg_class
12538 aarch64_regno_regclass (unsigned regno)
12540 if (W8_W11_REGNUM_P (regno))
12541 return W8_W11_REGS;
12543 if (W12_W15_REGNUM_P (regno))
12544 return W12_W15_REGS;
12546 if (STUB_REGNUM_P (regno))
12547 return STUB_REGS;
12549 if (GP_REGNUM_P (regno))
12550 return GENERAL_REGS;
12552 if (regno == SP_REGNUM)
12553 return STACK_REG;
12555 if (regno == FRAME_POINTER_REGNUM
12556 || regno == ARG_POINTER_REGNUM)
12557 return POINTER_REGS;
12559 if (FP_REGNUM_P (regno))
12560 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12561 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12563 if (PR_REGNUM_P (regno))
12564 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12566 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12567 return FFR_REGS;
12569 if (FAKE_REGNUM_P (regno))
12570 return FAKE_REGS;
12572 return NO_REGS;
12575 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12576 If OFFSET is out of range, return an offset of an anchor point
12577 that is in range. Return 0 otherwise. */
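/* For example, for a 4-byte access at offset 0x12340 (too big for an
   immediate), the anchor returned below is 0x12340 & ~0x3fff == 0x10000;
   the residual offset 0x2340 then fits the scaled 12-bit immediate form.  */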
12579 static HOST_WIDE_INT
12580 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12581 machine_mode mode)
12583 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12584 if (size > 16)
12585 return (offset + 0x400) & ~0x7f0;
12587 /* For offsets that aren't a multiple of the access size, the limit is
12588 -256...255. */
12589 if (offset & (size - 1))
12591 /* BLKmode typically uses LDP of X-registers. */
12592 if (mode == BLKmode)
12593 return (offset + 512) & ~0x3ff;
12594 return (offset + 0x100) & ~0x1ff;
12597 /* Small negative offsets are supported. */
12598 if (IN_RANGE (offset, -256, 0))
12599 return 0;
12601 if (mode == TImode || mode == TFmode || mode == TDmode)
12602 return (offset + 0x100) & ~0x1ff;
12604 /* Otherwise use a 12-bit immediate offset, scaled by the access size. */
12605 return offset & (~0xfff * size);
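/* Implement TARGET_LEGITIMIZE_ADDRESS.  */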
12608 static rtx
12609 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12611 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12612 where mask is selected by alignment and size of the offset.
12613 We try to pick as large a range for the offset as possible to
12614 maximize the chance of a CSE. However, for aligned addresses
12615 we limit the range to 4k so that structures with different sized
12616 elements are likely to use the same base. We need to be careful
12617 not to split a CONST for some forms of address expression, otherwise
12618 it will generate sub-optimal code. */
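/* For example, an SImode access at X + 100000 (0x186a0) is emitted as
   TMP = X + 0x18000 followed by [TMP, #0x6a0], so that other accesses
   near the same anchor can reuse TMP.  */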
12620 /* First split X + CONST (base, offset) into (base + X) + offset. */
12621 if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
12623 poly_int64 offset;
12624 rtx base = strip_offset (XEXP (x, 1), &offset);
12626 base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
12627 NULL_RTX, true, OPTAB_DIRECT);
12628 x = plus_constant (Pmode, base, offset);
12631 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12633 rtx base = XEXP (x, 0);
12634 rtx offset_rtx = XEXP (x, 1);
12635 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12637 if (GET_CODE (base) == PLUS)
12639 rtx op0 = XEXP (base, 0);
12640 rtx op1 = XEXP (base, 1);
12642 /* Force any scaling into a temp for CSE. */
12643 op0 = force_reg (Pmode, op0);
12644 op1 = force_reg (Pmode, op1);
12646 /* Let the pointer register be in op0. */
12647 if (REG_POINTER (op1))
12648 std::swap (op0, op1);
12650 /* If the pointer is virtual or frame related, then we know that
12651 virtual register instantiation or register elimination is going
12652 to apply a second constant. We want the two constants folded
12653 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
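/* For example, (virtual_stack_vars + T) + 16 is emitted as
   (virtual_stack_vars + 16) + T, so that the 16 folds with the
   constant added when the virtual register is instantiated.  */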
12654 if (virt_or_elim_regno_p (REGNO (op0)))
12656 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12657 NULL_RTX, true, OPTAB_DIRECT);
12658 return gen_rtx_PLUS (Pmode, base, op1);
12661 /* Otherwise, in order to encourage CSE (and thence loop strength
12662 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
12663 base = expand_binop (Pmode, add_optab, op0, op1,
12664 NULL_RTX, true, OPTAB_DIRECT);
12665 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12668 HOST_WIDE_INT size;
12669 if (GET_MODE_SIZE (mode).is_constant (&size))
12671 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12672 mode);
12673 if (base_offset != 0)
12675 base = plus_constant (Pmode, base, base_offset);
12676 base = force_operand (base, NULL_RTX);
12677 return plus_constant (Pmode, base, offset - base_offset);
12682 return x;
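/* Implement TARGET_SECONDARY_RELOAD.  */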
12685 static reg_class_t
12686 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12687 reg_class_t rclass,
12688 machine_mode mode,
12689 secondary_reload_info *sri)
12691 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12692 LDR and STR. See the comment at the head of aarch64-sve.md for
12693 more details about the big-endian handling. */
12694 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12695 if (reg_class_subset_p (rclass, FP_REGS)
12696 && !((REG_P (x) && HARD_REGISTER_P (x))
12697 || aarch64_simd_valid_immediate (x, NULL))
12698 && mode != VNx16QImode
12699 && (vec_flags & VEC_SVE_DATA)
12700 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12702 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12703 return NO_REGS;
12706 /* If we have to disable direct literal pool loads and stores because the
12707 function is too big, then we need a scratch register. */
12708 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12709 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12710 || targetm.vector_mode_supported_p (GET_MODE (x)))
12711 && !aarch64_pcrelative_literal_loads)
12713 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12714 return NO_REGS;
12717 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12718 Q register to a Q register directly. We need a scratch. */
12719 if (REG_P (x)
12720 && (mode == TFmode
12721 || mode == TImode
12722 || mode == TDmode
12723 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12724 && mode == GET_MODE (x)
12725 && !TARGET_SIMD
12726 && FP_REGNUM_P (REGNO (x))
12727 && reg_class_subset_p (rclass, FP_REGS))
12729 sri->icode = code_for_aarch64_reload_mov (mode);
12730 return NO_REGS;
12733 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12734 because AArch64 has richer addressing modes for LDR/STR instructions
12735 than LDP/STP instructions. */
12736 if (TARGET_FLOAT && rclass == GENERAL_REGS
12737 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12738 return FP_REGS;
12740 if (rclass == FP_REGS
12741 && (mode == TImode || mode == TFmode || mode == TDmode)
12742 && CONSTANT_P(x))
12743 return GENERAL_REGS;
12745 return NO_REGS;
12748 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12750 static bool
12751 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12752 reg_class_t class2)
12754 if (!TARGET_SIMD
12755 && reg_classes_intersect_p (class1, FP_REGS)
12756 && reg_classes_intersect_p (class2, FP_REGS))
12758 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12759 so we can't easily split a move involving tuples of 128-bit
12760 vectors. Force the copy through memory instead.
12762 (Tuples of 64-bit vectors are fine.) */
12763 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12764 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12765 return true;
12767 return false;
12770 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
12772 static bool
12773 aarch64_frame_pointer_required ()
12775 /* If the function needs to record the incoming value of PSTATE.SM,
12776 make sure that the slot is accessible from the frame pointer. */
12777 return aarch64_need_old_pstate_sm ();
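/* Implement TARGET_CAN_ELIMINATE.  */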
12780 static bool
12781 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12783 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12785 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12786 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12787 if (frame_pointer_needed)
12788 return to == HARD_FRAME_POINTER_REGNUM;
12789 return true;
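/* Implement INITIAL_ELIMINATION_OFFSET.  Return the offset between
   eliminable register FROM and its replacement register TO, given the
   current frame layout.  */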
12792 poly_int64
12793 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12795 aarch64_frame &frame = cfun->machine->frame;
12797 if (to == HARD_FRAME_POINTER_REGNUM)
12799 if (from == ARG_POINTER_REGNUM)
12800 return frame.bytes_above_hard_fp;
12802 if (from == FRAME_POINTER_REGNUM)
12803 return frame.bytes_above_hard_fp - frame.bytes_above_locals;
12806 if (to == STACK_POINTER_REGNUM)
12808 if (from == FRAME_POINTER_REGNUM)
12809 return frame.frame_size - frame.bytes_above_locals;
12812 return frame.frame_size;
12816 /* Get return address without mangling. */
12819 aarch64_return_addr_rtx (void)
12821 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12822 /* Note: aarch64_return_address_signing_enabled only
12823 works after cfun->machine->frame.laid_out is set,
12824 so here we don't know if the return address will
12825 be signed or not. */
12826 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12827 emit_move_insn (lr, val);
12828 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12829 return lr;
12833 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12834 previous frame. */
12837 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12839 if (count != 0)
12840 return const0_rtx;
12841 return aarch64_return_addr_rtx ();
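/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  */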
12844 static void
12845 aarch64_asm_trampoline_template (FILE *f)
12847 /* Even if the current function doesn't have branch protection, some
12848 later function might, so since this template is only generated once
12849 we have to add a BTI just in case. */
12850 asm_fprintf (f, "\thint\t34 // bti c\n");
12852 if (TARGET_ILP32)
12854 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12855 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12857 else
12859 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12860 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12862 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12864 /* We always emit a speculation barrier.
12865 This is because the same trampoline template is used for every nested
12866 function. Since nested functions are not particularly common or
12867 performant we don't worry too much about the extra instructions to copy
12868 around.
12869 This is not yet a problem, since we have not yet implemented function
12870 specific attributes to choose between hardening against straight line
12871 speculation or not, but such function specific attributes are likely to
12872 happen in the future. */
12873 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12875 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12876 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
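/* Implement TARGET_TRAMPOLINE_INIT.  */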
12879 static void
12880 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12882 rtx fnaddr, mem, a_tramp;
12883 const int tramp_code_sz = 24;
12885 /* We don't need to copy the trailing D-words; we fill those in below. */
12886 /* We create our own memory address in Pmode so that `emit_block_move` can
12887 use parts of the backend which expect Pmode addresses. */
12888 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12889 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12890 assemble_trampoline_template (),
12891 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12892 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12893 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12894 if (GET_MODE (fnaddr) != ptr_mode)
12895 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12896 emit_move_insn (mem, fnaddr);
12898 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12899 emit_move_insn (mem, chain_value);
12901 /* XXX We should really define a "clear_cache" pattern and use
12902 gen_clear_cache(). */
12903 a_tramp = XEXP (m_tramp, 0);
12904 maybe_emit_call_builtin___clear_cache (a_tramp,
12905 plus_constant (ptr_mode,
12906 a_tramp,
12907 TRAMPOLINE_SIZE));
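/* Implement TARGET_CLASS_MAX_NREGS.  */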
12910 static unsigned char
12911 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12913 /* ??? Logically we should only need to provide a value when
12914 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12915 can hold MODE, but at the moment we need to handle all modes.
12916 Just ignore any runtime parts for registers that can't store them. */
12917 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12918 unsigned int nregs, vec_flags;
12919 switch (regclass)
12921 case W8_W11_REGS:
12922 case W12_W15_REGS:
12923 case STUB_REGS:
12924 case TAILCALL_ADDR_REGS:
12925 case POINTER_REGS:
12926 case GENERAL_REGS:
12927 case ALL_REGS:
12928 case POINTER_AND_FP_REGS:
12929 case FP_REGS:
12930 case FP_LO_REGS:
12931 case FP_LO8_REGS:
12932 vec_flags = aarch64_classify_vector_mode (mode);
12933 if ((vec_flags & VEC_SVE_DATA)
12934 && constant_multiple_p (GET_MODE_SIZE (mode),
12935 aarch64_vl_bytes (mode, vec_flags), &nregs))
12936 return nregs;
12937 return (vec_flags & VEC_ADVSIMD
12938 ? CEIL (lowest_size, UNITS_PER_VREG)
12939 : CEIL (lowest_size, UNITS_PER_WORD));
12941 case PR_REGS:
12942 case PR_LO_REGS:
12943 case PR_HI_REGS:
12944 return mode == VNx32BImode ? 2 : 1;
12946 case STACK_REG:
12947 case FFR_REGS:
12948 case PR_AND_FFR_REGS:
12949 case FAKE_REGS:
12950 return 1;
12952 case NO_REGS:
12953 return 0;
12955 default:
12956 break;
12958 gcc_unreachable ();
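/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */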
12961 static reg_class_t
12962 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12964 if (regclass == POINTER_REGS)
12965 return GENERAL_REGS;
12967 if (regclass == STACK_REG)
12969 if (REG_P(x)
12970 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12971 return regclass;
12973 return NO_REGS;
12976 /* Register elimination can result in a request for
12977 SP+constant->FP_REGS. We cannot support such operations, which
12978 use SP as the source and an FP_REG as the destination, so reject
12979 them outright. */
12980 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12982 rtx lhs = XEXP (x, 0);
12984 /* Look through a possible SUBREG introduced by ILP32. */
12985 if (SUBREG_P (lhs))
12986 lhs = SUBREG_REG (lhs);
12988 gcc_assert (REG_P (lhs));
12989 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12990 POINTER_REGS));
12991 return NO_REGS;
12994 return regclass;
12997 void
12998 aarch64_asm_output_labelref (FILE* f, const char *name)
13000 asm_fprintf (f, "%U%s", name);
13003 static void
13004 aarch64_elf_asm_constructor (rtx symbol, int priority)
13006 if (priority == DEFAULT_INIT_PRIORITY)
13007 default_ctor_section_asm_out_constructor (symbol, priority);
13008 else
13010 section *s;
13011 /* Although the priority is known to be in the range [0, 65535], so
13012 that 18 bytes would be enough, the compiler might not know that. To
13013 avoid a -Wformat-truncation false positive, use a larger size. */
13014 char buf[23];
13015 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
13016 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13017 switch_to_section (s);
13018 assemble_align (POINTER_SIZE);
13019 assemble_aligned_integer (POINTER_BYTES, symbol);
13023 static void
13024 aarch64_elf_asm_destructor (rtx symbol, int priority)
13026 if (priority == DEFAULT_INIT_PRIORITY)
13027 default_dtor_section_asm_out_destructor (symbol, priority);
13028 else
13030 section *s;
13031 /* Although the priority is known to be in the range [0, 65535], so
13032 that 18 bytes would be enough, the compiler might not know that. To
13033 avoid a -Wformat-truncation false positive, use a larger size. */
13034 char buf[23];
13035 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
13036 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13037 switch_to_section (s);
13038 assemble_align (POINTER_SIZE);
13039 assemble_aligned_integer (POINTER_BYTES, symbol);
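/* Output the assembly for a switch-table dispatch sequence; this is used
   by the casesi expansion in aarch64.md.  */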
13043 const char*
13044 aarch64_output_casesi (rtx *operands)
13046 char buf[100];
13047 char label[100];
13048 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
13049 int index;
13050 static const char *const patterns[4][2] =
13053 "ldrb\t%w3, [%0,%w1,uxtw]",
13054 "add\t%3, %4, %w3, sxtb #2"
13057 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13058 "add\t%3, %4, %w3, sxth #2"
13061 "ldr\t%w3, [%0,%w1,uxtw #2]",
13062 "add\t%3, %4, %w3, sxtw #2"
13064 /* We assume that DImode is only generated when not optimizing and
13065 that we don't really need 64-bit address offsets. That would
13066 imply an object file with 8GB of code in a single function! */
13068 "ldr\t%w3, [%0,%w1,uxtw #2]",
13069 "add\t%3, %4, %w3, sxtw #2"
13073 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13075 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13076 index = exact_log2 (GET_MODE_SIZE (mode));
13078 gcc_assert (index >= 0 && index <= 3);
13080 /* Need to implement table size reduction, by changing the code below. */
13081 output_asm_insn (patterns[index][0], operands);
13082 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13083 snprintf (buf, sizeof (buf),
13084 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13085 output_asm_insn (buf, operands);
13086 output_asm_insn (patterns[index][1], operands);
13087 output_asm_insn ("br\t%3", operands);
13088 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13089 operands);
13090 assemble_label (asm_out_file, label);
13091 return "";
13094 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13095 operand is MASK. */
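/* For example, a MASK of 0x33 (ZA0.S and ZA1.S) produces
   "zero\t{ za0.s, za1.s }".  */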
13096 const char *
13097 aarch64_output_sme_zero_za (rtx mask)
13099 auto mask_val = UINTVAL (mask);
13100 if (mask_val == 0)
13101 return "zero\t{}";
13103 if (mask_val == 0xff)
13104 return "zero\t{ za }";
13106 static constexpr std::pair<unsigned int, char> tiles[] = {
13107 { 0xff, 'b' },
13108 { 0x55, 'h' },
13109 { 0x11, 's' },
13110 { 0x01, 'd' }
13112 /* The last entry in the list has the form "za7.d }", but that's the
13113 same length as "za7.d, ". */
13114 static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13115 unsigned int i = 0;
13116 i += snprintf (buffer + i, sizeof (buffer) - i, "zero\t");
13117 const char *prefix = "{ ";
13118 for (auto &tile : tiles)
13120 auto tile_mask = tile.first;
13121 unsigned int tile_index = 0;
13122 while (tile_mask < 0x100)
13124 if ((mask_val & tile_mask) == tile_mask)
13126 i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
13127 prefix, tile_index, tile.second);
13128 prefix = ", ";
13129 mask_val &= ~tile_mask;
13131 tile_mask <<= 1;
13132 tile_index += 1;
13135 gcc_assert (mask_val == 0 && i + 3 <= sizeof (buffer));
13136 snprintf (buffer + i, sizeof (buffer) - i, " }");
13137 return buffer;
13140 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13141 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13142 operator. */
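/* For example, aarch64_uxt_size (2, 0x3fc) is 8, since 0x3fc is 0xff
   shifted left by 2 (a UXTB combined with an LSL #2), while
   aarch64_uxt_size (0, 0xffff) is 16 (a plain UXTH).  */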
13145 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13147 if (shift >= 0 && shift <= 4)
13149 int size;
13150 for (size = 8; size <= 32; size *= 2)
13152 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13153 if (mask == bits << shift)
13154 return size;
13157 return 0;
13160 /* Constant pools are per-function only when PC-relative
13161 literal loads are enabled or we are in the large memory
13162 model. */
13164 static inline bool
13165 aarch64_can_use_per_function_literal_pools_p (void)
13167 return (aarch64_pcrelative_literal_loads
13168 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13171 static bool
13172 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13174 /* We can't use blocks for constants when we're using a per-function
13175 constant pool. */
13176 return !aarch64_can_use_per_function_literal_pools_p ();
13179 /* Select appropriate section for constants depending
13180 on where we place literal pools. */
13182 static section *
13183 aarch64_select_rtx_section (machine_mode mode,
13184 rtx x,
13185 unsigned HOST_WIDE_INT align)
13187 if (aarch64_can_use_per_function_literal_pools_p ())
13188 return function_section (current_function_decl);
13190 return default_elf_select_rtx_section (mode, x, align);
13193 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13194 void
13195 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13196 HOST_WIDE_INT offset)
13198 /* When using per-function literal pools, we must ensure that any code
13199 section is aligned to the minimal instruction length, lest we get
13200 errors from the assembler re "unaligned instructions". */
13201 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13202 ASM_OUTPUT_ALIGN (f, 2);
13205 /* Costs. */
13207 /* Helper function for rtx cost calculation. Strip a shift expression
13208 from X. Returns the inner operand if successful, or the original
13209 expression on failure. */
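/* For example, both (ashift X (const_int 3)) and (mult X (const_int 8))
   strip to X, since a multiply by a power of two is really a shift.  */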
13210 static rtx
13211 aarch64_strip_shift (rtx x)
13213 rtx op = x;
13215 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13216 we can convert both to ROR during final output. */
13217 if ((GET_CODE (op) == ASHIFT
13218 || GET_CODE (op) == ASHIFTRT
13219 || GET_CODE (op) == LSHIFTRT
13220 || GET_CODE (op) == ROTATERT
13221 || GET_CODE (op) == ROTATE)
13222 && CONST_INT_P (XEXP (op, 1)))
13223 return XEXP (op, 0);
13225 if (GET_CODE (op) == MULT
13226 && CONST_INT_P (XEXP (op, 1))
13227 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13228 return XEXP (op, 0);
13230 return x;
13233 /* Helper function for rtx cost calculation. Strip an extend
13234 expression from X. Returns the inner operand if successful, or the
13235 original expression on failure. We deal with a number of possible
13236 canonicalization variations here. If STRIP_SHIFT is true, then
13237 we can strip off a shift also. */
13238 static rtx
13239 aarch64_strip_extend (rtx x, bool strip_shift)
13241 scalar_int_mode mode;
13242 rtx op = x;
13244 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13245 return op;
13247 if (GET_CODE (op) == AND
13248 && GET_CODE (XEXP (op, 0)) == MULT
13249 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13250 && CONST_INT_P (XEXP (op, 1))
13251 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13252 INTVAL (XEXP (op, 1))) != 0)
13253 return XEXP (XEXP (op, 0), 0);
13255 /* Now handle extended register, as this may also have an optional
13256 left shift by 1..4. */
13257 if (strip_shift
13258 && GET_CODE (op) == ASHIFT
13259 && CONST_INT_P (XEXP (op, 1))
13260 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13261 op = XEXP (op, 0);
13263 if (GET_CODE (op) == ZERO_EXTEND
13264 || GET_CODE (op) == SIGN_EXTEND)
13265 op = XEXP (op, 0);
13267 if (op != x)
13268 return op;
13270 return x;
13273 /* Helper function for rtx cost calculation. Strip extension as well as any
13274 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13275 successful, or the original expression on failure. */
13276 static rtx
13277 aarch64_strip_extend_vec_half (rtx x)
13279 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13281 x = XEXP (x, 0);
13282 if (GET_CODE (x) == VEC_SELECT
13283 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13284 XEXP (x, 1)))
13285 x = XEXP (x, 0);
13287 return x;
13290 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13291 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13292 operand if successful, or the original expression on failure. */
13293 static rtx
13294 aarch64_strip_duplicate_vec_elt (rtx x)
13296 if (GET_CODE (x) == VEC_DUPLICATE
13297 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13299 x = XEXP (x, 0);
13300 if (GET_CODE (x) == VEC_SELECT)
13301 x = XEXP (x, 0);
13302 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13303 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13304 x = XEXP (XEXP (x, 0), 0);
13306 return x;
13309 /* Return true iff CODE is a shift supported in combination
13310 with arithmetic instructions. */
13312 static bool
13313 aarch64_shift_p (enum rtx_code code)
13315 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13319 /* Return true iff X is a cheap shift without a sign extend. */
13321 static bool
13322 aarch64_cheap_mult_shift_p (rtx x)
13324 rtx op0, op1;
13326 op0 = XEXP (x, 0);
13327 op1 = XEXP (x, 1);
13329 if (!(aarch64_tune_params.extra_tuning_flags
13330 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13331 return false;
13333 if (GET_CODE (op0) == SIGN_EXTEND)
13334 return false;
13336 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13337 && UINTVAL (op1) <= 4)
13338 return true;
13340 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13341 return false;
13343 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13345 if (l2 > 0 && l2 <= 4)
13346 return true;
13348 return false;
13351 /* Helper function for rtx cost calculation. Calculate the cost of
13352 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13353 Return the calculated cost of the expression, recursing manually in to
13354 operands where needed. */
13356 static int
13357 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13359 rtx op0, op1;
13360 const struct cpu_cost_table *extra_cost
13361 = aarch64_tune_params.insn_extra_cost;
13362 int cost = 0;
13363 bool compound_p = (outer == PLUS || outer == MINUS);
13364 machine_mode mode = GET_MODE (x);
13366 gcc_checking_assert (code == MULT);
13368 op0 = XEXP (x, 0);
13369 op1 = XEXP (x, 1);
13371 if (VECTOR_MODE_P (mode))
13373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13374 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13376 /* The select-operand-high-half versions of the instruction have the
13377 same cost as the three vector version - don't add the costs of the
13378 extension or selection into the costs of the multiply. */
13379 op0 = aarch64_strip_extend_vec_half (op0);
13380 op1 = aarch64_strip_extend_vec_half (op1);
13381 /* The by-element versions of the instruction have the same costs as
13382 the normal 3-vector version. We make an assumption that the input
13383 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13384 costing of a MUL by element pre RA is a bit optimistic. */
13385 op0 = aarch64_strip_duplicate_vec_elt (op0);
13386 op1 = aarch64_strip_duplicate_vec_elt (op1);
13388 cost += rtx_cost (op0, mode, MULT, 0, speed);
13389 cost += rtx_cost (op1, mode, MULT, 1, speed);
13390 if (speed)
13392 if (GET_CODE (x) == MULT)
13393 cost += extra_cost->vect.mult;
13394 /* This is to catch the SSRA costing currently flowing here. */
13395 else
13396 cost += extra_cost->vect.alu;
13398 return cost;
13401 /* Integer multiply/fma. */
13402 if (GET_MODE_CLASS (mode) == MODE_INT)
13404 /* The multiply will be canonicalized as a shift, cost it as such. */
13405 if (aarch64_shift_p (GET_CODE (x))
13406 || (CONST_INT_P (op1)
13407 && exact_log2 (INTVAL (op1)) > 0))
13409 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13410 || GET_CODE (op0) == SIGN_EXTEND;
13411 if (speed)
13413 if (compound_p)
13415 /* If the shift is considered cheap,
13416 then don't add any cost. */
13417 if (aarch64_cheap_mult_shift_p (x))
13419 else if (REG_P (op1))
13420 /* ARITH + shift-by-register. */
13421 cost += extra_cost->alu.arith_shift_reg;
13422 else if (is_extend)
13423 /* ARITH + extended register. We don't have a cost field
13424 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13425 cost += extra_cost->alu.extend_arith;
13426 else
13427 /* ARITH + shift-by-immediate. */
13428 cost += extra_cost->alu.arith_shift;
13430 else
13431 /* LSL (immediate). */
13432 cost += extra_cost->alu.shift;
13435 /* Strip extends as we will have costed them in the case above. */
13436 if (is_extend)
13437 op0 = aarch64_strip_extend (op0, true);
13439 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13441 return cost;
13444 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13445 compound and let the below cases handle it. After all, MNEG is a
13446 special-case alias of MSUB. */
13447 if (GET_CODE (op0) == NEG)
13449 op0 = XEXP (op0, 0);
13450 compound_p = true;
13453 /* Integer multiplies or FMAs have zero/sign extending variants. */
13454 if ((GET_CODE (op0) == ZERO_EXTEND
13455 && GET_CODE (op1) == ZERO_EXTEND)
13456 || (GET_CODE (op0) == SIGN_EXTEND
13457 && GET_CODE (op1) == SIGN_EXTEND))
13459 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13460 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13462 if (speed)
13464 if (compound_p)
13465 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13466 cost += extra_cost->mult[0].extend_add;
13467 else
13468 /* MUL/SMULL/UMULL. */
13469 cost += extra_cost->mult[0].extend;
13472 return cost;
13475 /* This is either an integer multiply or a MADD. In both cases
13476 we want to recurse and cost the operands. */
13477 cost += rtx_cost (op0, mode, MULT, 0, speed);
13478 cost += rtx_cost (op1, mode, MULT, 1, speed);
13480 if (speed)
13482 if (compound_p)
13483 /* MADD/MSUB. */
13484 cost += extra_cost->mult[mode == DImode].add;
13485 else
13486 /* MUL. */
13487 cost += extra_cost->mult[mode == DImode].simple;
13490 return cost;
13492 else
13494 if (speed)
13496 /* Floating-point FMA/FMUL can also support negations of the
13497 operands, unless the rounding mode is upward or downward in
13498 which case FNMUL is different than FMUL with operand negation. */
13499 bool neg0 = GET_CODE (op0) == NEG;
13500 bool neg1 = GET_CODE (op1) == NEG;
13501 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13503 if (neg0)
13504 op0 = XEXP (op0, 0);
13505 if (neg1)
13506 op1 = XEXP (op1, 0);
13509 if (compound_p)
13510 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13511 cost += extra_cost->fp[mode == DFmode].fma;
13512 else
13513 /* FMUL/FNMUL. */
13514 cost += extra_cost->fp[mode == DFmode].mult;
13517 cost += rtx_cost (op0, mode, MULT, 0, speed);
13518 cost += rtx_cost (op1, mode, MULT, 1, speed);
13519 return cost;
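/* Implement TARGET_ADDRESS_COST.  */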
13523 static int
13524 aarch64_address_cost (rtx x,
13525 machine_mode mode,
13526 addr_space_t as ATTRIBUTE_UNUSED,
13527 bool speed)
13529 enum rtx_code c = GET_CODE (x);
13530 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13531 struct aarch64_address_info info;
13532 int cost = 0;
13533 info.shift = 0;
13535 if (!aarch64_classify_address (&info, x, mode, false))
13537 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13539 /* This is a CONST or SYMBOL ref which will be split
13540 in a different way depending on the code model in use.
13541 Cost it through the generic infrastructure. */
13542 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13543 /* Divide through by the cost of one instruction to
13544 bring it to the same units as the address costs. */
13545 cost_symbol_ref /= COSTS_N_INSNS (1);
13546 /* The cost is then the cost of preparing the address,
13547 followed by an immediate (possibly 0) offset. */
13548 return cost_symbol_ref + addr_cost->imm_offset;
13550 else
13552 /* This is most likely a jump table from a case
13553 statement. */
13554 return addr_cost->register_offset;
13558 switch (info.type)
13560 case ADDRESS_LO_SUM:
13561 case ADDRESS_SYMBOLIC:
13562 case ADDRESS_REG_IMM:
13563 cost += addr_cost->imm_offset;
13564 break;
13566 case ADDRESS_REG_WB:
13567 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13568 cost += addr_cost->pre_modify;
13569 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13571 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13572 if (nvectors == 3)
13573 cost += addr_cost->post_modify_ld3_st3;
13574 else if (nvectors == 4)
13575 cost += addr_cost->post_modify_ld4_st4;
13576 else
13577 cost += addr_cost->post_modify;
13579 else
13580 gcc_unreachable ();
13582 break;
13584 case ADDRESS_REG_REG:
13585 cost += addr_cost->register_offset;
13586 break;
13588 case ADDRESS_REG_SXTW:
13589 cost += addr_cost->register_sextend;
13590 break;
13592 case ADDRESS_REG_UXTW:
13593 cost += addr_cost->register_zextend;
13594 break;
13596 default:
13597 gcc_unreachable ();
13601 if (info.shift > 0)
13603 /* For the sake of calculating the cost of the shifted register
13604 component, we can treat same sized modes in the same way. */
13605 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13606 cost += addr_cost->addr_scale_costs.hi;
13607 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13608 cost += addr_cost->addr_scale_costs.si;
13609 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13610 cost += addr_cost->addr_scale_costs.di;
13611 else
13612 /* We can't tell, or this is a 128-bit vector. */
13613 cost += addr_cost->addr_scale_costs.ti;
13616 return cost;
13619 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13620 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13621 to be taken. */
13624 aarch64_branch_cost (bool speed_p, bool predictable_p)
13626 /* When optimizing for speed, use the cost of unpredictable branches. */
13627 const struct cpu_branch_cost *branch_costs =
13628 aarch64_tune_params.branch_costs;
13630 if (!speed_p || predictable_p)
13631 return branch_costs->predictable;
13632 else
13633 return branch_costs->unpredictable;
13636 /* Return true if X is a zero or sign extract
13637 usable in an ADD or SUB (extended register) instruction. */
13638 static bool
13639 aarch64_rtx_arith_op_extract_p (rtx x)
13641 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13642 No shift. */
13643 if (GET_CODE (x) == SIGN_EXTEND
13644 || GET_CODE (x) == ZERO_EXTEND)
13645 return REG_P (XEXP (x, 0));
13647 return false;
13650 static bool
13651 aarch64_frint_unspec_p (unsigned int u)
13653 switch (u)
13655 case UNSPEC_FRINTZ:
13656 case UNSPEC_FRINTP:
13657 case UNSPEC_FRINTM:
13658 case UNSPEC_FRINTA:
13659 case UNSPEC_FRINTN:
13660 case UNSPEC_FRINTX:
13661 case UNSPEC_FRINTI:
13662 return true;
13664 default:
13665 return false;
13669 /* Return true iff X is an rtx that will match an extr instruction
13670 i.e. as described in the *extr<mode>5_insn family of patterns.
13671 OP0 and OP1 will be set to the operands of the shifts involved
13672 on success and will be NULL_RTX otherwise. */
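/* For example, in DImode (ior (ashift X (const_int 10))
   (lshiftrt Y (const_int 54))) matches, since 10 + 54 == 64;
   *RES_OP0 is set to X and *RES_OP1 to Y.  */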
13674 static bool
13675 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13677 rtx op0, op1;
13678 scalar_int_mode mode;
13679 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13680 return false;
13682 *res_op0 = NULL_RTX;
13683 *res_op1 = NULL_RTX;
13685 if (GET_CODE (x) != IOR)
13686 return false;
13688 op0 = XEXP (x, 0);
13689 op1 = XEXP (x, 1);
13691 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13692 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13694 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13695 if (GET_CODE (op1) == ASHIFT)
13696 std::swap (op0, op1);
13698 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13699 return false;
13701 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13702 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13704 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13705 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13707 *res_op0 = XEXP (op0, 0);
13708 *res_op1 = XEXP (op1, 0);
13709 return true;
13713 return false;
13716 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13717 storing it in *COST. Result is true if the total cost of the operation
13718 has now been calculated. */
13719 static bool
13720 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13722 rtx inner;
13723 rtx comparator;
13724 enum rtx_code cmpcode;
13725 const struct cpu_cost_table *extra_cost
13726 = aarch64_tune_params.insn_extra_cost;
13728 if (COMPARISON_P (op0))
13730 inner = XEXP (op0, 0);
13731 comparator = XEXP (op0, 1);
13732 cmpcode = GET_CODE (op0);
13734 else
13736 inner = op0;
13737 comparator = const0_rtx;
13738 cmpcode = NE;
13741 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13743 /* Conditional branch. */
13744 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13745 return true;
13746 else
13748 if (cmpcode == NE || cmpcode == EQ)
13750 if (comparator == const0_rtx)
13752 /* TBZ/TBNZ/CBZ/CBNZ. */
13753 if (GET_CODE (inner) == ZERO_EXTRACT)
13754 /* TBZ/TBNZ. */
13755 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13756 ZERO_EXTRACT, 0, speed);
13757 else
13758 /* CBZ/CBNZ. */
13759 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13761 return true;
13763 if (register_operand (inner, VOIDmode)
13764 && aarch64_imm24 (comparator, VOIDmode))
13766 /* SUB and SUBS. */
13767 *cost += COSTS_N_INSNS (2);
13768 if (speed)
13769 *cost += extra_cost->alu.arith * 2;
13770 return true;
13773 else if (cmpcode == LT || cmpcode == GE)
13775 /* TBZ/TBNZ. */
13776 if (comparator == const0_rtx)
13777 return true;
13781 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13783 /* CCMP. */
13784 if (GET_CODE (op1) == COMPARE)
13786 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13787 if (XEXP (op1, 1) == const0_rtx)
13788 *cost += 1;
13789 if (speed)
13791 machine_mode mode = GET_MODE (XEXP (op1, 0));
13793 if (GET_MODE_CLASS (mode) == MODE_INT)
13794 *cost += extra_cost->alu.arith;
13795 else
13796 *cost += extra_cost->fp[mode == DFmode].compare;
13798 return true;
13801 /* It's a conditional operation based on the status flags,
13802 so it must be some flavor of CSEL. */
13804 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13805 if (GET_CODE (op1) == NEG
13806 || GET_CODE (op1) == NOT
13807 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13808 op1 = XEXP (op1, 0);
13809 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13811 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13812 op1 = XEXP (op1, 0);
13813 op2 = XEXP (op2, 0);
13815 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13817 inner = XEXP (op1, 0);
13818 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13819 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13820 op1 = XEXP (inner, 0);
13822 else if (op1 == constm1_rtx || op1 == const1_rtx)
13824 /* Use CSINV or CSINC. */
13825 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13826 return true;
13828 else if (op2 == constm1_rtx || op2 == const1_rtx)
13830 /* Use CSINV or CSINC. */
13831 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13832 return true;
13835 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13836 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13837 return true;
13840 /* We don't know what this is, cost all operands. */
13841 return false;
13844 /* Check whether X is a bitfield operation of the form shift + extend that
13845 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13846 operand to which the bitfield operation is applied. Otherwise return
13847 NULL_RTX. */
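/* For example, (zero_extend:SI (lshiftrt:QI X (const_int 3))) is such a
   pattern (a UBFX of X), so X is returned.  */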
13849 static rtx
13850 aarch64_extend_bitfield_pattern_p (rtx x)
13852 rtx_code outer_code = GET_CODE (x);
13853 machine_mode outer_mode = GET_MODE (x);
13855 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13856 && outer_mode != SImode && outer_mode != DImode)
13857 return NULL_RTX;
13859 rtx inner = XEXP (x, 0);
13860 rtx_code inner_code = GET_CODE (inner);
13861 machine_mode inner_mode = GET_MODE (inner);
13862 rtx op = NULL_RTX;
13864 switch (inner_code)
13866 case ASHIFT:
13867 if (CONST_INT_P (XEXP (inner, 1))
13868 && (inner_mode == QImode || inner_mode == HImode))
13869 op = XEXP (inner, 0);
13870 break;
13871 case LSHIFTRT:
13872 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13873 && (inner_mode == QImode || inner_mode == HImode))
13874 op = XEXP (inner, 0);
13875 break;
13876 case ASHIFTRT:
13877 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13878 && (inner_mode == QImode || inner_mode == HImode))
13879 op = XEXP (inner, 0);
13880 break;
13881 default:
13882 break;
13885 return op;
13888 /* Return true if the mask and a shift amount from an RTX of the form
13889 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13890 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
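/* For example, in SImode a MASK of 0xff0 with a SHFT_AMNT of 4 is valid:
   (X << 4) & 0xff0 is UBFIZ Wd, Ws, #4, #8.  */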
13892 bool
13893 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13894 rtx shft_amnt)
13896 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13897 && INTVAL (mask) > 0
13898 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13899 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13900 && (UINTVAL (mask)
13901 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
13904 /* Return true if the masks and a shift amount from an RTX of the form
13905 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13906 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
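/* For example, in DImode MASK1 == 0xffffffff0000ffff, SHFT_AMNT == 16 and
   MASK2 == 0xffff0000 are valid: the pattern inserts 16 bits of y at bit
   position 16 of x.  */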
13908 bool
13909 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13910 unsigned HOST_WIDE_INT mask1,
13911 unsigned HOST_WIDE_INT shft_amnt,
13912 unsigned HOST_WIDE_INT mask2)
13914 unsigned HOST_WIDE_INT t;
13916 /* Verify that there is no overlap in what bits are set in the two masks. */
13917 if (mask1 != ~mask2)
13918 return false;
13920 /* Verify that mask2 is not all zeros or ones. */
13921 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13922 return false;
13924 /* The shift amount should always be less than the mode size. */
13925 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13927 /* Verify that the mask being shifted is contiguous and would be in the
13928 least significant bits after shifting by shft_amnt. */
13929 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13930 return (t == (t & -t));
13933 /* Return true if X is an RTX representing an operation in the ABD family
13934 of instructions. */
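/* For example, (minus (smax A B) (smin A B)) is an SABD and
   (minus (umax A B) (umin A B)) is a UABD, provided the max and min
   have matching operands.  */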
13936 static bool
13937 aarch64_abd_rtx_p (rtx x)
13939 if (GET_CODE (x) != MINUS)
13940 return false;
13941 rtx max_arm = XEXP (x, 0);
13942 rtx min_arm = XEXP (x, 1);
13943 if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
13944 return false;
13945 bool signed_p = GET_CODE (max_arm) == SMAX;
13946 if (signed_p && GET_CODE (min_arm) != SMIN)
13947 return false;
13948 else if (!signed_p && GET_CODE (min_arm) != UMIN)
13949 return false;
13951 rtx maxop0 = XEXP (max_arm, 0);
13952 rtx maxop1 = XEXP (max_arm, 1);
13953 rtx minop0 = XEXP (min_arm, 0);
13954 rtx minop1 = XEXP (min_arm, 1);
13955 return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
13958 /* Calculate the cost of calculating X, storing it in *COST. Result
13959 is true if the total cost of the operation has now been calculated. */
13960 static bool
13961 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13962 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13964 rtx op0, op1, op2;
13965 const struct cpu_cost_table *extra_cost
13966 = aarch64_tune_params.insn_extra_cost;
13967 rtx_code code = GET_CODE (x);
13968 scalar_int_mode int_mode;
13970 /* By default, assume that everything has equivalent cost to the
13971 cheapest instruction. Any additional costs are applied as a delta
13972 above this default. */
13973 *cost = COSTS_N_INSNS (1);
13975 switch (code)
13977 case SET:
13978 /* The cost depends entirely on the operands to SET. */
13979 *cost = 0;
13980 op0 = SET_DEST (x);
13981 op1 = SET_SRC (x);
13983 switch (GET_CODE (op0))
13985 case MEM:
13986 if (speed)
13988 rtx address = XEXP (op0, 0);
13989 if (VECTOR_MODE_P (mode))
13990 *cost += extra_cost->ldst.storev;
13991 else if (GET_MODE_CLASS (mode) == MODE_INT)
13992 *cost += extra_cost->ldst.store;
13993 else if (mode == SFmode || mode == SDmode)
13994 *cost += extra_cost->ldst.storef;
13995 else if (mode == DFmode || mode == DDmode)
13996 *cost += extra_cost->ldst.stored;
13998 *cost +=
13999 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14000 0, speed));
14003 *cost += rtx_cost (op1, mode, SET, 1, speed);
14004 return true;
14006 case SUBREG:
14007 if (! REG_P (SUBREG_REG (op0)))
14008 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
14010 /* Fall through. */
14011 case REG:
14012 /* The cost is one per vector-register copied. */
14013 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
14015 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
14016 *cost = COSTS_N_INSNS (nregs);
14018 /* const0_rtx is in general free, but we will use an
14019 instruction to set a register to 0. */
14020 else if (REG_P (op1) || op1 == const0_rtx)
14022 /* The cost is 1 per register copied. */
14023 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
14024 *cost = COSTS_N_INSNS (nregs);
14026 else
14027 /* Cost is just the cost of the RHS of the set. */
14028 *cost += rtx_cost (op1, mode, SET, 1, speed);
14029 return true;
14031 case ZERO_EXTRACT:
14032 case SIGN_EXTRACT:
14033 /* Bit-field insertion. Strip any redundant widening of
14034 the RHS to meet the width of the target. */
14035 if (SUBREG_P (op1))
14036 op1 = SUBREG_REG (op1);
14037 if ((GET_CODE (op1) == ZERO_EXTEND
14038 || GET_CODE (op1) == SIGN_EXTEND)
14039 && CONST_INT_P (XEXP (op0, 1))
14040 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
14041 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
14042 op1 = XEXP (op1, 0);
14044 if (CONST_INT_P (op1))
14046 /* MOV immediate is assumed to always be cheap. */
14047 *cost = COSTS_N_INSNS (1);
14049 else
14051 /* BFM. */
14052 if (speed)
14053 *cost += extra_cost->alu.bfi;
14054 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
14057 return true;
14059 default:
14060 /* We can't make sense of this, assume default cost. */
14061 *cost = COSTS_N_INSNS (1);
14062 return false;
14064 return false;
14066 case CONST_INT:
14067 /* If an instruction can incorporate a constant within the
14068 instruction, the instruction's expression avoids calling
14069 rtx_cost() on the constant. If rtx_cost() is called on a
14070 constant, then it is usually because the constant must be
14071 moved into a register by one or more instructions.
14073 The exception is constant 0, which can be expressed
14074 as XZR/WZR and is therefore free. The one case where it is not
14075 free is (set (reg) (const0_rtx)), in which case we must cost
14076 the move. However, we can catch that when we cost the SET, so
14077 we don't need to consider it here. */
14078 if (x == const0_rtx)
14079 *cost = 0;
14080 else
14082 /* To an approximation, building any other constant is
14083 proportionally expensive to the number of instructions
14084 required to build that constant. This is true whether we
14085 are compiling for SPEED or otherwise. */
14086 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
14087 ? SImode : DImode;
14088 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
14089 (NULL_RTX, x, false, imode));
14091 return true;
14093 case CONST_DOUBLE:
14095 /* First determine number of instructions to do the move
14096 as an integer constant. */
14097 if (!aarch64_float_const_representable_p (x)
14098 && !aarch64_can_const_movi_rtx_p (x, mode)
14099 && aarch64_float_const_rtx_p (x))
14101 unsigned HOST_WIDE_INT ival;
14102 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
14103 gcc_assert (succeed);
14105 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
14106 ? DImode : SImode;
14107 int ncost = aarch64_internal_mov_immediate
14108 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
14109 *cost += COSTS_N_INSNS (ncost);
14110 return true;
14113 if (speed)
14115 /* mov[df,sf]_aarch64. */
14116 if (aarch64_float_const_representable_p (x))
14117 /* FMOV (scalar immediate). */
14118 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
14119 else if (!aarch64_float_const_zero_rtx_p (x))
14121 /* This will be a load from memory. */
14122 if (mode == DFmode || mode == DDmode)
14123 *cost += extra_cost->ldst.loadd;
14124 else
14125 *cost += extra_cost->ldst.loadf;
14127 else
14128 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14129 or MOV v0.s[0], wzr - neither of which is modeled by the
14130 cost tables. Just use the default cost. */
14135 return true;
14137 case MEM:
14138 if (speed)
14140 /* For loads we want the base cost of a load, plus an
14141 approximation for the additional cost of the addressing
14142 mode. */
14143 rtx address = XEXP (x, 0);
14144 if (VECTOR_MODE_P (mode))
14145 *cost += extra_cost->ldst.loadv;
14146 else if (GET_MODE_CLASS (mode) == MODE_INT)
14147 *cost += extra_cost->ldst.load;
14148 else if (mode == SFmode || mode == SDmode)
14149 *cost += extra_cost->ldst.loadf;
14150 else if (mode == DFmode || mode == DDmode)
14151 *cost += extra_cost->ldst.loadd;
14153 *cost +=
14154 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14155 0, speed));
14158 return true;
14160 case NEG:
14161 op0 = XEXP (x, 0);
14163 if (VECTOR_MODE_P (mode))
14165 /* Many vector comparison operations are represented as NEG
14166 of a comparison. */
14167 if (COMPARISON_P (op0))
14169 rtx op00 = XEXP (op0, 0);
14170 rtx op01 = XEXP (op0, 1);
14171 machine_mode inner_mode = GET_MODE (op00);
14172 /* FACGE/FACGT. */
14173 if (GET_MODE_CLASS (inner_mode) == MODE_VECTOR_FLOAT
14174 && GET_CODE (op00) == ABS
14175 && GET_CODE (op01) == ABS)
14177 op00 = XEXP (op00, 0);
14178 op01 = XEXP (op01, 0);
14180 *cost += rtx_cost (op00, inner_mode, GET_CODE (op0), 0, speed);
14181 *cost += rtx_cost (op01, inner_mode, GET_CODE (op0), 1, speed);
14182 if (speed)
14183 *cost += extra_cost->vect.alu;
14184 return true;
14186 if (speed)
14188 /* FNEG. */
14189 *cost += extra_cost->vect.alu;
14191 return false;
14194 if (GET_MODE_CLASS (mode) == MODE_INT)
14196 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14197 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14199 /* CSETM. */
14200 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14201 return true;
14204 /* Cost this as SUB wzr, X. */
14205 op0 = CONST0_RTX (mode);
14206 op1 = XEXP (x, 0);
14207 goto cost_minus;
14210 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14212 /* Support (neg(fma...)) as a single instruction only if
14213 sign of zeros is unimportant. This matches the decision
14214 making in aarch64.md. */
14215 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14217 /* FNMADD. */
14218 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14219 return true;
14221 if (GET_CODE (op0) == MULT)
14223 /* FNMUL. */
14224 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14225 return true;
14227 if (speed)
14228 /* FNEG. */
14229 *cost += extra_cost->fp[mode == DFmode].neg;
14230 return false;
14233 return false;
14235 case CLRSB:
14236 case CLZ:
14237 if (speed)
14239 if (VECTOR_MODE_P (mode))
14240 *cost += extra_cost->vect.alu;
14241 else
14242 *cost += extra_cost->alu.clz;
14245 return false;
14247 case CTZ:
14248 *cost = COSTS_N_INSNS (2);
14250 if (speed)
14251 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14252 return false;
14254 case COMPARE:
14255 op0 = XEXP (x, 0);
14256 op1 = XEXP (x, 1);
14258 if (op1 == const0_rtx
14259 && GET_CODE (op0) == AND)
14261 x = op0;
14262 mode = GET_MODE (op0);
14263 goto cost_logic;
14266 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14268 /* TODO: A write to the CC flags possibly costs extra, this
14269 needs encoding in the cost tables. */
14271 mode = GET_MODE (op0);
14272 /* ANDS. */
14273 if (GET_CODE (op0) == AND)
14275 x = op0;
14276 goto cost_logic;
14279 if (GET_CODE (op0) == PLUS)
14281 /* ADDS (and CMN alias). */
14282 x = op0;
14283 goto cost_plus;
14286 if (GET_CODE (op0) == MINUS)
14288 /* SUBS. */
14289 x = op0;
14290 goto cost_minus;
14293 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14294 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14295 && CONST_INT_P (XEXP (op0, 2)))
14297 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14298 Handle it here directly rather than going to cost_logic
14299 since we know the immediate generated for the TST is valid
14300 so we can avoid creating an intermediate rtx for it only
14301 for costing purposes. */
14302 if (speed)
14303 *cost += extra_cost->alu.logical;
14305 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14306 ZERO_EXTRACT, 0, speed);
14307 return true;
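/* Illustration (made-up operands): a COMPARE such as
     (compare:CC_NZ (zero_extract:SI (reg:SI R) (const_int 4) (const_int 8))
                    (const_int 0))
   corresponds to "tst w0, #0xf00", i.e. a 4-bit field starting at bit 8,
   hence the single logical-instruction cost above.  */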
14310 if (GET_CODE (op1) == NEG)
14312 /* CMN. */
14313 if (speed)
14314 *cost += extra_cost->alu.arith;
14316 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14317 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14318 return true;
14321 /* CMP.
14323 Compare can freely swap the order of operands, and
14324 canonicalization puts the more complex operation first.
14325 But the integer MINUS logic expects the shift/extend
14326 operation in op1. */
14327 if (! (REG_P (op0)
14328 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14330 op0 = XEXP (x, 1);
14331 op1 = XEXP (x, 0);
14333 goto cost_minus;
14336 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14338 /* FCMP. */
14339 if (speed)
14340 *cost += extra_cost->fp[mode == DFmode].compare;
14342 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14344 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14345 /* FCMP supports constant 0.0 for no extra cost. */
14346 return true;
14348 return false;
14351 if (VECTOR_MODE_P (mode))
14353 /* Vector compare. */
14354 if (speed)
14355 *cost += extra_cost->vect.alu;
14357 if (aarch64_float_const_zero_rtx_p (op1))
14359 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14360 cost. */
14361 return true;
14363 return false;
14365 return false;
14367 case MINUS:
14369 op0 = XEXP (x, 0);
14370 op1 = XEXP (x, 1);
14372 cost_minus:
14373 if (VECTOR_MODE_P (mode))
14375 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14376 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14378 /* Recognise the SABD and UABD operation here.
14379 Recursion from the PLUS case will catch the accumulating
14380 forms. */
14381 if (aarch64_abd_rtx_p (x))
14383 if (speed)
14384 *cost += extra_cost->vect.alu;
14385 return true;
14387 /* SUBL2 and SUBW2.
14388 The select-operand-high-half versions of the sub instruction
14389 have the same cost as the regular three vector version -
14390 don't add the costs of the select into the costs of the sub.  */
14392 op0 = aarch64_strip_extend_vec_half (op0);
14393 op1 = aarch64_strip_extend_vec_half (op1);
14397 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14399 /* Detect valid immediates. */
14400 if ((GET_MODE_CLASS (mode) == MODE_INT
14401 || (GET_MODE_CLASS (mode) == MODE_CC
14402 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14403 && CONST_INT_P (op1)
14404 && aarch64_uimm12_shift (INTVAL (op1)))
14406 if (speed)
14407 /* SUB(S) (immediate). */
14408 *cost += extra_cost->alu.arith;
14409 return true;
14412 /* Look for SUB (extended register). */
14413 if (is_a <scalar_int_mode> (mode)
14414 && aarch64_rtx_arith_op_extract_p (op1))
14416 if (speed)
14417 *cost += extra_cost->alu.extend_arith;
14419 op1 = aarch64_strip_extend (op1, true);
14420 *cost += rtx_cost (op1, VOIDmode,
14421 (enum rtx_code) GET_CODE (op1), 0, speed);
14422 return true;
14425 rtx new_op1 = aarch64_strip_extend (op1, false);
14427 /* Cost this as an FMA-alike operation. */
14428 if ((GET_CODE (new_op1) == MULT
14429 || aarch64_shift_p (GET_CODE (new_op1)))
14430 && code != COMPARE)
14432 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14433 (enum rtx_code) code,
14434 speed);
14435 return true;
14438 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14440 if (speed)
14442 if (VECTOR_MODE_P (mode))
14444 /* Vector SUB. */
14445 *cost += extra_cost->vect.alu;
14447 else if (GET_MODE_CLASS (mode) == MODE_INT)
14449 /* SUB(S). */
14450 *cost += extra_cost->alu.arith;
14452 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14454 /* FSUB. */
14455 *cost += extra_cost->fp[mode == DFmode].addsub;
14458 return true;
14461 case PLUS:
14463 rtx new_op0;
14465 op0 = XEXP (x, 0);
14466 op1 = XEXP (x, 1);
14468 cost_plus:
14469 if (VECTOR_MODE_P (mode))
14471 /* ADDL2 and ADDW2. */
14472 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14473 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14475 /* The select-operand-high-half versions of the add instruction
14476 have the same cost as the regular three vector version -
14477 don't add the costs of the select into the costs of the add.  */
14479 op0 = aarch64_strip_extend_vec_half (op0);
14480 op1 = aarch64_strip_extend_vec_half (op1);
14484 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14485 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14487 /* CSINC. */
14488 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14489 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14490 return true;
14493 if (GET_MODE_CLASS (mode) == MODE_INT
14494 && (aarch64_plus_immediate (op1, mode)
14495 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14497 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14499 if (speed)
14501 /* ADD (immediate). */
14502 *cost += extra_cost->alu.arith;
14504 /* Some tunings prefer to not use the VL-based scalar ops.
14505 Increase the cost of the poly immediate to prevent their
14506 formation. */
14507 if (GET_CODE (op1) == CONST_POLY_INT
14508 && (aarch64_tune_params.extra_tuning_flags
14509 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14510 *cost += COSTS_N_INSNS (1);
14512 return true;
14515 if (aarch64_pluslong_immediate (op1, mode))
14517 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14518 if ((INTVAL (op1) & 0xfff) != 0)
14519 *cost += COSTS_N_INSNS (1);
14521 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14522 return true;
14525 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14527 /* Look for ADD (extended register). */
14528 if (is_a <scalar_int_mode> (mode)
14529 && aarch64_rtx_arith_op_extract_p (op0))
14531 if (speed)
14532 *cost += extra_cost->alu.extend_arith;
14534 op0 = aarch64_strip_extend (op0, true);
14535 *cost += rtx_cost (op0, VOIDmode,
14536 (enum rtx_code) GET_CODE (op0), 0, speed);
14537 return true;
14540 /* Strip any extend, leave shifts behind as we will
14541 cost them through mult_cost. */
14542 new_op0 = aarch64_strip_extend (op0, false);
14544 if (GET_CODE (new_op0) == MULT
14545 || aarch64_shift_p (GET_CODE (new_op0)))
14547 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14548 speed);
14549 return true;
14552 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14554 if (speed)
14556 if (VECTOR_MODE_P (mode))
14558 /* Vector ADD. */
14559 *cost += extra_cost->vect.alu;
14561 else if (GET_MODE_CLASS (mode) == MODE_INT)
14563 /* ADD. */
14564 *cost += extra_cost->alu.arith;
14566 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14568 /* FADD. */
14569 *cost += extra_cost->fp[mode == DFmode].addsub;
14572 return true;
14575 case BSWAP:
14576 *cost = COSTS_N_INSNS (1);
14578 if (speed)
14580 if (VECTOR_MODE_P (mode))
14581 *cost += extra_cost->vect.alu;
14582 else
14583 *cost += extra_cost->alu.rev;
14585 return false;
14587 case IOR:
14588 if (aarch_rev16_p (x))
14590 *cost = COSTS_N_INSNS (1);
14592 if (speed)
14594 if (VECTOR_MODE_P (mode))
14595 *cost += extra_cost->vect.alu;
14596 else
14597 *cost += extra_cost->alu.rev;
14599 return true;
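/* Illustration: aarch_rev16_p matches the usual byte-swap-within-halfwords
   idiom, roughly the C expression
     ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8)
   which maps to a single REV16 instruction.  */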
14602 if (aarch64_extr_rtx_p (x, &op0, &op1))
14604 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14605 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14606 if (speed)
14607 *cost += extra_cost->alu.shift;
14609 return true;
14611 /* Fall through. */
14612 case XOR:
14613 case AND:
14614 cost_logic:
14615 op0 = XEXP (x, 0);
14616 op1 = XEXP (x, 1);
14618 if (VECTOR_MODE_P (mode))
14620 if (speed)
14621 *cost += extra_cost->vect.alu;
14622 return true;
14625 if (code == AND
14626 && GET_CODE (op0) == MULT
14627 && CONST_INT_P (XEXP (op0, 1))
14628 && CONST_INT_P (op1)
14629 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14630 INTVAL (op1)) != 0)
14632 /* This is a UBFM/SBFM. */
14633 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14634 if (speed)
14635 *cost += extra_cost->alu.bfx;
14636 return true;
14639 if (is_int_mode (mode, &int_mode))
14641 if (CONST_INT_P (op1))
14643 /* We have a mask + shift version of a UBFIZ
14644 i.e. the *andim_ashift<mode>_bfiz pattern. */
14645 if (GET_CODE (op0) == ASHIFT
14646 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14647 XEXP (op0, 1)))
14649 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14650 (enum rtx_code) code, 0, speed);
14651 if (speed)
14652 *cost += extra_cost->alu.bfx;
14654 return true;
14656 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14658 /* We possibly get the immediate for free; this is not
14659 modelled. */
14660 *cost += rtx_cost (op0, int_mode,
14661 (enum rtx_code) code, 0, speed);
14662 if (speed)
14663 *cost += extra_cost->alu.logical;
14665 return true;
14668 else
14670 rtx new_op0 = op0;
14672 /* Handle ORN, EON, or BIC. */
14673 if (GET_CODE (op0) == NOT)
14674 op0 = XEXP (op0, 0);
14676 new_op0 = aarch64_strip_shift (op0);
14678 /* If we had a shift on op0 then this is a logical-shift-
14679 by-register/immediate operation. Otherwise, this is just
14680 a logical operation. */
14681 if (speed)
14683 if (new_op0 != op0)
14685 /* Shift by immediate. */
14686 if (CONST_INT_P (XEXP (op0, 1)))
14687 *cost += extra_cost->alu.log_shift;
14688 else
14689 *cost += extra_cost->alu.log_shift_reg;
14691 else
14692 *cost += extra_cost->alu.logical;
14695 /* In both cases we want to cost both operands. */
14696 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14697 0, speed);
14698 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14699 1, speed);
14701 return true;
14704 return false;
14706 case NOT:
14707 x = XEXP (x, 0);
14708 op0 = aarch64_strip_shift (x);
14710 if (VECTOR_MODE_P (mode))
14712 /* Vector NOT. */
14713 *cost += extra_cost->vect.alu;
14714 return false;
14717 /* MVN-shifted-reg. */
14718 if (op0 != x)
14720 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14722 if (speed)
14723 *cost += extra_cost->alu.log_shift;
14725 return true;
14727 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14728 Handle the second form here taking care that 'a' in the above can
14729 be a shift. */
14730 else if (GET_CODE (op0) == XOR)
14732 rtx newop0 = XEXP (op0, 0);
14733 rtx newop1 = XEXP (op0, 1);
14734 rtx op0_stripped = aarch64_strip_shift (newop0);
14736 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14737 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14739 if (speed)
14741 if (op0_stripped != newop0)
14742 *cost += extra_cost->alu.log_shift;
14743 else
14744 *cost += extra_cost->alu.logical;
14747 return true;
14749 /* MVN. */
14750 if (speed)
14751 *cost += extra_cost->alu.logical;
14753 return false;
14755 case ZERO_EXTEND:
14757 op0 = XEXP (x, 0);
14758 /* If a value is written in SI mode, then zero extended to DI
14759 mode, the operation will in general be free as a write to
14760 a 'w' register implicitly zeroes the upper bits of an 'x'
14761 register. However, if this is
14763 (set (reg) (zero_extend (reg)))
14765 we must cost the explicit register move. */
14766 if (mode == DImode
14767 && GET_MODE (op0) == SImode)
14769 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14771 /* If OP_COST is non-zero, then the cost of the zero extend
14772 is effectively the cost of the inner operation. Otherwise
14773 we have a MOV instruction and we take the cost from the MOV
14774 itself. This is true independently of whether we are
14775 optimizing for space or time. */
14776 if (op_cost)
14777 *cost = op_cost;
14779 return true;
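/* Illustration: (zero_extend:DI (plus:SI ...)) costs no more than the
   SImode addition itself, since e.g. "add w0, w1, w2" already zeroes
   bits 63:32 of x0.  A bare (set (reg:DI) (zero_extend:DI (reg:SI)))
   still needs an explicit "mov w0, w1", whose cost is retained as
   described above.  */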
14781 else if (MEM_P (op0))
14783 /* All loads can zero extend to any size for free. */
14784 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14785 return true;
14788 op0 = aarch64_extend_bitfield_pattern_p (x);
14789 if (op0)
14791 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14792 if (speed)
14793 *cost += extra_cost->alu.bfx;
14794 return true;
14797 if (speed)
14799 if (VECTOR_MODE_P (mode))
14801 /* UMOV. */
14802 *cost += extra_cost->vect.alu;
14804 else
14806 /* We generate an AND instead of UXTB/UXTH. */
14807 *cost += extra_cost->alu.logical;
14810 return false;
14812 case SIGN_EXTEND:
14813 if (MEM_P (XEXP (x, 0)))
14815 /* LDRSH. */
14816 if (speed)
14818 rtx address = XEXP (XEXP (x, 0), 0);
14819 *cost += extra_cost->ldst.load_sign_extend;
14821 *cost +=
14822 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14823 0, speed));
14825 return true;
14828 op0 = aarch64_extend_bitfield_pattern_p (x);
14829 if (op0)
14831 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14832 if (speed)
14833 *cost += extra_cost->alu.bfx;
14834 return true;
14837 if (speed)
14839 if (VECTOR_MODE_P (mode))
14840 *cost += extra_cost->vect.alu;
14841 else
14842 *cost += extra_cost->alu.extend;
14844 return false;
14846 case ROTATE:
14847 case ROTATERT:
14848 case LSHIFTRT:
14849 case ASHIFTRT:
14850 case ASHIFT:
14851 op0 = XEXP (x, 0);
14852 op1 = XEXP (x, 1);
14854 if (CONST_INT_P (op1))
14856 if (speed)
14858 if (VECTOR_MODE_P (mode))
14860 /* Vector shift (immediate). */
14861 *cost += extra_cost->vect.alu;
14863 else
14865 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
14866 These are all aliases. */
14867 *cost += extra_cost->alu.shift;
14871 /* We can incorporate zero/sign extend for free. */
14872 if (GET_CODE (op0) == ZERO_EXTEND
14873 || GET_CODE (op0) == SIGN_EXTEND)
14874 op0 = XEXP (op0, 0);
14876 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14877 return true;
14879 else
14881 if (VECTOR_MODE_P (mode))
14883 if (speed)
14884 /* Vector shift (register). */
14885 *cost += extra_cost->vect.alu;
14887 else
14889 if (speed)
14890 /* LSLV, ASRV. */
14891 *cost += extra_cost->alu.shift_reg;
14893 /* The register shift amount may be in a shorter mode expressed
14894 as a lowpart SUBREG. For costing purposes just look inside. */
14895 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
14896 op1 = SUBREG_REG (op1);
14897 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14898 && CONST_INT_P (XEXP (op1, 1))
14899 && known_eq (INTVAL (XEXP (op1, 1)),
14900 GET_MODE_BITSIZE (mode) - 1))
14902 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14903 /* We already demanded XEXP (op1, 0) to be REG_P, so
14904 don't recurse into it. */
14905 return true;
14908 return false; /* All arguments need to be in registers. */
14911 case SYMBOL_REF:
14913 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14914 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14916 /* LDR. */
14917 if (speed)
14918 *cost += extra_cost->ldst.load;
14920 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14921 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14923 /* ADRP, followed by ADD. */
14924 *cost += COSTS_N_INSNS (1);
14925 if (speed)
14926 *cost += 2 * extra_cost->alu.arith;
14928 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14929 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14931 /* ADR. */
14932 if (speed)
14933 *cost += extra_cost->alu.arith;
14936 if (flag_pic)
14938 /* One extra load instruction, after accessing the GOT. */
14939 *cost += COSTS_N_INSNS (1);
14940 if (speed)
14941 *cost += extra_cost->ldst.load;
14943 return true;
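/* Illustration for the small code model above (hypothetical symbol):

       adrp  x0, sym
       add   x0, x0, :lo12:sym

   The tiny model needs only a single ADR, while the large and small-PIC
   models load the address with an LDR, as the comments above note.  */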
14945 case HIGH:
14946 case LO_SUM:
14947 /* ADRP/ADD (immediate). */
14948 if (speed)
14949 *cost += extra_cost->alu.arith;
14950 return true;
14952 case ZERO_EXTRACT:
14953 case SIGN_EXTRACT:
14954 /* UBFX/SBFX. */
14955 if (speed)
14957 if (VECTOR_MODE_P (mode))
14958 *cost += extra_cost->vect.alu;
14959 else
14960 *cost += extra_cost->alu.bfx;
14963 /* We can trust that the immediates used will be correct (there
14964 are no by-register forms), so we need only cost op0. */
14965 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14966 return true;
14968 case MULT:
14969 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14970 /* aarch64_rtx_mult_cost always handles recursion to its
14971 operands. */
14972 return true;
14974 case MOD:
14975 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14976 ANDs and a CSNEG.  Assume here that CSNEG costs the same as
14977 an unconditional negate. This case should only ever be reached through
14978 the set_smod_pow2_cheap check in expmed.cc. */
14979 if (CONST_INT_P (XEXP (x, 1))
14980 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14981 && (mode == SImode || mode == DImode))
14983 /* We expand to 4 instructions. Reset the baseline. */
14984 *cost = COSTS_N_INSNS (4);
14986 if (speed)
14987 *cost += 2 * extra_cost->alu.logical
14988 + 2 * extra_cost->alu.arith;
14990 return true;
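/* Rough sketch of the expansion described above, for a signed x % 4:

       negs   w1, w0
       and    w0, w0, 3
       and    w1, w1, 3
       csneg  w0, w0, w1, mi

   i.e. two logical operations plus two arithmetic-class operations,
   matching the extra costs added for the speed case.  */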
14993 /* Fall-through. */
14994 case UMOD:
14995 if (speed)
14997 /* Slightly prefer UMOD over SMOD. */
14998 if (VECTOR_MODE_P (mode))
14999 *cost += extra_cost->vect.alu;
15000 else if (GET_MODE_CLASS (mode) == MODE_INT)
15001 *cost += (extra_cost->mult[mode == DImode].add
15002 + extra_cost->mult[mode == DImode].idiv
15003 + (code == MOD ? 1 : 0));
15005 return false; /* All arguments need to be in registers. */
15007 case DIV:
15008 case UDIV:
15009 case SQRT:
15010 if (speed)
15012 if (VECTOR_MODE_P (mode))
15013 *cost += extra_cost->vect.alu;
15014 else if (GET_MODE_CLASS (mode) == MODE_INT)
15015 /* There is no integer SQRT, so only DIV and UDIV can get
15016 here. */
15017 *cost += (extra_cost->mult[mode == DImode].idiv
15018 /* Slightly prefer UDIV over SDIV. */
15019 + (code == DIV ? 1 : 0));
15020 else
15021 *cost += extra_cost->fp[mode == DFmode].div;
15023 return false; /* All arguments need to be in registers. */
15025 case IF_THEN_ELSE:
15026 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
15027 XEXP (x, 2), cost, speed);
15029 case EQ:
15030 case NE:
15031 case GT:
15032 case GTU:
15033 case LT:
15034 case LTU:
15035 case GE:
15036 case GEU:
15037 case LE:
15038 case LEU:
15040 return false; /* All arguments must be in registers. */
15042 case FMA:
15043 op0 = XEXP (x, 0);
15044 op1 = XEXP (x, 1);
15045 op2 = XEXP (x, 2);
15047 if (speed)
15049 if (VECTOR_MODE_P (mode))
15050 *cost += extra_cost->vect.alu;
15051 else
15052 *cost += extra_cost->fp[mode == DFmode].fma;
15055 /* FMSUB, FNMADD, and FNMSUB are free. */
15056 if (GET_CODE (op0) == NEG)
15057 op0 = XEXP (op0, 0);
15059 if (GET_CODE (op2) == NEG)
15060 op2 = XEXP (op2, 0);
15062 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15063 and the by-element operand as operand 0. */
15064 if (GET_CODE (op1) == NEG)
15065 op1 = XEXP (op1, 0);
15067 /* Catch vector-by-element operations. The by-element operand can
15068 either be (vec_duplicate (vec_select (x))) or just
15069 (vec_select (x)), depending on whether we are multiplying by
15070 a vector or a scalar.
15072 Canonicalization is not very good in these cases, FMA4 will put the
15073 by-element operand as operand 0, FNMA4 will have it as operand 1. */
15074 if (GET_CODE (op0) == VEC_DUPLICATE)
15075 op0 = XEXP (op0, 0);
15076 else if (GET_CODE (op1) == VEC_DUPLICATE)
15077 op1 = XEXP (op1, 0);
15079 if (GET_CODE (op0) == VEC_SELECT)
15080 op0 = XEXP (op0, 0);
15081 else if (GET_CODE (op1) == VEC_SELECT)
15082 op1 = XEXP (op1, 0);
15084 /* If the remaining parameters are not registers,
15085 get the cost to put them into registers. */
15086 *cost += rtx_cost (op0, mode, FMA, 0, speed);
15087 *cost += rtx_cost (op1, mode, FMA, 1, speed);
15088 *cost += rtx_cost (op2, mode, FMA, 2, speed);
15089 return true;
15091 case FLOAT:
15092 case UNSIGNED_FLOAT:
15093 if (speed)
15094 *cost += extra_cost->fp[mode == DFmode].fromint;
15095 return false;
15097 case FLOAT_EXTEND:
15098 if (speed)
15100 if (VECTOR_MODE_P (mode))
15102 /* Vector widening conversion.  */
15103 *cost += extra_cost->vect.alu;
15105 else
15106 *cost += extra_cost->fp[mode == DFmode].widen;
15108 return false;
15110 case FLOAT_TRUNCATE:
15111 if (speed)
15113 if (VECTOR_MODE_P (mode))
15115 /* Vector narrowing conversion.  */
15116 *cost += extra_cost->vect.alu;
15118 else
15119 *cost += extra_cost->fp[mode == DFmode].narrow;
15121 return false;
15123 case FIX:
15124 case UNSIGNED_FIX:
15125 x = XEXP (x, 0);
15126 /* Strip the rounding part. They will all be implemented
15127 by the fcvt* family of instructions anyway. */
15128 if (GET_CODE (x) == UNSPEC)
15130 unsigned int uns_code = XINT (x, 1);
15132 if (uns_code == UNSPEC_FRINTA
15133 || uns_code == UNSPEC_FRINTM
15134 || uns_code == UNSPEC_FRINTN
15135 || uns_code == UNSPEC_FRINTP
15136 || uns_code == UNSPEC_FRINTZ)
15137 x = XVECEXP (x, 0, 0);
15140 if (speed)
15142 if (VECTOR_MODE_P (mode))
15143 *cost += extra_cost->vect.alu;
15144 else
15145 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15148 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15149 fixed-point fcvt. */
15150 if (GET_CODE (x) == MULT
15151 && ((VECTOR_MODE_P (mode)
15152 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15153 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15155 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15156 0, speed);
15157 return true;
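/* Illustration: (int) (f * 16.0f) can become a single fixed-point
   convert such as "fcvtzs w0, s0, #4", which is why only the inner
   multiplicand is costed here.  */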
15160 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
15161 return true;
15163 case ABS:
15164 if (VECTOR_MODE_P (mode))
15166 /* ABS (vector). */
15167 if (speed)
15168 *cost += extra_cost->vect.alu;
15170 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15172 op0 = XEXP (x, 0);
15174 /* FABD, which is analogous to FADD. */
15175 if (GET_CODE (op0) == MINUS)
15177 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15178 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15179 if (speed)
15180 *cost += extra_cost->fp[mode == DFmode].addsub;
15182 return true;
15184 /* Simple FABS is analogous to FNEG. */
15185 if (speed)
15186 *cost += extra_cost->fp[mode == DFmode].neg;
15188 else
15190 /* Integer ABS will either be split into
15191 two arithmetic instructions, or will be an ABS
15192 (scalar), which we don't model. */
15193 *cost = COSTS_N_INSNS (2);
15194 if (speed)
15195 *cost += 2 * extra_cost->alu.arith;
15197 return false;
15199 case SMAX:
15200 case SMIN:
15201 if (speed)
15203 if (VECTOR_MODE_P (mode))
15204 *cost += extra_cost->vect.alu;
15205 else
15207 /* FMAXNM/FMINNM/FMAX/FMIN.
15208 TODO: This may not be accurate for all implementations, but
15209 we do not model this in the cost tables. */
15210 *cost += extra_cost->fp[mode == DFmode].addsub;
15213 return false;
15215 case UNSPEC:
15216 /* The floating point round to integer frint* instructions. */
15217 if (aarch64_frint_unspec_p (XINT (x, 1)))
15219 if (speed)
15220 *cost += extra_cost->fp[mode == DFmode].roundint;
15222 return false;
15225 if (XINT (x, 1) == UNSPEC_RBIT)
15227 if (speed)
15228 *cost += extra_cost->alu.rev;
15230 return false;
15232 break;
15234 case TRUNCATE:
15236 /* Decompose <su>muldi3_highpart. */
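/* Schematically, the shape matched below is

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))

   which is a single UMULH or SMULH.  */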
15237 if (/* (truncate:DI */
15238 mode == DImode
15239 /* (lshiftrt:TI */
15240 && GET_MODE (XEXP (x, 0)) == TImode
15241 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15242 /* (mult:TI */
15243 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15244 /* (ANY_EXTEND:TI (reg:DI))
15245 (ANY_EXTEND:TI (reg:DI))) */
15246 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15247 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15248 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15249 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15250 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15251 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15252 /* (const_int 64) */
15253 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15254 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15256 /* UMULH/SMULH. */
15257 if (speed)
15258 *cost += extra_cost->mult[mode == DImode].extend;
15259 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15260 mode, MULT, 0, speed);
15261 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15262 mode, MULT, 1, speed);
15263 return true;
15265 break;
15266 case CONST_VECTOR:
15268 /* Load using MOVI/MVNI. */
15269 if (aarch64_simd_valid_immediate (x, NULL))
15270 *cost = extra_cost->vect.movi;
15271 else /* Load using constant pool. */
15272 *cost = extra_cost->ldst.load;
15273 break;
15275 case VEC_CONCAT:
15276 /* Depending on the operation, this is either a DUP or an INS.
15277 For now, keep the default costing. */
15278 break;
15279 case VEC_DUPLICATE:
15280 /* Load using a DUP. */
15281 *cost = extra_cost->vect.dup;
15282 return false;
15283 case VEC_SELECT:
15285 rtx op0 = XEXP (x, 0);
15286 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15288 /* Selecting the low half (lane 0) is free; the high half costs a DUP and anything else an extract. */
15289 rtx op1 = XEXP (x, 1);
15290 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15292 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15293 *cost = extra_cost->vect.dup;
15294 else
15295 *cost = extra_cost->vect.extract;
15296 return true;
15298 default:
15299 break;
15302 if (dump_file
15303 && flag_aarch64_verbose_cost)
15304 fprintf (dump_file,
15305 "\nFailed to cost RTX. Assuming default cost.\n");
15307 return true;
15310 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15311 calculated for X. This cost is stored in *COST. Returns true
15312 if the total cost of X was calculated. */
15313 static bool
15314 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15315 int param, int *cost, bool speed)
15317 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15319 if (dump_file
15320 && flag_aarch64_verbose_cost)
15322 print_rtl_single (dump_file, x);
15323 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15324 speed ? "Hot" : "Cold",
15325 *cost, result ? "final" : "partial");
15328 return result;
15331 static int
15332 aarch64_register_move_cost (machine_mode mode,
15333 reg_class_t from_i, reg_class_t to_i)
15335 enum reg_class from = (enum reg_class) from_i;
15336 enum reg_class to = (enum reg_class) to_i;
15337 const struct cpu_regmove_cost *regmove_cost
15338 = aarch64_tune_params.regmove_cost;
15340 /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS. */
15341 if (reg_class_subset_p (to, POINTER_REGS))
15342 to = GENERAL_REGS;
15344 if (reg_class_subset_p (from, POINTER_REGS))
15345 from = GENERAL_REGS;
15347 /* Make RDFFR very expensive. In particular, if we know that the FFR
15348 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15349 as a way of obtaining a PTRUE. */
15350 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15351 && hard_reg_set_subset_p (reg_class_contents[from_i],
15352 reg_class_contents[FFR_REGS]))
15353 return 80;
15355 /* Moving between a GPR and the stack register costs the same as GP2GP. */
15356 if ((from == GENERAL_REGS && to == STACK_REG)
15357 || (to == GENERAL_REGS && from == STACK_REG))
15358 return regmove_cost->GP2GP;
15360 /* To/From the stack register, we move via the gprs. */
15361 if (to == STACK_REG || from == STACK_REG)
15362 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15363 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15365 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15366 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15367 && known_eq (GET_MODE_SIZE (mode), 16))
15369 /* 128-bit operations on general registers require 2 instructions. */
15370 if (from == GENERAL_REGS && to == GENERAL_REGS)
15371 return regmove_cost->GP2GP * 2;
15372 else if (from == GENERAL_REGS)
15373 return regmove_cost->GP2FP * 2;
15374 else if (to == GENERAL_REGS)
15375 return regmove_cost->FP2GP * 2;
15377 /* When AdvSIMD instructions are disabled it is not possible to move
15378 a 128-bit value directly between Q registers. This is handled in
15379 secondary reload. A general register is used as a scratch to move
15380 the upper DI value and the lower DI value is moved directly,
15381 hence the cost is the sum of three moves. */
15382 if (!TARGET_SIMD && !TARGET_SVE)
15383 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15385 return regmove_cost->FP2FP;
15388 if (from == GENERAL_REGS && to == GENERAL_REGS)
15389 return regmove_cost->GP2GP;
15390 else if (from == GENERAL_REGS)
15391 return regmove_cost->GP2FP;
15392 else if (to == GENERAL_REGS)
15393 return regmove_cost->FP2GP;
15395 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15397 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15398 The cost must be greater than 2 units to indicate that direct
15399 moves aren't possible. */
15400 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15401 + aarch64_tune_params.memmov_cost.store_fp);
15402 return MIN (CEIL (per_vector, 2), 4);
15405 return regmove_cost->FP2FP;
15408 /* Implements TARGET_MEMORY_MOVE_COST. */
15409 static int
15410 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15412 enum reg_class rclass = (enum reg_class) rclass_i;
15413 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15414 ? reg_classes_intersect_p (rclass, PR_REGS)
15415 : reg_class_subset_p (rclass, PR_REGS))
15416 return (in
15417 ? aarch64_tune_params.memmov_cost.load_pred
15418 : aarch64_tune_params.memmov_cost.store_pred);
15420 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15421 ? reg_classes_intersect_p (rclass, FP_REGS)
15422 : reg_class_subset_p (rclass, FP_REGS))
15423 return (in
15424 ? aarch64_tune_params.memmov_cost.load_fp
15425 : aarch64_tune_params.memmov_cost.store_fp);
15427 return (in
15428 ? aarch64_tune_params.memmov_cost.load_int
15429 : aarch64_tune_params.memmov_cost.store_int);
15432 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15433 much more productive here, such as using insn attributes to cost things.
15434 But we don't, not yet.
15436 The main point of this current definition is to make calling insn_cost
15437 on one instruction equivalent to calling seq_cost on a sequence that
15438 contains only that instruction. The default definition would instead
15439 only look at SET_SRCs, ignoring SET_DESTs.
15441 This ensures that, for example, storing a 128-bit zero vector is more
15442 expensive than storing a 128-bit vector register. A move of zero
15443 into a 128-bit vector register followed by multiple stores of that
15444 register is then cheaper than multiple stores of zero (which would
15445 use STP of XZR). This in turn allows STP Qs to be formed. */
15446 static int
15447 aarch64_insn_cost (rtx_insn *insn, bool speed)
15449 if (rtx set = single_set (insn))
15450 return set_rtx_cost (set, speed);
15451 return pattern_cost (PATTERN (insn), speed);
15454 /* Implement TARGET_INIT_BUILTINS. */
15455 static void
15456 aarch64_init_builtins ()
15458 aarch64_general_init_builtins ();
15459 aarch64_sve::init_builtins ();
15460 #ifdef SUBTARGET_INIT_BUILTINS
15461 SUBTARGET_INIT_BUILTINS;
15462 #endif
15465 /* Implement TARGET_FOLD_BUILTIN. */
15466 static tree
15467 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15469 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15470 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15471 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15472 switch (code & AARCH64_BUILTIN_CLASS)
15474 case AARCH64_BUILTIN_GENERAL:
15475 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15477 case AARCH64_BUILTIN_SVE:
15478 return NULL_TREE;
15480 gcc_unreachable ();
15483 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15484 static bool
15485 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15487 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15488 tree fndecl = gimple_call_fndecl (stmt);
15489 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15490 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15491 gimple *new_stmt = NULL;
15492 switch (code & AARCH64_BUILTIN_CLASS)
15494 case AARCH64_BUILTIN_GENERAL:
15495 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15496 break;
15498 case AARCH64_BUILTIN_SVE:
15499 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15500 break;
15503 if (!new_stmt)
15504 return false;
15506 gsi_replace (gsi, new_stmt, false);
15507 return true;
15510 /* Implement TARGET_EXPAND_BUILTIN. */
15511 static rtx
15512 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15514 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15515 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15516 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15517 switch (code & AARCH64_BUILTIN_CLASS)
15519 case AARCH64_BUILTIN_GENERAL:
15520 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15522 case AARCH64_BUILTIN_SVE:
15523 return aarch64_sve::expand_builtin (subcode, exp, target);
15525 gcc_unreachable ();
15528 /* Implement TARGET_BUILTIN_DECL. */
15529 static tree
15530 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15532 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15533 switch (code & AARCH64_BUILTIN_CLASS)
15535 case AARCH64_BUILTIN_GENERAL:
15536 return aarch64_general_builtin_decl (subcode, initialize_p);
15538 case AARCH64_BUILTIN_SVE:
15539 return aarch64_sve::builtin_decl (subcode, initialize_p);
15541 gcc_unreachable ();
15544 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15545 to optimize 1.0/sqrt. */
15547 static bool
15548 use_rsqrt_p (machine_mode mode)
15550 return (!flag_trapping_math
15551 && flag_unsafe_math_optimizations
15552 && ((aarch64_tune_params.approx_modes->recip_sqrt
15553 & AARCH64_APPROX_MODE (mode))
15554 || flag_mrecip_low_precision_sqrt));
15557 /* Function to decide when to use the approximate reciprocal square root
15558 builtin. */
15560 static tree
15561 aarch64_builtin_reciprocal (tree fndecl)
15563 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15565 if (!use_rsqrt_p (mode))
15566 return NULL_TREE;
15567 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15568 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15569 switch (code & AARCH64_BUILTIN_CLASS)
15571 case AARCH64_BUILTIN_GENERAL:
15572 return aarch64_general_builtin_rsqrt (subcode);
15574 case AARCH64_BUILTIN_SVE:
15575 return NULL_TREE;
15577 gcc_unreachable ();
15580 /* Emit code to perform the floating-point operation:
15582 DST = SRC1 * SRC2
15584 where all three operands are already known to be registers.
15585 If the operation is an SVE one, PTRUE is a suitable all-true
15586 predicate. */
15588 static void
15589 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15591 if (ptrue)
15592 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15593 dst, ptrue, src1, src2,
15594 gen_int_mode (SVE_RELAXED_GP, SImode)));
15595 else
15596 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15599 /* Emit instruction sequence to compute either the approximate square root
15600 or its approximate reciprocal, depending on the flag RECP, and return
15601 whether the sequence was emitted or not. */
15603 bool
15604 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15606 machine_mode mode = GET_MODE (dst);
15608 if (GET_MODE_INNER (mode) == HFmode)
15610 gcc_assert (!recp);
15611 return false;
15614 if (!recp)
15616 if (!(flag_mlow_precision_sqrt
15617 || (aarch64_tune_params.approx_modes->sqrt
15618 & AARCH64_APPROX_MODE (mode))))
15619 return false;
15621 if (!flag_finite_math_only
15622 || flag_trapping_math
15623 || !flag_unsafe_math_optimizations
15624 || optimize_function_for_size_p (cfun))
15625 return false;
15627 else
15628 /* Caller assumes we cannot fail. */
15629 gcc_assert (use_rsqrt_p (mode));
15631 rtx pg = NULL_RTX;
15632 if (aarch64_sve_mode_p (mode))
15633 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15634 machine_mode mmsk = (VECTOR_MODE_P (mode)
15635 ? related_int_vector_mode (mode).require ()
15636 : int_mode_for_mode (mode).require ());
15637 rtx xmsk = NULL_RTX;
15638 if (!recp)
15640 /* When calculating the approximate square root, compare the
15641 argument with 0.0 and create a mask. */
15642 rtx zero = CONST0_RTX (mode);
15643 if (pg)
15645 xmsk = gen_reg_rtx (GET_MODE (pg));
15646 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15647 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15648 xmsk, pg, hint, src, zero));
15650 else
15652 xmsk = gen_reg_rtx (mmsk);
15653 emit_insn (gen_rtx_SET (xmsk,
15654 gen_rtx_NEG (mmsk,
15655 gen_rtx_EQ (mmsk, src, zero))));
15659 /* Estimate the approximate reciprocal square root. */
15660 rtx xdst = gen_reg_rtx (mode);
15661 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15663 /* Iterate over the series twice for SF and thrice for DF. */
15664 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15666 /* Optionally iterate over the series once less for faster performance
15667 while sacrificing some accuracy.
15668 if ((recp && flag_mrecip_low_precision_sqrt)
15669 || (!recp && flag_mlow_precision_sqrt))
15670 iterations--;
15672 /* Iterate over the series to calculate the approximate reciprocal square
15673 root. */
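/* Each FRSQRTS step in the loop below computes (3 - a * b) / 2, so the
   loop implements the Newton-Raphson recurrence
     x_{n+1} = x_n * (3 - d * x_n * x_n) / 2
   for approximating 1 / sqrt (d).  */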
15674 rtx x1 = gen_reg_rtx (mode);
15675 while (iterations--)
15677 rtx x2 = gen_reg_rtx (mode);
15678 aarch64_emit_mult (x2, pg, xdst, xdst);
15680 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15682 if (iterations > 0)
15683 aarch64_emit_mult (xdst, pg, xdst, x1);
15686 if (!recp)
15688 if (pg)
15689 /* Multiply nonzero source values by the corresponding intermediate
15690 result elements, so that the final calculation is the approximate
15691 square root rather than its reciprocal. Select a zero result for
15692 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15693 otherwise. */
15694 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15695 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15696 else
15698 /* Qualify the approximate reciprocal square root when the
15699 argument is 0.0 by squashing the intermediary result to 0.0. */
15700 rtx xtmp = gen_reg_rtx (mmsk);
15701 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15702 gen_rtx_SUBREG (mmsk, xdst, 0)));
15703 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15705 /* Calculate the approximate square root. */
15706 aarch64_emit_mult (xdst, pg, xdst, src);
15710 /* Finalize the approximation. */
15711 aarch64_emit_mult (dst, pg, xdst, x1);
15713 return true;
15716 /* Emit the instruction sequence to compute the approximation for the division
15717 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
15719 bool
15720 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15722 machine_mode mode = GET_MODE (quo);
15724 if (GET_MODE_INNER (mode) == HFmode)
15725 return false;
15727 bool use_approx_division_p = (flag_mlow_precision_div
15728 || (aarch64_tune_params.approx_modes->division
15729 & AARCH64_APPROX_MODE (mode)));
15731 if (!flag_finite_math_only
15732 || flag_trapping_math
15733 || !flag_unsafe_math_optimizations
15734 || optimize_function_for_size_p (cfun)
15735 || !use_approx_division_p)
15736 return false;
15738 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15739 return false;
15741 rtx pg = NULL_RTX;
15742 if (aarch64_sve_mode_p (mode))
15743 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15745 /* Estimate the approximate reciprocal. */
15746 rtx xrcp = gen_reg_rtx (mode);
15747 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15749 /* Iterate over the series twice for SF and thrice for DF. */
15750 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15752 /* Optionally iterate over the series less for faster performance,
15753 while sacrificing some accuracy.  The default is 2 for DF and 1 for SF. */
15754 if (flag_mlow_precision_div)
15755 iterations = (GET_MODE_INNER (mode) == DFmode
15756 ? aarch64_double_recp_precision
15757 : aarch64_float_recp_precision);
15759 /* Iterate over the series to calculate the approximate reciprocal. */
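/* Each FRECPS step computes 2 - a * b, so the loop below implements the
   Newton-Raphson recurrence x_{n+1} = x_n * (2 - d * x_n) for
   approximating 1 / d.  */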
15760 rtx xtmp = gen_reg_rtx (mode);
15761 while (iterations--)
15763 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15765 if (iterations > 0)
15766 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15769 if (num != CONST1_RTX (mode))
15771 /* As the approximate reciprocal of DEN is already calculated, only
15772 calculate the approximate division when NUM is not 1.0. */
15773 rtx xnum = force_reg (mode, num);
15774 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15777 /* Finalize the approximation. */
15778 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15779 return true;
15782 /* Return the number of instructions that can be issued per cycle. */
15783 static int
15784 aarch64_sched_issue_rate (void)
15786 return aarch64_tune_params.issue_rate;
15789 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15790 static int
15791 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15793 if (DEBUG_INSN_P (insn))
15794 return more;
15796 rtx_code code = GET_CODE (PATTERN (insn));
15797 if (code == USE || code == CLOBBER)
15798 return more;
15800 if (get_attr_type (insn) == TYPE_NO_INSN)
15801 return more;
15803 return more - 1;
15806 static int
15807 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15809 int issue_rate = aarch64_sched_issue_rate ();
15811 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15815 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15816 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15817 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15819 static int
15820 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15821 int ready_index)
15823 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15827 /* Vectorizer cost model target hooks. */
15829 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15830 return the decl that should be recorded. Return null otherwise. */
15831 tree
15832 aarch64_vector_load_decl (tree addr)
15834 if (TREE_CODE (addr) != ADDR_EXPR)
15835 return NULL_TREE;
15836 tree base = get_base_address (TREE_OPERAND (addr, 0));
15837 if (TREE_CODE (base) != VAR_DECL)
15838 return NULL_TREE;
15839 return base;
15842 /* Return true if STMT_INFO accesses a decl that is known to be the
15843 argument to a vld1 in the same function. */
15844 static bool
15845 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15847 if (!cfun->machine->vector_load_decls)
15848 return false;
15849 auto dr = STMT_VINFO_DATA_REF (stmt_info);
15850 if (!dr)
15851 return false;
15852 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15853 return decl && cfun->machine->vector_load_decls->contains (decl);
15856 /* Information about how the CPU would issue the scalar, Advanced SIMD
15857 or SVE version of a vector loop, using the scheme defined by the
15858 aarch64_base_vec_issue_info hierarchy of structures. */
15859 class aarch64_vec_op_count
15861 public:
15862 aarch64_vec_op_count () = default;
15863 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15864 unsigned int = 1);
15866 unsigned int vec_flags () const { return m_vec_flags; }
15867 unsigned int vf_factor () const { return m_vf_factor; }
15869 const aarch64_base_vec_issue_info *base_issue_info () const;
15870 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15871 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15873 fractional_cost rename_cycles_per_iter () const;
15874 fractional_cost min_nonpred_cycles_per_iter () const;
15875 fractional_cost min_pred_cycles_per_iter () const;
15876 fractional_cost min_cycles_per_iter () const;
15878 void dump () const;
15880 /* The number of individual "general" operations. See the comments
15881 in aarch64_base_vec_issue_info for details. */
15882 unsigned int general_ops = 0;
15884 /* The number of load and store operations, under the same scheme
15885 as above. */
15886 unsigned int loads = 0;
15887 unsigned int stores = 0;
15889 /* The minimum number of cycles needed to execute all loop-carried
15890 operations, which in the vector code become associated with
15891 reductions. */
15892 unsigned int reduction_latency = 0;
15894 /* The number of individual predicate operations. See the comments
15895 in aarch64_sve_vec_issue_info for details. */
15896 unsigned int pred_ops = 0;
15898 private:
15899 /* The issue information for the core. */
15900 const aarch64_vec_issue_info *m_issue_info = nullptr;
15902 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15903 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15904 Advanced SIMD code.
15905 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15906 SVE code. */
15907 unsigned int m_vec_flags = 0;
15909 /* Assume that, when the code is executing on the core described
15910 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15911 times more data than the vectorizer anticipates.
15913 This is only ever different from 1 for SVE. It allows us to consider
15914 what would happen on a 256-bit SVE target even when the -mtune
15915 parameters say that the “likely” SVE length is 128 bits. */
15916 unsigned int m_vf_factor = 1;
15919 aarch64_vec_op_count::
15920 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15921 unsigned int vec_flags, unsigned int vf_factor)
15922 : m_issue_info (issue_info),
15923 m_vec_flags (vec_flags),
15924 m_vf_factor (vf_factor)
15928 /* Return the base issue information (i.e. the parts that make sense
15929 for both scalar and vector code). Return null if we have no issue
15930 information. */
15931 const aarch64_base_vec_issue_info *
15932 aarch64_vec_op_count::base_issue_info () const
15934 if (auto *ret = simd_issue_info ())
15935 return ret;
15936 return m_issue_info->scalar;
15939 /* If the structure describes vector code and we have associated issue
15940 information, return that issue information, otherwise return null. */
15941 const aarch64_simd_vec_issue_info *
15942 aarch64_vec_op_count::simd_issue_info () const
15944 if (auto *ret = sve_issue_info ())
15945 return ret;
15946 if (m_vec_flags)
15947 return m_issue_info->advsimd;
15948 return nullptr;
15951 /* If the structure describes SVE code and we have associated issue
15952 information, return that issue information, otherwise return null. */
15953 const aarch64_sve_vec_issue_info *
15954 aarch64_vec_op_count::sve_issue_info () const
15956 if (m_vec_flags & VEC_ANY_SVE)
15957 return m_issue_info->sve;
15958 return nullptr;
15961 /* Estimate the minimum number of cycles per iteration needed to rename
15962 the instructions.
15964 ??? For now this is done inline rather than via cost tables, since it
15965 isn't clear how it should be parameterized for the general case. */
15966 fractional_cost
15967 aarch64_vec_op_count::rename_cycles_per_iter () const
15969 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15970 || sve_issue_info () == &neoversen2_sve_issue_info
15971 || sve_issue_info () == &neoversev2_sve_issue_info)
15972 /* + 1 for an addition. We've already counted a general op for each
15973 store, so we don't need to account for stores separately. The branch
15974 reads no registers and so does not need to be counted either.
15976 ??? This value is very much on the pessimistic side, but seems to work
15977 pretty well in practice. */
15978 return { general_ops + loads + pred_ops + 1, 5 };
15980 return 0;
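/* The fractional cost above is (general_ops + loads + pred_ops + 1) / 5,
   i.e. it effectively assumes that these cores can rename roughly five
   such operations per cycle.  */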
15983 /* Like min_cycles_per_iter, but excluding predicate operations. */
15984 fractional_cost
15985 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15987 auto *issue_info = base_issue_info ();
15989 fractional_cost cycles = MAX (reduction_latency, 1);
15990 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15991 cycles = std::max (cycles, { loads + stores,
15992 issue_info->loads_stores_per_cycle });
15993 cycles = std::max (cycles, { general_ops,
15994 issue_info->general_ops_per_cycle });
15995 cycles = std::max (cycles, rename_cycles_per_iter ());
15996 return cycles;
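/* Worked example (illustrative numbers only): with 6 general ops, 2 loads,
   1 store, a reduction latency of 2, and an issue profile of 2 general ops,
   1 store and 2 loads/stores per cycle, the bound is
   max (2, 1/1, 3/2, 6/2) = 3 cycles per iteration.  */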
15999 /* Like min_cycles_per_iter, but including only the predicate operations. */
16000 fractional_cost
16001 aarch64_vec_op_count::min_pred_cycles_per_iter () const
16003 if (auto *issue_info = sve_issue_info ())
16004 return { pred_ops, issue_info->pred_ops_per_cycle };
16005 return 0;
16008 /* Estimate the minimum number of cycles needed to issue the operations.
16009 This is a very simplistic model! */
16010 fractional_cost
16011 aarch64_vec_op_count::min_cycles_per_iter () const
16013 return std::max (min_nonpred_cycles_per_iter (),
16014 min_pred_cycles_per_iter ());
16017 /* Dump information about the structure. */
16018 void
16019 aarch64_vec_op_count::dump () const
16021 dump_printf_loc (MSG_NOTE, vect_location,
16022 " load operations = %d\n", loads);
16023 dump_printf_loc (MSG_NOTE, vect_location,
16024 " store operations = %d\n", stores);
16025 dump_printf_loc (MSG_NOTE, vect_location,
16026 " general operations = %d\n", general_ops);
16027 if (sve_issue_info ())
16028 dump_printf_loc (MSG_NOTE, vect_location,
16029 " predicate operations = %d\n", pred_ops);
16030 dump_printf_loc (MSG_NOTE, vect_location,
16031 " reduction latency = %d\n", reduction_latency);
16032 if (auto rcpi = rename_cycles_per_iter ())
16033 dump_printf_loc (MSG_NOTE, vect_location,
16034 " estimated cycles per iteration to rename = %f\n",
16035 rcpi.as_double ());
16036 if (auto pred_cpi = min_pred_cycles_per_iter ())
16038 dump_printf_loc (MSG_NOTE, vect_location,
16039 " estimated min cycles per iteration"
16040 " without predication = %f\n",
16041 min_nonpred_cycles_per_iter ().as_double ());
16042 dump_printf_loc (MSG_NOTE, vect_location,
16043 " estimated min cycles per iteration"
16044 " for predication = %f\n", pred_cpi.as_double ());
16046 if (auto cpi = min_cycles_per_iter ())
16047 dump_printf_loc (MSG_NOTE, vect_location,
16048 " estimated min cycles per iteration = %f\n",
16049 cpi.as_double ());
16052 /* Information about vector code that we're in the process of costing. */
16053 class aarch64_vector_costs : public vector_costs
16055 public:
16056 aarch64_vector_costs (vec_info *, bool);
16058 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
16059 stmt_vec_info stmt_info, slp_tree, tree vectype,
16060 int misalign,
16061 vect_cost_model_location where) override;
16062 void finish_cost (const vector_costs *) override;
16063 bool better_main_loop_than_p (const vector_costs *other) const override;
16065 private:
16066 void record_potential_advsimd_unrolling (loop_vec_info);
16067 void analyze_loop_vinfo (loop_vec_info);
16068 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
16069 aarch64_vec_op_count *);
16070 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
16071 fractional_cost, unsigned int,
16072 unsigned int *, bool *);
16073 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
16074 unsigned int);
16075 bool prefer_unrolled_loop () const;
16076 unsigned int determine_suggested_unroll_factor ();
16078 /* True if we have performed one-time initialization based on the
16079 vec_info. */
16080 bool m_analyzed_vinfo = false;
16082 /* This loop uses an average operation that is not supported by SVE, but is
16083 supported by Advanced SIMD and SVE2. */
16084 bool m_has_avg = false;
16086 /* True if the vector body contains a store to a decl and if the
16087 function is known to have a vld1 from the same decl.
16089 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16090 initializing a vector is:
16092 float f[4] = { elts };
16093 float32x4_t x = vld1q_f32(f);
16095 We should strongly prefer vectorization of the initialization of f,
16096 so that the store to f and the load back can be optimized away,
16097 leaving a vectorization of { elts }. */
16098 bool m_stores_to_vector_load_decl = false;
16100 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16101 In this case the value is the number of insns in the last operation.
16103 On AArch64 vector promotion and demotions require us to first widen or
16104 narrow the input and only after that emit conversion instructions. For
16105 costing this means we need to emit the cost of the final conversions as
16106 well. */
16107 unsigned int m_num_last_promote_demote = 0;
16109 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16110 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16111 SIMD code.
16112 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16113 unsigned int m_vec_flags = 0;
16115 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16116 This means that code such as:
16118 a[0] = x;
16119 a[1] = x;
16121 will be costed as two scalar instructions and two vector instructions
16122 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16123 wins if the costs are equal, because of the fact that the vector costs
16124 include constant initializations whereas the scalar costs don't.
16125 We would therefore tend to vectorize the code above, even though
16126 the scalar version can use a single STP.
16128 We should eventually fix this and model LDP and STP in the main costs;
16129 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16130 Until then, we look specifically for code that does nothing more than
16131 STP-like operations. We cost them on that basis in addition to the
16132 normal latency-based costs.
16134 If the scalar or vector code could be a sequence of STPs +
16135 initialization, this variable counts the cost of the sequence,
16136 with 2 units per instruction. The variable is ~0U for other
16137 kinds of code. */
16138 unsigned int m_stp_sequence_cost = 0;
16140 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16141 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16142 situations, we try to predict whether an Advanced SIMD implementation
16143 of the loop could be completely unrolled and become straight-line code.
16144 If so, it is generally better to use the Advanced SIMD version rather
16145 than length-agnostic SVE, since the SVE loop would execute an unknown
16146 number of times and so could not be completely unrolled in the same way.
16148 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16149 number of Advanced SIMD loop iterations that would be unrolled and
16150 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16151 in the unrolled loop. Both values are zero if we're not applying
16152 the heuristic. */
16153 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
16154 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
16156 /* If we're vectorizing a loop that executes a constant number of times,
16157 this variable gives the number of times that the vector loop would
16158 iterate, otherwise it is zero. */
16159 uint64_t m_num_vector_iterations = 0;
16161 /* Used only when vectorizing loops. Estimates the number and kind of
16162 operations that would be needed by one iteration of the scalar
16163 or vector loop. There is one entry for each tuning option of
16164 interest. */
16165 auto_vec<aarch64_vec_op_count, 2> m_ops;
16168 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
16169 bool costing_for_scalar)
16170 : vector_costs (vinfo, costing_for_scalar),
16171 m_vec_flags (costing_for_scalar ? 0
16172 : aarch64_classify_vector_mode (vinfo->vector_mode))
16174 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
16176 m_ops.quick_push ({ issue_info, m_vec_flags });
16177 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16179 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
16180 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
16181 vf_factor });
16186 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
16187 vector_costs *
16188 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
16190 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16193 /* Return true if the current CPU should use the new costs defined
16194 in GCC 11. This should be removed for GCC 12 and above, with the
16195 costs applying to all CPUs instead. */
16196 static bool
16197 aarch64_use_new_vector_costs_p ()
16199 return (aarch64_tune_params.extra_tuning_flags
16200 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16203 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16204 static const simd_vec_cost *
16205 aarch64_simd_vec_costs (tree vectype)
16207 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16208 if (vectype != NULL
16209 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16210 && costs->sve != NULL)
16211 return costs->sve;
16212 return costs->advsimd;
16215 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16216 static const simd_vec_cost *
16217 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16219 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16220 if ((flags & VEC_ANY_SVE) && costs->sve)
16221 return costs->sve;
16222 return costs->advsimd;
16225 /* If STMT_INFO is a memory reference, return the scalar memory type,
16226 otherwise return null. */
16227 static tree
16228 aarch64_dr_type (stmt_vec_info stmt_info)
16230 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16231 return TREE_TYPE (DR_REF (dr));
16232 return NULL_TREE;
16235 /* Decide whether to use the unrolling heuristic described above
16236 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16237 describes the loop that we're vectorizing. */
16238 void
16239 aarch64_vector_costs::
16240 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16242 /* The heuristic only makes sense on targets that have the same
16243 vector throughput for SVE and Advanced SIMD. */
16244 if (!(aarch64_tune_params.extra_tuning_flags
16245 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16246 return;
16248 /* We only want to apply the heuristic if LOOP_VINFO is being
16249 vectorized for SVE. */
16250 if (!(m_vec_flags & VEC_ANY_SVE))
16251 return;
16253 /* Check whether it is possible in principle to use Advanced SIMD
16254 instead. */
16255 if (aarch64_autovec_preference == 2)
16256 return;
16258 /* We don't want to apply the heuristic to outer loops, since it's
16259 harder to track two levels of unrolling. */
16260 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16261 return;
16263 /* Only handle cases in which the number of Advanced SIMD iterations
16264 would be known at compile time but the number of SVE iterations
16265 would not. */
16266 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16267 || aarch64_sve_vg.is_constant ())
16268 return;
16270 /* Guess how many times the Advanced SIMD loop would iterate and make
16271 sure that it is within the complete unrolling limit. Even if the
16272 number of iterations is small enough, the number of statements might
16273 not be, which is why we need to estimate the number of statements too. */
16274 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16275 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16276 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16277 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16278 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16279 return;
16281 /* Record that we're applying the heuristic and should try to estimate
16282 the number of statements in the Advanced SIMD loop. */
16283 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
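/* An illustrative sketch of the arithmetic above, using made-up numbers
(hypothetical names, not compiled as part of this file): assume an SVE
costing VF of 8, an estimated vector quadword count of 2 (256-bit SVE),
64 known scalar iterations and a completely-peel limit of 16 standing in
for param_max_completely_peel_times. */
#if 0
#include <cassert>

static void
example_advsimd_unroll_estimate ()
{
  unsigned int vf_for_cost = 8;    /* hypothetical vect_vf_for_cost result */
  unsigned int estimated_vq = 2;   /* hypothetical aarch64_estimated_sve_vq */
  unsigned int niters = 64;        /* hypothetical LOOP_VINFO_INT_NITERS */
  unsigned int peel_limit = 16;    /* hypothetical peel-times limit */

  unsigned int advsimd_vf = (vf_for_cost + estimated_vq - 1) / estimated_vq;
  unsigned int unrolled_niters = niters / advsimd_vf;

  /* The Advanced SIMD VF would be 8 / 2 = 4, so the Advanced SIMD loop
     would run 64 / 4 = 16 times, which is within the limit, and the
     heuristic would be applied.  */
  assert (advsimd_vf == 4 && unrolled_niters == 16);
  assert (!(unrolled_niters > peel_limit));
}
#endif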
16286 /* Do one-time initialization of the aarch64_vector_costs given that we're
16287 costing the loop vectorization described by LOOP_VINFO. */
16288 void
16289 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16291 /* Record the number of times that the vector loop would execute,
16292 if known. */
16293 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16294 auto scalar_niters = max_stmt_executions_int (loop);
16295 if (scalar_niters >= 0)
16297 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16298 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16299 m_num_vector_iterations = scalar_niters / vf;
16300 else
16301 m_num_vector_iterations = CEIL (scalar_niters, vf);
16304 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16305 heuristic described above m_unrolled_advsimd_niters. */
16306 record_potential_advsimd_unrolling (loop_vinfo);
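/* An illustrative note with hypothetical numbers: for a scalar iteration
count of 100 and a VF of 8, the unmasked case above records 100 / 8 = 12
vector iterations, while the masked case records CEIL (100, 8) = 13,
since the masked loop also handles the partial final iteration. */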
16309 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16310 static int
16311 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16312 tree vectype,
16313 int misalign ATTRIBUTE_UNUSED)
16315 unsigned elements;
16316 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16317 bool fp = false;
16319 if (vectype != NULL)
16320 fp = FLOAT_TYPE_P (vectype);
16322 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16324 switch (type_of_cost)
16326 case scalar_stmt:
16327 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16329 case scalar_load:
16330 return costs->scalar_load_cost;
16332 case scalar_store:
16333 return costs->scalar_store_cost;
16335 case vector_stmt:
16336 return fp ? simd_costs->fp_stmt_cost
16337 : simd_costs->int_stmt_cost;
16339 case vector_load:
16340 return simd_costs->align_load_cost;
16342 case vector_store:
16343 return simd_costs->store_cost;
16345 case vec_to_scalar:
16346 return simd_costs->vec_to_scalar_cost;
16348 case scalar_to_vec:
16349 return simd_costs->scalar_to_vec_cost;
16351 case unaligned_load:
16352 case vector_gather_load:
16353 return simd_costs->unalign_load_cost;
16355 case unaligned_store:
16356 case vector_scatter_store:
16357 return simd_costs->unalign_store_cost;
16359 case cond_branch_taken:
16360 return costs->cond_taken_branch_cost;
16362 case cond_branch_not_taken:
16363 return costs->cond_not_taken_branch_cost;
16365 case vec_perm:
16366 return simd_costs->permute_cost;
16368 case vec_promote_demote:
16369 return fp ? simd_costs->fp_stmt_cost
16370 : simd_costs->int_stmt_cost;
16372 case vec_construct:
16373 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16374 return elements / 2 + 1;
16376 default:
16377 gcc_unreachable ();
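/* An illustrative note with a hypothetical subpart count: for the
vec_construct case above, a vector with an estimated 4 subparts is
costed as 4 / 2 + 1 = 3, i.e. roughly one unit per pair of elements
plus one extra. */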
16381 /* If an access of kind KIND for STMT_INFO represents one vector of an
16382 LD[234] or ST[234] operation, return the total number of vectors
16383 involved (2, 3 or 4), otherwise return a value outside that range. */
16384 static int
16385 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16387 if ((kind == vector_load
16388 || kind == unaligned_load
16389 || kind == vector_store
16390 || kind == unaligned_store)
16391 && STMT_VINFO_DATA_REF (stmt_info))
16393 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16394 if (stmt_info
16395 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16396 return DR_GROUP_SIZE (stmt_info);
16398 return 0;
16401 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16402 vectors would produce a series of LDP or STP operations. KIND is the
16403 kind of statement that STMT_INFO represents. */
16404 static bool
16405 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16406 stmt_vec_info stmt_info)
16408 switch (kind)
16410 case vector_load:
16411 case vector_store:
16412 case unaligned_load:
16413 case unaligned_store:
16414 break;
16416 default:
16417 return false;
16420 if (aarch64_tune_params.extra_tuning_flags
16421 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16422 return false;
16424 return is_gimple_assign (stmt_info->stmt);
16427 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16428 or multiply-subtract sequence that might be suitable for fusing into a
16429 single instruction. If VEC_FLAGS is zero, analyze the operation as
16430 a scalar one, otherwise analyze it as an operation on vectors with those
16431 VEC_* flags. */
16432 static bool
16433 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16434 unsigned int vec_flags)
16436 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16437 if (!assign)
16438 return false;
16439 tree_code code = gimple_assign_rhs_code (assign);
16440 if (code != PLUS_EXPR && code != MINUS_EXPR)
16441 return false;
16443 auto is_mul_result = [&](int i)
16445 tree rhs = gimple_op (assign, i);
16446 /* ??? Should we try to check for a single use as well? */
16447 if (TREE_CODE (rhs) != SSA_NAME)
16448 return false;
16450 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16451 if (!def_stmt_info
16452 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16453 return false;
16454 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16455 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16456 return false;
16458 if (vec_flags & VEC_ADVSIMD)
16460 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16461 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16462 only supports MLA forms, so will require a move if the result
16463 cannot be tied to the accumulator. The most important case in
16464 which this is true is when the accumulator input is invariant. */
16465 rhs = gimple_op (assign, 3 - i);
16466 if (TREE_CODE (rhs) != SSA_NAME)
16467 return false;
16468 def_stmt_info = vinfo->lookup_def (rhs);
16469 if (!def_stmt_info
16470 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
16471 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
16472 return false;
16475 return true;
16478 if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
16479 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16480 multiplication must be on the second operand (to form an FMLS).
16481 But if both operands are multiplications and the second operand
16482 is used more than once, we'll instead negate the second operand
16483 and use it as an accumulator for the first operand. */
16484 return (is_mul_result (2)
16485 && (has_single_use (gimple_assign_rhs2 (assign))
16486 || !is_mul_result (1)));
16488 return is_mul_result (1) || is_mul_result (2);
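/* An illustrative sketch (hypothetical names, not compiled as part of this
file): C-level loop bodies that produce the two-statement shapes the
function above looks for. */
#if 0
static void
example_fmla (float *acc, const float *x, const float *y, int n)
{
  for (int i = 0; i < n; ++i)
    /* MULT_EXPR feeding PLUS_EXPR: a candidate for FMLA/FMADD.  */
    acc[i] = acc[i] + x[i] * y[i];
}

static void
example_fmls (float *acc, const float *x, const float *y, int n)
{
  for (int i = 0; i < n; ++i)
    /* MULT_EXPR feeding the second operand of MINUS_EXPR: a candidate
       for FMLS, the only subtraction form that Advanced SIMD supports
       directly, as noted above.  */
    acc[i] = acc[i] - x[i] * y[i];
}
#endif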
16491 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16492 expression sequence that might be suitable for fusing into a
16493 single instruction. If VEC_FLAGS is zero, analyze the operation as
16494 a scalar one, otherwise analyze it as an operation on vectors with those
16495 VEC_* flags. */
16497 static bool
16498 aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
16499 unsigned int vec_flags)
16501 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16502 if (!assign
16503 || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
16504 || !STMT_VINFO_VECTYPE (stmt_info)
16505 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
16506 return false;
16508 for (int i = 1; i < 3; ++i)
16510 tree rhs = gimple_op (assign, i);
16512 if (TREE_CODE (rhs) != SSA_NAME)
16513 continue;
16515 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16516 if (!def_stmt_info
16517 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16518 continue;
16520 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16521 if (!rhs_assign
16522 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
16523 != tcc_comparison)
16524 continue;
16526 if (vec_flags & VEC_ADVSIMD)
16527 return false;
16529 return true;
16531 return false;
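/* An illustrative sketch (hypothetical names, not compiled as part of this
file): a C-level shape whose if-converted vector form is roughly a
BIT_AND_EXPR of two comparison results on a vector boolean type, the
case that the function above treats as a single SVE instruction. */
#if 0
static void
example_bool_compound (int *out, const int *a, const int *b,
		       const int *c, const int *d, int n)
{
  for (int i = 0; i < n; ++i)
    if (a[i] < b[i] && c[i] < d[i])
      out[i] = 0;
}
#endif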
16534 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16535 in-loop reduction that SVE supports directly, return its latency in cycles,
16536 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16537 instructions. */
16538 static unsigned int
16539 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16540 stmt_vec_info stmt_info,
16541 const sve_vec_cost *sve_costs)
16543 switch (vect_reduc_type (vinfo, stmt_info))
16545 case EXTRACT_LAST_REDUCTION:
16546 return sve_costs->clast_cost;
16548 case FOLD_LEFT_REDUCTION:
16549 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16551 case E_HFmode:
16552 case E_BFmode:
16553 return sve_costs->fadda_f16_cost;
16555 case E_SFmode:
16556 return sve_costs->fadda_f32_cost;
16558 case E_DFmode:
16559 return sve_costs->fadda_f64_cost;
16561 default:
16562 break;
16564 break;
16567 return 0;
16570 /* STMT_INFO describes a loop-carried operation in the original scalar code
16571 that we are considering implementing as a reduction. Return one of the
16572 following values, depending on VEC_FLAGS:
16574 - If VEC_FLAGS is zero, return the loop carry latency of the original
16575 scalar operation.
16577 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16578 Advanced SIMD implementation.
16580 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16581 SVE implementation. */
16582 static unsigned int
16583 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16584 unsigned int vec_flags)
16586 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16587 const sve_vec_cost *sve_costs = nullptr;
16588 if (vec_flags & VEC_ANY_SVE)
16589 sve_costs = aarch64_tune_params.vec_costs->sve;
16591 /* If the caller is asking for the SVE latency, check for forms of reduction
16592 that only SVE can handle directly. */
16593 if (sve_costs)
16595 unsigned int latency
16596 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16597 if (latency)
16598 return latency;
16601 /* Handle scalar costs. */
16602 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16603 if (vec_flags == 0)
16605 if (is_float)
16606 return vec_costs->scalar_fp_stmt_cost;
16607 return vec_costs->scalar_int_stmt_cost;
16610 /* Otherwise, the loop body just contains normal integer or FP operations,
16611 with a vector reduction outside the loop. */
16612 const simd_vec_cost *simd_costs
16613 = aarch64_simd_vec_costs_for_flags (vec_flags);
16614 if (is_float)
16615 return simd_costs->fp_stmt_cost;
16616 return simd_costs->int_stmt_cost;
16619 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16620 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16621 try to subdivide the target-independent categorization provided by KIND
16622 to get a more accurate cost. */
16623 static fractional_cost
16624 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16625 stmt_vec_info stmt_info,
16626 fractional_cost stmt_cost)
16628 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16629 the extension with the load. */
16630 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16631 return 0;
16633 return stmt_cost;
16636 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16637 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16638 when vectorized would operate on vector type VECTYPE. Try to subdivide
16639 the target-independent categorization provided by KIND to get a more
16640 accurate cost. WHERE specifies where the cost associated with KIND
16641 occurs. */
16642 static fractional_cost
16643 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16644 stmt_vec_info stmt_info, tree vectype,
16645 enum vect_cost_model_location where,
16646 fractional_cost stmt_cost)
16648 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16649 const sve_vec_cost *sve_costs = nullptr;
16650 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16651 sve_costs = aarch64_tune_params.vec_costs->sve;
16653 /* It's generally better to avoid costing inductions, since the induction
16654 will usually be hidden by other operations. This is particularly true
16655 for things like COND_REDUCTIONS. */
16656 if (is_a<gphi *> (stmt_info->stmt))
16657 return 0;
16659 /* Detect cases in which vec_to_scalar is describing the extraction of a
16660 vector element in preparation for a scalar store. The store itself is
16661 costed separately. */
16662 if (vect_is_store_elt_extraction (kind, stmt_info))
16663 return simd_costs->store_elt_extra_cost;
16665 /* Detect SVE gather loads, which are costed as a single scalar_load
16666 for each element. We therefore need to divide the full-instruction
16667 cost by the number of elements in the vector. */
16668 if (kind == scalar_load
16669 && sve_costs
16670 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16672 unsigned int nunits = vect_nunits_for_cost (vectype);
16673 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16674 return { sve_costs->gather_load_x64_cost, nunits };
16675 return { sve_costs->gather_load_x32_cost, nunits };
16678 /* Detect cases in which a scalar_store is really storing one element
16679 in a scatter operation. */
16680 if (kind == scalar_store
16681 && sve_costs
16682 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16683 return sve_costs->scatter_store_elt_cost;
16685 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16686 if (kind == vec_to_scalar
16687 && where == vect_body
16688 && sve_costs)
16690 unsigned int latency
16691 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16692 if (latency)
16693 return latency;
16696 /* Detect cases in which vec_to_scalar represents a single reduction
16697 instruction like FADDP or MAXV. */
16698 if (kind == vec_to_scalar
16699 && where == vect_epilogue
16700 && vect_is_reduction (stmt_info))
16701 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16703 case E_QImode:
16704 return simd_costs->reduc_i8_cost;
16706 case E_HImode:
16707 return simd_costs->reduc_i16_cost;
16709 case E_SImode:
16710 return simd_costs->reduc_i32_cost;
16712 case E_DImode:
16713 return simd_costs->reduc_i64_cost;
16715 case E_HFmode:
16716 case E_BFmode:
16717 return simd_costs->reduc_f16_cost;
16719 case E_SFmode:
16720 return simd_costs->reduc_f32_cost;
16722 case E_DFmode:
16723 return simd_costs->reduc_f64_cost;
16725 default:
16726 break;
16729 /* Otherwise stick with the original categorization. */
16730 return stmt_cost;
16733 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16734 for STMT_INFO, which has cost kind KIND and which when vectorized would
16735 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16736 targets. */
16737 static fractional_cost
16738 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16739 stmt_vec_info stmt_info, tree vectype,
16740 fractional_cost stmt_cost)
16742 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16743 vector register size or number of units. Integer promotions of this
16744 type therefore map to SXT[BHW] or UXT[BHW].
16746 Most loads have extending forms that can do the sign or zero extension
16747 on the fly. Optimistically assume that a load followed by an extension
16748 will fold to this form during combine, and that the extension therefore
16749 comes for free. */
16750 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16751 stmt_cost = 0;
16753 /* For similar reasons, vector_stmt integer truncations are a no-op,
16754 because we can just ignore the unused upper bits of the source. */
16755 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16756 stmt_cost = 0;
16758 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16759 but there are no equivalent instructions for SVE. This means that
16760 (all other things being equal) 128-bit SVE needs twice as many load
16761 and store instructions as Advanced SIMD in order to process vector pairs.
16763 Also, scalar code can often use LDP and STP to access pairs of values,
16764 so it is too simplistic to say that one SVE load or store replaces
16765 VF scalar loads and stores.
16767 Ideally we would account for this in the scalar and Advanced SIMD
16768 costs by making suitable load/store pairs as cheap as a single
16769 load/store. However, that would be a very invasive change and in
16770 practice it tends to stress other parts of the cost model too much.
16771 E.g. stores of scalar constants currently count just a store,
16772 whereas stores of vector constants count a store and a vec_init.
16773 This is an artificial distinction for AArch64, where stores of
16774 nonzero scalar constants need the same kind of register invariant
16775 as vector stores.
16777 An alternative would be to double the cost of any SVE loads and stores
16778 that could be paired in Advanced SIMD (and possibly also paired in
16779 scalar code). But this tends to stress other parts of the cost model
16780 in the same way. It also means that we can fall back to Advanced SIMD
16781 even if full-loop predication would have been useful.
16783 Here we go for a more conservative version: double the costs of SVE
16784 loads and stores if one iteration of the scalar loop processes enough
16785 elements for it to use a whole number of Advanced SIMD LDP or STP
16786 instructions. This makes it very likely that the VF would be 1 for
16787 Advanced SIMD, and so no epilogue should be needed. */
16788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16790 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16791 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16792 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16793 if (multiple_p (count * elt_bits, 256)
16794 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16795 stmt_cost *= 2;
16798 return stmt_cost;
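/* An illustrative note with hypothetical group sizes: in the check above,
a grouped access of four 64-bit elements per scalar iteration covers
4 * 64 = 256 bits, i.e. exactly the two 128-bit registers of one Advanced
SIMD LDP/STP, so (assuming aarch64_advsimd_ldp_stp_p also holds) the SVE
load/store cost is doubled; a group of three 32-bit elements (96 bits)
is not a multiple of 256 bits and is left unchanged. */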
16801 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16802 and which when vectorized would operate on vector type VECTYPE. Add the
16803 cost of any embedded operations. */
16804 static fractional_cost
16805 aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
16806 stmt_vec_info stmt_info, tree vectype,
16807 unsigned vec_flags, fractional_cost stmt_cost)
16809 if (vectype)
16811 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16813 /* Detect cases in which a vector load or store represents an
16814 LD[234] or ST[234] instruction. */
16815 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16817 case 2:
16818 stmt_cost += simd_costs->ld2_st2_permute_cost;
16819 break;
16821 case 3:
16822 stmt_cost += simd_costs->ld3_st3_permute_cost;
16823 break;
16825 case 4:
16826 stmt_cost += simd_costs->ld4_st4_permute_cost;
16827 break;
16830 gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
16831 if ((kind == scalar_stmt || kind == vector_stmt) && assign)
16833 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
16834 if (!vect_is_reduction (stmt_info)
16835 && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
16836 return 0;
16838 /* For vector boolean ANDs with a compare operand we just need
16839 one insn. */
16840 if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
16841 return 0;
16844 if (kind == vector_stmt || kind == vec_to_scalar)
16845 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16847 if (FLOAT_TYPE_P (cmp_type))
16848 stmt_cost += simd_costs->fp_stmt_cost;
16849 else
16850 stmt_cost += simd_costs->int_stmt_cost;
16854 if (kind == scalar_stmt)
16855 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16857 if (FLOAT_TYPE_P (cmp_type))
16858 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16859 else
16860 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16863 return stmt_cost;
16866 /* Return true if STMT_INFO is part of a reduction that has the form:
16868 r = r op ...;
16869 r = r op ...;
16871 with the single accumulator being read and written multiple times. */
16872 static bool
16873 aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
16875 if (!STMT_VINFO_REDUC_DEF (stmt_info))
16876 return false;
16878 auto reduc_info = info_for_reduction (vinfo, stmt_info);
16879 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
16882 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16883 and they describe an operation in the body of a vector loop. Record issue
16884 information relating to the vector operation in OPS. */
16885 void
16886 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16887 stmt_vec_info stmt_info,
16888 aarch64_vec_op_count *ops)
16890 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16891 if (!base_issue)
16892 return;
16893 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16894 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16896 /* Calculate the minimum cycles per iteration imposed by a reduction
16897 operation. */
16898 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16899 && vect_is_reduction (stmt_info))
16901 unsigned int base
16902 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16903 if (aarch64_force_single_cycle (m_vinfo, stmt_info))
16904 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
16905 and then accumulate that, but at the moment the loop-carried
16906 dependency includes all copies. */
16907 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16908 else
16909 ops->reduction_latency = MAX (ops->reduction_latency, base);
16912 if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))
16914 /* Assume that multiply-adds will become a single operation. */
16915 if (aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16916 return;
16918 /* Assume that bool AND with compare operands will become a single
16919 operation. */
16920 if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
16921 return;
16925 /* Count the basic operation cost associated with KIND. */
16926 switch (kind)
16928 case cond_branch_taken:
16929 case cond_branch_not_taken:
16930 case vector_gather_load:
16931 case vector_scatter_store:
16932 /* We currently don't expect these to be used in a loop body. */
16933 break;
16935 case vec_perm:
16936 case vec_promote_demote:
16937 case vec_construct:
16938 case vec_to_scalar:
16939 case scalar_to_vec:
16940 case vector_stmt:
16941 case scalar_stmt:
16942 ops->general_ops += count;
16943 break;
16945 case scalar_load:
16946 case vector_load:
16947 case unaligned_load:
16948 ops->loads += count;
16949 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16950 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16951 break;
16953 case vector_store:
16954 case unaligned_store:
16955 case scalar_store:
16956 ops->stores += count;
16957 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16958 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16959 break;
16962 /* Add any embedded comparison operations. */
16963 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16964 && vect_embedded_comparison_type (stmt_info))
16965 ops->general_ops += count;
16967 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16968 have only accounted for one. */
16969 if ((kind == vector_stmt || kind == vec_to_scalar)
16970 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16971 ops->general_ops += count;
16973 /* Count the predicate operations needed by an SVE comparison. */
16974 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16975 if (tree type = vect_comparison_type (stmt_info))
16977 unsigned int base = (FLOAT_TYPE_P (type)
16978 ? sve_issue->fp_cmp_pred_ops
16979 : sve_issue->int_cmp_pred_ops);
16980 ops->pred_ops += base * count;
16983 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16984 if (simd_issue)
16985 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16987 case 2:
16988 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16989 break;
16991 case 3:
16992 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16993 break;
16995 case 4:
16996 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16997 break;
17000 /* Add any overhead associated with gather loads and scatter stores. */
17001 if (sve_issue
17002 && (kind == scalar_load || kind == scalar_store)
17003 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
17005 unsigned int pairs = CEIL (count, 2);
17006 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
17007 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
17011 /* Return true if STMT_INFO contains a memory access and if the constant
17012 component of the memory address is aligned to SIZE bytes. */
17013 static bool
17014 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
17015 poly_uint64 size)
17017 if (!STMT_VINFO_DATA_REF (stmt_info))
17018 return false;
17020 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
17021 stmt_info = first_stmt;
17022 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
17023 /* Needed for gathers & scatters, for example. */
17024 if (!constant_offset)
17025 return false;
17027 return multiple_p (wi::to_poly_offset (constant_offset), size);
17030 /* Check if a scalar or vector stmt could be part of a region of code
17031 that does nothing more than store values to memory, in the scalar
17032 case using STP. Return the cost of the stmt if so, counting 2 for
17033 one instruction. Return ~0U otherwise.
17035 The arguments are a subset of those passed to add_stmt_cost. */
17036 unsigned int
17037 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
17038 stmt_vec_info stmt_info, tree vectype)
17040 /* Code that stores vector constants uses a vector_load to create
17041 the constant. We don't apply the heuristic to that case for two
17042 main reasons:
17044 - At the moment, STPs are only formed via peephole2, and the
17045 constant scalar moves would often come between STRs and so
17046 prevent STP formation.
17048 - The scalar code also has to load the constant somehow, and that
17049 isn't costed. */
17050 switch (kind)
17052 case scalar_to_vec:
17053 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17054 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
17056 case vec_construct:
17057 if (FLOAT_TYPE_P (vectype))
17058 /* Count 1 insn for the maximum number of FP->SIMD INS
17059 instructions. */
17060 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
17062 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17063 maximum number of GPR->SIMD INS instructions. */
17064 return vect_nunits_for_cost (vectype) * 4 * count;
17066 case vector_store:
17067 case unaligned_store:
17068 /* Count 1 insn per vector if we can't form STP Q pairs. */
17069 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17070 return count * 2;
17071 if (aarch64_tune_params.extra_tuning_flags
17072 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
17073 return count * 2;
17075 if (stmt_info)
17077 /* Assume we won't be able to use STP if the constant offset
17078 component of the address is misaligned. ??? This could be
17079 removed if we formed STP pairs earlier, rather than relying
17080 on peephole2. */
17081 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
17082 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17083 return count * 2;
17085 return CEIL (count, 2) * 2;
17087 case scalar_store:
17088 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
17090 /* Check for a mode in which STP pairs can be formed. */
17091 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
17092 if (maybe_ne (size, 4) && maybe_ne (size, 8))
17093 return ~0U;
17095 /* Assume we won't be able to use STP if the constant offset
17096 component of the address is misaligned. ??? This could be
17097 removed if we formed STP pairs earlier, rather than relying
17098 on peephole2. */
17099 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17100 return ~0U;
17102 return count;
17104 default:
17105 return ~0U;
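/* An illustrative worked example using the unit values above (hypothetical
element counts): a 4-element vector built from FP scalars is costed as
(4 - 1) * 2 = 6 units, the same vector built from GPRs as 4 * 4 = 16
units, and a pair of 128-bit vector stores that can form an STP Q pair
as CEIL (2, 2) * 2 = 2 units, i.e. one instruction. */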
17109 unsigned
17110 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
17111 stmt_vec_info stmt_info, slp_tree,
17112 tree vectype, int misalign,
17113 vect_cost_model_location where)
17115 fractional_cost stmt_cost
17116 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
17118 bool in_inner_loop_p = (where == vect_body
17119 && stmt_info
17120 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
17122 /* Do one-time initialization based on the vinfo. */
17123 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17124 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
17126 if (loop_vinfo)
17127 analyze_loop_vinfo (loop_vinfo);
17129 m_analyzed_vinfo = true;
17132 /* Apply the heuristic described above m_stp_sequence_cost. */
17133 if (m_stp_sequence_cost != ~0U)
17135 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
17136 stmt_info, vectype);
17137 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
17140 /* Try to get a more accurate cost by looking at STMT_INFO instead
17141 of just looking at KIND. */
17142 if (stmt_info && aarch64_use_new_vector_costs_p ())
17144 /* If we scalarize a strided store, the vectorizer costs one
17145 vec_to_scalar for each element. However, we can store the first
17146 element using an FP store without a separate extract step. */
17147 if (vect_is_store_elt_extraction (kind, stmt_info))
17148 count -= 1;
17150 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
17151 stmt_info, stmt_cost);
17153 if (vectype && m_vec_flags)
17154 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
17155 stmt_info, vectype,
17156 where, stmt_cost);
17159 /* Do any SVE-specific adjustments to the cost. */
17160 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
17161 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17162 vectype, stmt_cost);
17164 /* Vector promotion and demotion requires us to widen the operation first
17165 and only after that perform the conversion. Unfortunately the mid-end
17166 expects this to be doable as a single operation and doesn't pass on
17167 enough context here for us to tell which operation is happening. To
17168 account for this we count every promote-demote operation twice and if
17169 the previously costed operation was also a promote-demote we reduce
17170 the cost of the currently being costed operation to simulate the final
17171 conversion cost. Note that for SVE we can do better here if the converted
17172 value comes from a load since the widening load would consume the widening
17173 operations. However since we're in stage 3 we can't change the helper
17174 vect_is_extending_load, and duplicating the code does not seem worthwhile. */
17175 gassign *assign = NULL;
17176 if (kind == vec_promote_demote
17177 && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
17178 && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
17180 auto new_count = count * 2 - m_num_last_promote_demote;
17181 m_num_last_promote_demote = count;
17182 count = new_count;
17184 else
17185 m_num_last_promote_demote = 0;
17187 if (stmt_info && aarch64_use_new_vector_costs_p ())
17189 /* Account for any extra "embedded" costs that apply additively
17190 to the base cost calculated above. */
17191 stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17192 vectype, m_vec_flags, stmt_cost);
17194 /* If we're recording a nonzero vector loop body cost for the
17195 innermost loop, also estimate the operations that would need
17196 to be issued by all relevant implementations of the loop. */
17197 if (loop_vinfo
17198 && (m_costing_for_scalar || where == vect_body)
17199 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
17200 && stmt_cost != 0)
17201 for (auto &ops : m_ops)
17202 count_ops (count, kind, stmt_info, &ops);
17204 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17205 estimate the number of statements in the unrolled Advanced SIMD
17206 loop. For simplicity, we assume that one iteration of the
17207 Advanced SIMD loop would need the same number of statements
17208 as one iteration of the SVE loop. */
17209 if (where == vect_body && m_unrolled_advsimd_niters)
17210 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
17212 /* Detect the use of an averaging operation. */
17213 gimple *stmt = stmt_info->stmt;
17214 if (is_gimple_call (stmt)
17215 && gimple_call_internal_p (stmt))
17217 switch (gimple_call_internal_fn (stmt))
17219 case IFN_AVG_FLOOR:
17220 case IFN_AVG_CEIL:
17221 m_has_avg = true;
17222 default:
17223 break;
17228 /* If the statement stores to a decl that is known to be the argument
17229 to a vld1 in the same function, ignore the store for costing purposes.
17230 See the comment above m_stores_to_vector_load_decl for more details. */
17231 if (stmt_info
17232 && (kind == vector_store || kind == unaligned_store)
17233 && aarch64_accesses_vector_load_decl_p (stmt_info))
17235 stmt_cost = 0;
17236 m_stores_to_vector_load_decl = true;
17239 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
17242 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17243 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17244 says that we should prefer the Advanced SIMD loop. */
17245 bool
17246 aarch64_vector_costs::prefer_unrolled_loop () const
17248 if (!m_unrolled_advsimd_stmts)
17249 return false;
17251 if (dump_enabled_p ())
17252 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
17253 " unrolled Advanced SIMD loop = "
17254 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
17255 m_unrolled_advsimd_stmts);
17257 /* The balance here is tricky. On the one hand, we can't be sure whether
17258 the code is vectorizable with Advanced SIMD or not. However, even if
17259 it isn't vectorizable with Advanced SIMD, there's a possibility that
17260 the scalar code could also be unrolled. Some of the code might then
17261 benefit from SLP, or from using LDP and STP. We therefore apply
17262 the heuristic regardless of can_use_advsimd_p. */
17263 return (m_unrolled_advsimd_stmts
17264 && (m_unrolled_advsimd_stmts
17265 <= (unsigned int) param_max_completely_peeled_insns));
17268 /* Subroutine of adjust_body_cost for handling SVE. Use the issue
17269 information in OPS to work out how fast the SVE code can be issued
17270 and compare it to the equivalent value for scalar code
17271 (SCALAR_CYCLES_PER_ITER).
17274 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17275 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17276 is true if we think the loop body is too expensive. */
17278 fractional_cost
17279 aarch64_vector_costs::
17280 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
17281 fractional_cost scalar_cycles_per_iter,
17282 unsigned int orig_body_cost, unsigned int *body_cost,
17283 bool *should_disparage)
17285 if (dump_enabled_p ())
17286 ops->dump ();
17288 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
17289 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
17291 /* If the scalar version of the loop could issue at least as
17292 quickly as the predicate parts of the SVE loop, make the SVE loop
17293 prohibitively expensive. In this case vectorization is adding an
17294 overhead that the original scalar code didn't have.
17296 This is mostly intended to detect cases in which WHILELOs dominate
17297 for very tight loops, which is something that normal latency-based
17298 costs would not model. Adding this kind of cliffedge would be
17299 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17300 code in the caller handles that case in a more conservative way. */
17301 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17302 if (scalar_cycles_per_iter < sve_estimate)
17304 unsigned int min_cost
17305 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17306 if (*body_cost < min_cost)
17308 if (dump_enabled_p ())
17309 dump_printf_loc (MSG_NOTE, vect_location,
17310 "Increasing body cost to %d because the"
17311 " scalar code could issue within the limit"
17312 " imposed by predicate operations\n",
17313 min_cost);
17314 *body_cost = min_cost;
17315 *should_disparage = true;
17319 return sve_cycles_per_iter;
17322 unsigned int
17323 aarch64_vector_costs::determine_suggested_unroll_factor ()
17325 bool sve = m_vec_flags & VEC_ANY_SVE;
17326 /* If we are trying to unroll an Advanced SIMD main loop that contains
17327 an averaging operation that we do not support with SVE and we might use a
17328 predicated epilogue, we need to be conservative and block unrolling as
17329 this might lead to a less optimal loop for the first and only epilogue
17330 using the original loop's vectorization factor.
17331 TODO: Remove this constraint when we add support for multiple epilogue
17332 vectorization. */
17333 if (!sve && !TARGET_SVE2 && m_has_avg)
17334 return 1;
17336 unsigned int max_unroll_factor = 1;
17337 for (auto vec_ops : m_ops)
17339 aarch64_simd_vec_issue_info const *vec_issue
17340 = vec_ops.simd_issue_info ();
17341 if (!vec_issue)
17342 return 1;
17343 /* Limit unroll factor to a value adjustable by the user, the default
17344 value is 4. */
17345 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17346 unsigned int factor
17347 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17348 unsigned int temp;
17350 /* Sanity check; this should never happen. */
17351 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17352 return 1;
17354 /* Check stores. */
17355 if (vec_ops.stores > 0)
17357 temp = CEIL (factor * vec_issue->stores_per_cycle,
17358 vec_ops.stores);
17359 unroll_factor = MIN (unroll_factor, temp);
17362 /* Check loads + stores. */
17363 if (vec_ops.loads > 0)
17365 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17366 vec_ops.loads + vec_ops.stores);
17367 unroll_factor = MIN (unroll_factor, temp);
17370 /* Check general ops. */
17371 if (vec_ops.general_ops > 0)
17373 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17374 vec_ops.general_ops);
17375 unroll_factor = MIN (unroll_factor, temp);
17377 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17380 /* Make sure unroll factor is power of 2. */
17381 return 1 << ceil_log2 (max_unroll_factor);
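/* An illustrative sketch of the unroll-factor arithmetic above with made-up
issue parameters (hypothetical names, not compiled as part of this file):
assume a reduction latency of 4, one store, one load and eight general ops
per iteration, issue rates of 2 stores, 3 loads/stores and 4 general ops
per cycle, and an unroll limit of 4. */
#if 0
#include <algorithm>
#include <cassert>

static unsigned int
example_unroll_factor ()
{
  unsigned int factor = 4;          /* hypothetical reduction latency */
  unsigned int stores = 1, loads = 1, general_ops = 8;
  unsigned int stores_per_cycle = 2, loads_stores_per_cycle = 3;
  unsigned int general_ops_per_cycle = 4;
  unsigned int unroll_factor = 4;   /* hypothetical aarch64_vect_unroll_limit */

  auto ceil_div = [](unsigned int a, unsigned int b) { return (a + b - 1) / b; };
  unroll_factor = std::min (unroll_factor,
			    ceil_div (factor * stores_per_cycle, stores));
  unroll_factor = std::min (unroll_factor,
			    ceil_div (factor * loads_stores_per_cycle,
				      loads + stores));
  unroll_factor = std::min (unroll_factor,
			    ceil_div (factor * general_ops_per_cycle,
				      general_ops));

  /* Stores would allow 8 and loads + stores 6, but the general ops allow
     only CEIL (16, 8) = 2, so the suggested factor is 2 (already a power
     of two, so the final rounding changes nothing).  */
  assert (unroll_factor == 2);
  return unroll_factor;
}
#endif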
17384 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17385 and return the new cost. */
17386 unsigned int
17387 aarch64_vector_costs::
17388 adjust_body_cost (loop_vec_info loop_vinfo,
17389 const aarch64_vector_costs *scalar_costs,
17390 unsigned int body_cost)
17392 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17393 return body_cost;
17395 const auto &scalar_ops = scalar_costs->m_ops[0];
17396 const auto &vector_ops = m_ops[0];
17397 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17398 unsigned int orig_body_cost = body_cost;
17399 bool should_disparage = false;
17401 if (dump_enabled_p ())
17402 dump_printf_loc (MSG_NOTE, vect_location,
17403 "Original vector body cost = %d\n", body_cost);
17405 fractional_cost scalar_cycles_per_iter
17406 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17408 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17410 if (dump_enabled_p ())
17412 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17413 dump_printf_loc (MSG_NOTE, vect_location,
17414 "Vector loop iterates at most %wd times\n",
17415 m_num_vector_iterations);
17416 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17417 scalar_ops.dump ();
17418 dump_printf_loc (MSG_NOTE, vect_location,
17419 " estimated cycles per vector iteration"
17420 " (for VF %d) = %f\n",
17421 estimated_vf, scalar_cycles_per_iter.as_double ());
17424 if (vector_ops.sve_issue_info ())
17426 if (dump_enabled_p ())
17427 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17428 vector_cycles_per_iter
17429 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17430 orig_body_cost, &body_cost, &should_disparage);
17432 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17434 /* Also take Neoverse V1 tuning into account, doubling the
17435 scalar and Advanced SIMD estimates to account for the
17436 doubling in SVE vector length. */
17437 if (dump_enabled_p ())
17438 dump_printf_loc (MSG_NOTE, vect_location,
17439 "Neoverse V1 estimate:\n");
17440 auto vf_factor = m_ops[1].vf_factor ();
17441 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17442 orig_body_cost, &body_cost, &should_disparage);
17445 else
17447 if (dump_enabled_p ())
17449 dump_printf_loc (MSG_NOTE, vect_location,
17450 "Vector issue estimate:\n");
17451 vector_ops.dump ();
17455 /* Decide whether to stick to latency-based costs or whether to try to
17456 take issue rates into account. */
17457 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17458 if (m_vec_flags & VEC_ANY_SVE)
17459 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17461 if (m_num_vector_iterations >= 1
17462 && m_num_vector_iterations < threshold)
17464 if (dump_enabled_p ())
17465 dump_printf_loc (MSG_NOTE, vect_location,
17466 "Low iteration count, so using pure latency"
17467 " costs\n");
17469 /* Increase the cost of the vector code if it looks like the scalar code
17470 could issue more quickly. These values are only rough estimates,
17471 so minor differences should only result in minor changes. */
17472 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17474 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17475 scalar_cycles_per_iter);
17476 if (dump_enabled_p ())
17477 dump_printf_loc (MSG_NOTE, vect_location,
17478 "Increasing body cost to %d because scalar code"
17479 " would issue more quickly\n", body_cost);
17481 /* In general, it's expected that the proposed vector code would be able
17482 to issue more quickly than the original scalar code. This should
17483 already be reflected to some extent in the latency-based costs.
17485 However, the latency-based costs effectively assume that the scalar
17486 code and the vector code execute serially, which tends to underplay
17487 one important case: if the real (non-serialized) execution time of
17488 a scalar iteration is dominated by loop-carried dependencies,
17489 and if the vector code is able to reduce both the length of
17490 the loop-carried dependencies *and* the number of cycles needed
17491 to issue the code in general, we can be more confident that the
17492 vector code is an improvement, even if adding the other (non-loop-carried)
17493 latencies tends to hide this saving. We therefore reduce the cost of the
17494 vector loop body in proportion to the saving. */
17495 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17496 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17497 && scalar_cycles_per_iter > vector_cycles_per_iter
17498 && !should_disparage)
17500 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17501 scalar_cycles_per_iter);
17502 if (dump_enabled_p ())
17503 dump_printf_loc (MSG_NOTE, vect_location,
17504 "Decreasing body cost to %d account for smaller"
17505 " reduction latency\n", body_cost);
17508 return body_cost;
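/* An illustrative worked example with hypothetical numbers: if the scalar
code is estimated at 2 cycles per scalar iteration and the costing VF is
4, scalar_cycles_per_iter is 8; if the vector loop needs 12 cycles per
iteration, the vector body cost above is scaled by 12 / 8, e.g. from 100
to 150, because the scalar code would issue more quickly. */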
17511 void
17512 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17514 /* Record the issue information for any SVE WHILE instructions that the
17515 loop needs. */
17516 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17517 if (!m_ops.is_empty ()
17518 && loop_vinfo
17519 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
17521 unsigned int num_masks = 0;
17522 rgroup_controls *rgm;
17523 unsigned int num_vectors_m1;
17524 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
17525 num_vectors_m1, rgm)
17526 if (rgm->type)
17527 num_masks += num_vectors_m1 + 1;
17528 for (auto &ops : m_ops)
17529 if (auto *issue = ops.sve_issue_info ())
17530 ops.pred_ops += num_masks * issue->while_pred_ops;
17533 auto *scalar_costs
17534 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17535 if (loop_vinfo
17536 && m_vec_flags
17537 && aarch64_use_new_vector_costs_p ())
17539 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17540 m_costs[vect_body]);
17541 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17544 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17545 the scalar code in the event of a tie, since there is more chance
17546 of scalar code being optimized with surrounding operations.
17548 In addition, if the vector body is a simple store to a decl that
17549 is elsewhere loaded using vld1, strongly prefer the vector form,
17550 to the extent of giving the prologue a zero cost. See the comment
17551 above m_stores_to_vector_load_decl for details. */
17552 if (!loop_vinfo
17553 && scalar_costs
17554 && m_stp_sequence_cost != ~0U)
17556 if (m_stores_to_vector_load_decl)
17557 m_costs[vect_prologue] = 0;
17558 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17559 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17562 vector_costs::finish_cost (scalar_costs);
17565 bool
17566 aarch64_vector_costs::
17567 better_main_loop_than_p (const vector_costs *uncast_other) const
17569 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17571 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17572 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17574 if (dump_enabled_p ())
17575 dump_printf_loc (MSG_NOTE, vect_location,
17576 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17577 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17578 vect_vf_for_cost (this_loop_vinfo),
17579 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17580 vect_vf_for_cost (other_loop_vinfo));
17582 /* Apply the unrolling heuristic described above
17583 m_unrolled_advsimd_niters. */
17584 if (bool (m_unrolled_advsimd_stmts)
17585 != bool (other->m_unrolled_advsimd_stmts))
17587 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17588 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17589 if (this_prefer_unrolled != other_prefer_unrolled)
17591 if (dump_enabled_p ())
17592 dump_printf_loc (MSG_NOTE, vect_location,
17593 "Preferring Advanced SIMD loop because"
17594 " it can be unrolled\n");
17595 return other_prefer_unrolled;
17599 for (unsigned int i = 0; i < m_ops.length (); ++i)
17601 if (dump_enabled_p ())
17603 if (i)
17604 dump_printf_loc (MSG_NOTE, vect_location,
17605 "Reconsidering with subtuning %d\n", i);
17606 dump_printf_loc (MSG_NOTE, vect_location,
17607 "Issue info for %s loop:\n",
17608 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17609 this->m_ops[i].dump ();
17610 dump_printf_loc (MSG_NOTE, vect_location,
17611 "Issue info for %s loop:\n",
17612 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17613 other->m_ops[i].dump ();
17616 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17617 * this->m_ops[i].vf_factor ());
17618 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17619 * other->m_ops[i].vf_factor ());
17621 /* If it appears that one loop could process the same amount of data
17622 in fewer cycles, prefer that loop over the other one. */
17623 fractional_cost this_cost
17624 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17625 fractional_cost other_cost
17626 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17627 if (dump_enabled_p ())
17629 dump_printf_loc (MSG_NOTE, vect_location,
17630 "Weighted cycles per iteration of %s loop ~= %f\n",
17631 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17632 this_cost.as_double ());
17633 dump_printf_loc (MSG_NOTE, vect_location,
17634 "Weighted cycles per iteration of %s loop ~= %f\n",
17635 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17636 other_cost.as_double ());
17638 if (this_cost != other_cost)
17640 if (dump_enabled_p ())
17641 dump_printf_loc (MSG_NOTE, vect_location,
17642 "Preferring loop with lower cycles"
17643 " per iteration\n");
17644 return this_cost < other_cost;
17647 /* If the issue rate of SVE code is limited by predicate operations
17648 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17649 and if Advanced SIMD code could issue within the limit imposed
17650 by the predicate operations, the predicate operations are adding an
17651 overhead that the original code didn't have and so we should prefer
17652 the Advanced SIMD version. */
17653 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17654 const aarch64_vec_op_count &b) -> bool
17656 if (a.pred_ops == 0
17657 && (b.min_pred_cycles_per_iter ()
17658 > b.min_nonpred_cycles_per_iter ()))
17660 if (dump_enabled_p ())
17661 dump_printf_loc (MSG_NOTE, vect_location,
17662 "Preferring Advanced SIMD loop since"
17663 " SVE loop is predicate-limited\n");
17664 return true;
17666 return false;
17668 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17669 return true;
17670 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17671 return false;
17674 return vector_costs::better_main_loop_than_p (other);
17677 static void initialize_aarch64_code_model (struct gcc_options *);
17679 /* Parse the TO_PARSE string and put the architecture struct that it
17680 selects into RES and the architectural features into ISA_FLAGS.
17681 Return an aarch_parse_opt_result describing the parse result.
17682 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17683 When the TO_PARSE string contains an invalid extension,
17684 a copy of the string is created and stored to INVALID_EXTENSION. */
17686 static enum aarch_parse_opt_result
17687 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17688 aarch64_feature_flags *isa_flags,
17689 std::string *invalid_extension)
17691 const char *ext;
17692 const struct processor *arch;
17693 size_t len;
17695 ext = strchr (to_parse, '+');
17697 if (ext != NULL)
17698 len = ext - to_parse;
17699 else
17700 len = strlen (to_parse);
17702 if (len == 0)
17703 return AARCH_PARSE_MISSING_ARG;
17706 /* Loop through the list of supported ARCHes to find a match. */
17707 for (arch = all_architectures; arch->name != NULL; arch++)
17709 if (strlen (arch->name) == len
17710 && strncmp (arch->name, to_parse, len) == 0)
17712 auto isa_temp = arch->flags;
17714 if (ext != NULL)
17716 /* TO_PARSE string contains at least one extension. */
17717 enum aarch_parse_opt_result ext_res
17718 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17720 if (ext_res != AARCH_PARSE_OK)
17721 return ext_res;
17723 /* Extension parsing was successful. Confirm the result
17724 arch and ISA flags. */
17725 *res = arch;
17726 *isa_flags = isa_temp;
17727 return AARCH_PARSE_OK;
17731 /* ARCH name not found in list. */
17732 return AARCH_PARSE_INVALID_ARG;
17735 /* Parse the TO_PARSE string and put the processor that it selects into RES
17736 and the architectural features into ISA_FLAGS. Return an aarch_parse_opt_result
17737 describing the parse result. If there is an error parsing, RES and
17738 ISA_FLAGS are left unchanged.
17739 When the TO_PARSE string contains an invalid extension,
17740 a copy of the string is created and stored to INVALID_EXTENSION. */
17742 static enum aarch_parse_opt_result
17743 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17744 aarch64_feature_flags *isa_flags,
17745 std::string *invalid_extension)
17747 const char *ext;
17748 const struct processor *cpu;
17749 size_t len;
17751 ext = strchr (to_parse, '+');
17753 if (ext != NULL)
17754 len = ext - to_parse;
17755 else
17756 len = strlen (to_parse);
17758 if (len == 0)
17759 return AARCH_PARSE_MISSING_ARG;
17762 /* Loop through the list of supported CPUs to find a match. */
17763 for (cpu = all_cores; cpu->name != NULL; cpu++)
17765 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17767 auto isa_temp = cpu->flags;
17769 if (ext != NULL)
17771 /* TO_PARSE string contains at least one extension. */
17772 enum aarch_parse_opt_result ext_res
17773 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17775 if (ext_res != AARCH_PARSE_OK)
17776 return ext_res;
17778 /* Extension parsing was successful. Confirm the result
17779 cpu and ISA flags. */
17780 *res = cpu;
17781 *isa_flags = isa_temp;
17782 return AARCH_PARSE_OK;
17786 /* CPU name not found in list. */
17787 return AARCH_PARSE_INVALID_ARG;
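/* An illustrative example (hypothetical option string): for a string such
as "cortex-a76+crypto+nofp16", the code above takes the text up to the
first '+' ("cortex-a76") as the CPU name and hands the remainder
("+crypto+nofp16") to aarch64_parse_extension; whether those particular
names are accepted depends on the CPU and extension tables. */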
17790 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17791 Return an aarch_parse_opt_result describing the parse result.
17792 If the parsing fails, RES is left unchanged. */
17794 static enum aarch_parse_opt_result
17795 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17797 const struct processor *cpu;
17799 /* Loop through the list of supported CPUs to find a match. */
17800 for (cpu = all_cores; cpu->name != NULL; cpu++)
17802 if (strcmp (cpu->name, to_parse) == 0)
17804 *res = cpu;
17805 return AARCH_PARSE_OK;
17809 /* CPU name not found in list. */
17810 return AARCH_PARSE_INVALID_ARG;
17813 /* Parse TOKEN, which has length LENGTH to see if it is an option
17814 described in FLAG. If it is, return the index bit for that fusion type.
17815 If not, error (printing OPTION_NAME) and return zero. */
17817 static unsigned int
17818 aarch64_parse_one_option_token (const char *token,
17819 size_t length,
17820 const struct aarch64_flag_desc *flag,
17821 const char *option_name)
17823 for (; flag->name != NULL; flag++)
17825 if (length == strlen (flag->name)
17826 && !strncmp (flag->name, token, length))
17827 return flag->flag;
17830 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17831 return 0;
17834 /* Parse OPTION which is a comma-separated list of flags to enable.
17835 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17836 default state we inherit from the CPU tuning structures. OPTION_NAME
17837 gives the top-level option we are parsing in the -moverride string,
17838 for use in error messages. */
17840 static unsigned int
17841 aarch64_parse_boolean_options (const char *option,
17842 const struct aarch64_flag_desc *flags,
17843 unsigned int initial_state,
17844 const char *option_name)
17846 const char separator = '.';
17847 const char* specs = option;
17848 const char* ntoken = option;
17849 unsigned int found_flags = initial_state;
17851 while ((ntoken = strchr (specs, separator)))
17853 size_t token_length = ntoken - specs;
17854 unsigned token_ops = aarch64_parse_one_option_token (specs,
17855 token_length,
17856 flags,
17857 option_name);
17858 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17859 in the token stream, reset the supported operations. So:
17861 adrp+add.cmp+branch.none.adrp+add
17863 would have the result of turning on only adrp+add fusion. */
17864 if (!token_ops)
17865 found_flags = 0;
17867 found_flags |= token_ops;
17868 specs = ++ntoken;
17871 /* We ended with a trailing separator, so the string is ill-formed. */
17872 if (!(*specs))
17874 error ("%qs string ill-formed", option_name);
17875 return 0;
17878 /* We still have one more token to parse. */
17879 size_t token_length = strlen (specs);
17880 unsigned token_ops = aarch64_parse_one_option_token (specs,
17881 token_length,
17882 flags,
17883 option_name);
17884 if (!token_ops)
17885 found_flags = 0;
17887 found_flags |= token_ops;
17888 return found_flags;
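/* Illustrative sketch, not part of the sources: a stand-alone version of
   the '.'-separated parsing above, with a made-up two-entry flag table
   standing in for aarch64_fusible_pairs/aarch64_tuning_flags.  It shows why
   "adrp+add.cmp+branch.none.adrp+add" ends up enabling only adrp+add.  */
#if 0
struct sketch_flag_desc { const char *name; unsigned int flag; };
static const struct sketch_flag_desc sketch_flags[] =
  { { "adrp+add", 1u << 0 }, { "cmp+branch", 1u << 1 }, { NULL, 0 } };

static unsigned int
sketch_lookup_token (const char *token, size_t length)
{
  for (const sketch_flag_desc *f = sketch_flags; f->name; f++)
    if (length == strlen (f->name) && strncmp (f->name, token, length) == 0)
      return f->flag;
  return 0;	/* Unknown token or "none": the caller resets the set.  */
}

static unsigned int
sketch_parse_boolean_options (const char *option, unsigned int initial_state)
{
  unsigned int found = initial_state;
  const char *specs = option;
  const char *dot;
  while ((dot = strchr (specs, '.')))
    {
      unsigned int ops = sketch_lookup_token (specs, (size_t) (dot - specs));
      if (!ops)
	found = 0;
      found |= ops;
      specs = dot + 1;
    }
  unsigned int ops = sketch_lookup_token (specs, strlen (specs));
  if (!ops)
    found = 0;
  return found | ops;
}
/* sketch_parse_boolean_options ("adrp+add.cmp+branch.none.adrp+add", 0)
   == 1, i.e. only adrp+add fusion remains enabled.  */
#endif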
17891 /* Support for overriding instruction fusion. */
17893 static void
17894 aarch64_parse_fuse_string (const char *fuse_string,
17895 struct tune_params *tune)
17897 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17898 aarch64_fusible_pairs,
17899 tune->fusible_ops,
17900 "fuse=");
17903 /* Support for overriding other tuning flags. */
17905 static void
17906 aarch64_parse_tune_string (const char *tune_string,
17907 struct tune_params *tune)
17909 tune->extra_tuning_flags
17910 = aarch64_parse_boolean_options (tune_string,
17911 aarch64_tuning_flags,
17912 tune->extra_tuning_flags,
17913 "tune=");
17916 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
17917 Accept the valid SVE vector widths allowed by
17918 aarch64_sve_vector_bits_enum and use it to override sve_width
17919 in TUNE. */
17921 static void
17922 aarch64_parse_sve_width_string (const char *tune_string,
17923 struct tune_params *tune)
17925 int width = -1;
17927 int n = sscanf (tune_string, "%d", &width);
17928 if (n == EOF)
17930 error ("invalid format for %<sve_width%>");
17931 return;
17933 switch (width)
17935 case SVE_128:
17936 case SVE_256:
17937 case SVE_512:
17938 case SVE_1024:
17939 case SVE_2048:
17940 break;
17941 default:
17942 error ("invalid %<sve_width%> value: %d", width);
17944 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17947 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
17948 we understand. If it is, extract the option string and hand off to
17949 the appropriate function. */
17951 void
17952 aarch64_parse_one_override_token (const char* token,
17953 size_t length,
17954 struct tune_params *tune)
17956 const struct aarch64_tuning_override_function *fn
17957 = aarch64_tuning_override_functions;
17959 const char *option_part = strchr (token, '=');
17960 if (!option_part)
17962 error ("tuning string missing in option (%s)", token);
17963 return;
17966 /* Get the length of the option name. */
17967 length = option_part - token;
17968 /* Skip the '=' to get to the option string. */
17969 option_part++;
17971 for (; fn->name != NULL; fn++)
17973 if (!strncmp (fn->name, token, length))
17975 fn->parse_override (option_part, tune);
17976 return;
17980 error ("unknown tuning option (%s)",token);
17981 return;
17984 /* Set the default -mtls-size and clamp it to what the code model supports. */
17986 static void
17987 initialize_aarch64_tls_size (struct gcc_options *opts)
17989 if (aarch64_tls_size == 0)
17990 aarch64_tls_size = 24;
17992 switch (opts->x_aarch64_cmodel_var)
17994 case AARCH64_CMODEL_TINY:
17995 /* Both the default and maximum TLS size allowed under tiny are 1M, which
17996 needs two instructions to address, so we clamp the size to 24. */
17997 if (aarch64_tls_size > 24)
17998 aarch64_tls_size = 24;
17999 break;
18000 case AARCH64_CMODEL_SMALL:
18001 /* The maximum TLS size allowed under small is 4G. */
18002 if (aarch64_tls_size > 32)
18003 aarch64_tls_size = 32;
18004 break;
18005 case AARCH64_CMODEL_LARGE:
18006 /* The maximum TLS size allowed under large is 16E.
18007 FIXME: 16E would require a 64-bit offset, but we only support 48-bit offsets for now. */
18008 if (aarch64_tls_size > 48)
18009 aarch64_tls_size = 48;
18010 break;
18011 default:
18012 gcc_unreachable ();
18015 return;
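/* Illustrative sketch, not part of the sources: the clamping above reduced
   to a pure function from code model and requested -mtls-size to the value
   actually used.  The enum values are reused purely as labels.  */
#if 0
static int
sketch_clamp_tls_size (enum aarch64_code_model model, int requested)
{
  if (requested == 0)
    requested = 24;	/* Default, as above.  */
  int max_bits = 48;
  switch (model)
    {
    case AARCH64_CMODEL_TINY:  max_bits = 24; break;  /* 1M, two insns.  */
    case AARCH64_CMODEL_SMALL: max_bits = 32; break;  /* Up to 4G.  */
    case AARCH64_CMODEL_LARGE: max_bits = 48; break;  /* 48-bit offsets.  */
    default: break;
    }
  return requested > max_bits ? max_bits : requested;
}
/* E.g. -mcmodel=tiny -mtls-size=32 is silently reduced to 24.  */
#endif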
18018 /* Return the CPU corresponding to the enum CPU. */
18020 static const struct processor *
18021 aarch64_get_tune_cpu (enum aarch64_processor cpu)
18023 gcc_assert (cpu != aarch64_none);
18025 return &all_cores[cpu];
18028 /* Return the architecture corresponding to the enum ARCH. */
18030 static const struct processor *
18031 aarch64_get_arch (enum aarch64_arch arch)
18033 gcc_assert (arch != aarch64_no_arch);
18035 return &all_architectures[arch];
18038 /* Parse STRING looking for options in the format:
18039 string :: option:string
18040 option :: name=substring
18041 name :: {a-z}
18042 substring :: defined by option. */
18044 static void
18045 aarch64_parse_override_string (const char* input_string,
18046 struct tune_params* tune)
18048 const char separator = ':';
18049 size_t string_length = strlen (input_string) + 1;
18050 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
18051 char *string = string_root;
18052 strncpy (string, input_string, string_length);
18053 string[string_length - 1] = '\0';
18055 char* ntoken = string;
18057 while ((ntoken = strchr (string, separator)))
18059 size_t token_length = ntoken - string;
18060 /* Make this substring look like a string. */
18061 *ntoken = '\0';
18062 aarch64_parse_one_override_token (string, token_length, tune);
18063 string = ++ntoken;
18066 /* One last option to parse. */
18067 aarch64_parse_one_override_token (string, strlen (string), tune);
18068 free (string_root);
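/* Illustrative sketch, not part of the sources: the ':' and '=' splitting
   of a -moverride string, using strtok_r instead of the in-place strchr
   walk above.  "some_flag" below is a placeholder value; sve_width=256 is
   a real override handled by aarch64_parse_sve_width_string.  */
#if 0
static void
sketch_parse_override (const char *input)
{
  char *copy = xstrdup (input);
  char *save = NULL;
  for (char *tok = strtok_r (copy, ":", &save); tok;
       tok = strtok_r (NULL, ":", &save))
    {
      char *eq = strchr (tok, '=');
      if (!eq)
	continue;	/* The real code diagnoses a missing '='.  */
      *eq = '\0';
      printf ("option %s = %s\n", tok, eq + 1);
    }
  free (copy);
}
/* sketch_parse_override ("tune=some_flag:sve_width=256") reports two
   name/value pairs; the real code dispatches each name through
   aarch64_tuning_override_functions.  */
#endif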
18071 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18072 are best for a generic target with the currently-enabled architecture
18073 extensions. */
18074 static void
18075 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
18077 /* Neoverse V1 is the only core that is known to benefit from
18078 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18079 point enabling it for SVE2 and above. */
18080 if (TARGET_SVE2)
18081 current_tune.extra_tuning_flags
18082 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
18085 static void
18086 aarch64_override_options_after_change_1 (struct gcc_options *opts)
18088 /* PR 70044: We have to be careful about being called multiple times for the
18089 same function. This means all changes should be repeatable. */
18091 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18092 Disable the frame pointer flag so the mid-end will not use a frame
18093 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18094 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18095 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18096 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
18097 if (opts->x_flag_omit_frame_pointer == 0)
18098 opts->x_flag_omit_frame_pointer = 2;
18100 /* If not optimizing for size, set the default
18101 alignment to what the target wants. */
18102 if (!opts->x_optimize_size)
18104 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
18105 opts->x_str_align_loops = aarch64_tune_params.loop_align;
18106 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
18107 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
18108 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
18109 opts->x_str_align_functions = aarch64_tune_params.function_align;
18112 /* We default to no pc-relative literal loads. */
18114 aarch64_pcrelative_literal_loads = false;
18116 /* If -mpc-relative-literal-loads is set on the command line, this
18117 implies that the user asked for PC relative literal loads. */
18118 if (opts->x_pcrelative_literal_loads == 1)
18119 aarch64_pcrelative_literal_loads = true;
18121 /* In the tiny memory model it makes no sense to disallow PC relative
18122 literal pool loads. */
18123 if (aarch64_cmodel == AARCH64_CMODEL_TINY
18124 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
18125 aarch64_pcrelative_literal_loads = true;
18127 /* When enabling the lower precision Newton series for the square root, also
18128 enable it for the reciprocal square root, since the latter is an
18129 intermediary step for the former. */
18130 if (flag_mlow_precision_sqrt)
18131 flag_mrecip_low_precision_sqrt = true;
18134 /* 'Unpack' the internal tuning structs and update the options
18135 in OPTS. The caller must have set up selected_tune and selected_arch
18136 as all the other target-specific codegen decisions are
18137 derived from them. */
18139 void
18140 aarch64_override_options_internal (struct gcc_options *opts)
18142 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
18143 aarch64_tune_flags = tune->flags;
18144 aarch64_tune = tune->sched_core;
18145 /* Make a copy of the tuning parameters attached to the core, which
18146 we may later overwrite. */
18147 aarch64_tune_params = *(tune->tune);
18148 if (tune->tune == &generic_tunings)
18149 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
18151 if (opts->x_aarch64_override_tune_string)
18152 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
18153 &aarch64_tune_params);
18155 if (opts->x_aarch64_ldp_policy_param)
18156 aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;
18158 if (opts->x_aarch64_stp_policy_param)
18159 aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;
18161 /* This target defaults to strict volatile bitfields. */
18162 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
18163 opts->x_flag_strict_volatile_bitfields = 1;
18165 if (aarch64_stack_protector_guard == SSP_GLOBAL
18166 && opts->x_aarch64_stack_protector_guard_offset_str)
18168 error ("incompatible options %<-mstack-protector-guard=global%> and "
18169 "%<-mstack-protector-guard-offset=%s%>",
18170 aarch64_stack_protector_guard_offset_str);
18173 if (aarch64_stack_protector_guard == SSP_SYSREG
18174 && !(opts->x_aarch64_stack_protector_guard_offset_str
18175 && opts->x_aarch64_stack_protector_guard_reg_str))
18177 error ("both %<-mstack-protector-guard-offset%> and "
18178 "%<-mstack-protector-guard-reg%> must be used "
18179 "with %<-mstack-protector-guard=sysreg%>");
18182 if (opts->x_aarch64_stack_protector_guard_reg_str)
18184 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
18185 error ("specify a system register with a small string length");
18188 if (opts->x_aarch64_stack_protector_guard_offset_str)
18190 char *end;
18191 const char *str = aarch64_stack_protector_guard_offset_str;
18192 errno = 0;
18193 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
18194 if (!*str || *end || errno)
18195 error ("%qs is not a valid offset in %qs", str,
18196 "-mstack-protector-guard-offset=");
18197 aarch64_stack_protector_guard_offset = offs;
18200 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
18201 && !fixed_regs[R18_REGNUM])
18202 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18204 if ((opts->x_aarch64_isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
18205 && !(opts->x_aarch64_isa_flags & AARCH64_FL_SME))
18207 if (opts->x_aarch64_isa_flags & AARCH64_FL_SM_ON)
18208 error ("streaming functions require the ISA extension %qs", "sme");
18209 else
18210 error ("functions with SME state require the ISA extension %qs",
18211 "sme");
18212 inform (input_location, "you can enable %qs using the command-line"
18213 " option %<-march%>, or by using the %<target%>"
18214 " attribute or pragma", "sme");
18215 opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
18216 auto new_flags = (opts->x_aarch64_asm_isa_flags
18217 | feature_deps::SME ().enable);
18218 aarch64_set_asm_isa_flags (opts, new_flags);
18221 initialize_aarch64_code_model (opts);
18222 initialize_aarch64_tls_size (opts);
18223 aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;
18225 int queue_depth = 0;
18226 switch (aarch64_tune_params.autoprefetcher_model)
18228 case tune_params::AUTOPREFETCHER_OFF:
18229 queue_depth = -1;
18230 break;
18231 case tune_params::AUTOPREFETCHER_WEAK:
18232 queue_depth = 0;
18233 break;
18234 case tune_params::AUTOPREFETCHER_STRONG:
18235 queue_depth = max_insn_queue_index + 1;
18236 break;
18237 default:
18238 gcc_unreachable ();
18241 /* We don't mind passing in global_options_set here as we don't use
18242 the *options_set structs anyway. */
18243 SET_OPTION_IF_UNSET (opts, &global_options_set,
18244 param_sched_autopref_queue_depth, queue_depth);
18246 /* Set up parameters to be used in prefetching algorithm. Do not
18247 override the defaults unless we are tuning for a core we have
18248 researched values for. */
18249 if (aarch64_tune_params.prefetch->num_slots > 0)
18250 SET_OPTION_IF_UNSET (opts, &global_options_set,
18251 param_simultaneous_prefetches,
18252 aarch64_tune_params.prefetch->num_slots);
18253 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
18254 SET_OPTION_IF_UNSET (opts, &global_options_set,
18255 param_l1_cache_size,
18256 aarch64_tune_params.prefetch->l1_cache_size);
18257 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18258 SET_OPTION_IF_UNSET (opts, &global_options_set,
18259 param_l1_cache_line_size,
18260 aarch64_tune_params.prefetch->l1_cache_line_size);
18262 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18264 SET_OPTION_IF_UNSET (opts, &global_options_set,
18265 param_destruct_interfere_size,
18266 aarch64_tune_params.prefetch->l1_cache_line_size);
18267 SET_OPTION_IF_UNSET (opts, &global_options_set,
18268 param_construct_interfere_size,
18269 aarch64_tune_params.prefetch->l1_cache_line_size);
18271 else
18273 /* For a generic AArch64 target, cover the current range of cache line
18274 sizes. */
18275 SET_OPTION_IF_UNSET (opts, &global_options_set,
18276 param_destruct_interfere_size,
18277 256);
18278 SET_OPTION_IF_UNSET (opts, &global_options_set,
18279 param_construct_interfere_size,
18280 64);
18283 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
18284 SET_OPTION_IF_UNSET (opts, &global_options_set,
18285 param_l2_cache_size,
18286 aarch64_tune_params.prefetch->l2_cache_size);
18287 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
18288 SET_OPTION_IF_UNSET (opts, &global_options_set,
18289 param_prefetch_dynamic_strides, 0);
18290 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
18291 SET_OPTION_IF_UNSET (opts, &global_options_set,
18292 param_prefetch_minimum_stride,
18293 aarch64_tune_params.prefetch->minimum_stride);
18295 /* Use the alternative scheduling-pressure algorithm by default. */
18296 SET_OPTION_IF_UNSET (opts, &global_options_set,
18297 param_sched_pressure_algorithm,
18298 SCHED_PRESSURE_MODEL);
18300 /* Validate the guard size. */
18301 int guard_size = param_stack_clash_protection_guard_size;
18303 if (guard_size != 12 && guard_size != 16)
18304 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18305 "size. Given value %d (%llu KB) is out of range",
18306 guard_size, (1ULL << guard_size) / 1024ULL);
18308 /* Enforce that interval is the same size as size so the mid-end does the
18309 right thing. */
18310 SET_OPTION_IF_UNSET (opts, &global_options_set,
18311 param_stack_clash_protection_probe_interval,
18312 guard_size);
18314 /* The maybe_set calls won't update the value if the user has explicitly set
18315 one. Which means we need to validate that probing interval and guard size
18316 are equal. */
18317 int probe_interval
18318 = param_stack_clash_protection_probe_interval;
18319 if (guard_size != probe_interval)
18320 error ("stack clash guard size %<%d%> must be equal to probing interval "
18321 "%<%d%>", guard_size, probe_interval);
18323 /* Enable sw prefetching at specified optimization level for
18324 CPUS that have prefetch. Lower optimization level threshold by 1
18325 when profiling is enabled. */
18326 if (opts->x_flag_prefetch_loop_arrays < 0
18327 && !opts->x_optimize_size
18328 && aarch64_tune_params.prefetch->default_opt_level >= 0
18329 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18330 opts->x_flag_prefetch_loop_arrays = 1;
18332 /* Avoid loop-dependent FMA chains. */
18333 if (aarch64_tune_params.extra_tuning_flags
18334 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
18335 SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
18336 512);
18338 /* Consider fully pipelined FMA in reassociation. */
18339 if (aarch64_tune_params.extra_tuning_flags
18340 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
18341 SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
18344 aarch64_override_options_after_change_1 (opts);
18347 /* Print a hint with a suggestion for a core or architecture name that
18348 most closely resembles what the user passed in STR. ARCH is true if
18349 the user is asking for an architecture name. ARCH is false if the user
18350 is asking for a core name. */
18352 static void
18353 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
18355 auto_vec<const char *> candidates;
18356 const struct processor *entry = arch ? all_architectures : all_cores;
18357 for (; entry->name != NULL; entry++)
18358 candidates.safe_push (entry->name);
18360 #ifdef HAVE_LOCAL_CPU_DETECT
18361 /* Add also "native" as possible value. */
18362 if (arch)
18363 candidates.safe_push ("native");
18364 #endif
18366 char *s;
18367 const char *hint = candidates_list_and_hint (str, s, candidates);
18368 if (hint)
18369 inform (input_location, "valid arguments are: %s;"
18370 " did you mean %qs?", s, hint);
18371 else
18372 inform (input_location, "valid arguments are: %s", s);
18374 XDELETEVEC (s);
18377 /* Print a hint with a suggestion for a core name that most closely resembles
18378 what the user passed in STR. */
18380 inline static void
18381 aarch64_print_hint_for_core (const char *str)
18383 aarch64_print_hint_for_core_or_arch (str, false);
18386 /* Print a hint with a suggestion for an architecture name that most closely
18387 resembles what the user passed in STR. */
18389 inline static void
18390 aarch64_print_hint_for_arch (const char *str)
18392 aarch64_print_hint_for_core_or_arch (str, true);
18396 /* Print a hint with a suggestion for an extension name
18397 that most closely resembles what the user passed in STR. */
18399 void
18400 aarch64_print_hint_for_extensions (const std::string &str)
18402 auto_vec<const char *> candidates;
18403 aarch64_get_all_extension_candidates (&candidates);
18404 char *s;
18405 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18406 if (hint)
18407 inform (input_location, "valid arguments are: %s;"
18408 " did you mean %qs?", s, hint);
18409 else
18410 inform (input_location, "valid arguments are: %s", s);
18412 XDELETEVEC (s);
18415 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18416 specified in STR and throw errors if appropriate. Put the results if
18417 they are valid in RES and ISA_FLAGS. Return whether the option is
18418 valid. */
18420 static bool
18421 aarch64_validate_mcpu (const char *str, const struct processor **res,
18422 aarch64_feature_flags *isa_flags)
18424 std::string invalid_extension;
18425 enum aarch_parse_opt_result parse_res
18426 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18428 if (parse_res == AARCH_PARSE_OK)
18429 return true;
18431 switch (parse_res)
18433 case AARCH_PARSE_MISSING_ARG:
18434 error ("missing cpu name in %<-mcpu=%s%>", str);
18435 break;
18436 case AARCH_PARSE_INVALID_ARG:
18437 error ("unknown value %qs for %<-mcpu%>", str);
18438 aarch64_print_hint_for_core (str);
18439 /* A common user error is confusing -march and -mcpu.
18440 If the -mcpu string matches a known architecture then suggest
18441 -march=. */
18442 parse_res = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18443 if (parse_res == AARCH_PARSE_OK)
18444 inform (input_location, "did you mean %<-march=%s%>?", str);
18445 break;
18446 case AARCH_PARSE_INVALID_FEATURE:
18447 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18448 invalid_extension.c_str (), str);
18449 aarch64_print_hint_for_extensions (invalid_extension);
18450 break;
18451 default:
18452 gcc_unreachable ();
18455 return false;
18458 /* Straight line speculation indicators. */
18459 enum aarch64_sls_hardening_type
18461 SLS_NONE = 0,
18462 SLS_RETBR = 1,
18463 SLS_BLR = 2,
18464 SLS_ALL = 3,
18466 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18468 /* Return whether we should mitigate Straight Line Speculation for the RET
18469 and BR instructions. */
18470 bool
18471 aarch64_harden_sls_retbr_p (void)
18473 return aarch64_sls_hardening & SLS_RETBR;
18476 /* Return whether we should mitigate Straight Line Speculation for the BLR
18477 instruction. */
18478 bool
18479 aarch64_harden_sls_blr_p (void)
18481 return aarch64_sls_hardening & SLS_BLR;
18484 /* For now we only allow setting these options globally; in the future we may
18485 allow setting them per function. */
18486 static void
18487 aarch64_validate_sls_mitigation (const char *const_str)
18489 char *token_save = NULL;
18490 char *str = NULL;
18492 if (strcmp (const_str, "none") == 0)
18494 aarch64_sls_hardening = SLS_NONE;
18495 return;
18497 if (strcmp (const_str, "all") == 0)
18499 aarch64_sls_hardening = SLS_ALL;
18500 return;
18503 char *str_root = xstrdup (const_str);
18504 str = strtok_r (str_root, ",", &token_save);
18505 if (!str)
18506 error ("invalid argument given to %<-mharden-sls=%>");
18508 int temp = SLS_NONE;
18509 while (str)
18511 if (strcmp (str, "blr") == 0)
18512 temp |= SLS_BLR;
18513 else if (strcmp (str, "retbr") == 0)
18514 temp |= SLS_RETBR;
18515 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18517 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18518 break;
18520 else
18522 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18523 break;
18525 str = strtok_r (NULL, ",", &token_save);
18527 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18528 free (str_root);
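/* Illustrative sketch, not part of the sources: mapping a comma-separated
   -mharden-sls= argument to the SLS_* bitmask defined above, with the
   diagnostics stripped out.  */
#if 0
static int
sketch_parse_sls (const char *arg)
{
  if (strcmp (arg, "none") == 0)
    return SLS_NONE;
  if (strcmp (arg, "all") == 0)
    return SLS_ALL;
  int mask = SLS_NONE;
  char *copy = xstrdup (arg);
  char *save = NULL;
  for (char *tok = strtok_r (copy, ",", &save); tok;
       tok = strtok_r (NULL, ",", &save))
    {
      if (strcmp (tok, "retbr") == 0)
	mask |= SLS_RETBR;
      else if (strcmp (tok, "blr") == 0)
	mask |= SLS_BLR;
      /* The real code rejects anything else, and rejects "none"/"all"
	 mixed with other values.  */
    }
  free (copy);
  return mask;
}
/* sketch_parse_sls ("retbr,blr") == SLS_ALL.  */
#endif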
18531 /* Validate a command-line -march option. Parse the arch and extensions
18532 (if any) specified in STR and throw errors if appropriate. Put the
18533 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18534 option is valid. */
18536 static bool
18537 aarch64_validate_march (const char *str, const struct processor **res,
18538 aarch64_feature_flags *isa_flags)
18540 std::string invalid_extension;
18541 enum aarch_parse_opt_result parse_res
18542 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18544 if (parse_res == AARCH_PARSE_OK)
18545 return true;
18547 switch (parse_res)
18549 case AARCH_PARSE_MISSING_ARG:
18550 error ("missing arch name in %<-march=%s%>", str);
18551 break;
18552 case AARCH_PARSE_INVALID_ARG:
18553 error ("unknown value %qs for %<-march%>", str);
18554 aarch64_print_hint_for_arch (str);
18555 /* A common user error is confusing -march and -mcpu.
18556 If the -march string matches a known CPU suggest -mcpu. */
18557 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18558 if (parse_res == AARCH_PARSE_OK)
18559 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18560 break;
18561 case AARCH_PARSE_INVALID_FEATURE:
18562 error ("invalid feature modifier %qs in %<-march=%s%>",
18563 invalid_extension.c_str (), str);
18564 aarch64_print_hint_for_extensions (invalid_extension);
18565 break;
18566 default:
18567 gcc_unreachable ();
18570 return false;
18573 /* Validate a command-line -mtune option. Parse the cpu
18574 specified in STR and throw errors if appropriate. Put the
18575 result, if it is valid, in RES. Return whether the option is
18576 valid. */
18578 static bool
18579 aarch64_validate_mtune (const char *str, const struct processor **res)
18581 enum aarch_parse_opt_result parse_res
18582 = aarch64_parse_tune (str, res);
18584 if (parse_res == AARCH_PARSE_OK)
18585 return true;
18587 switch (parse_res)
18589 case AARCH_PARSE_MISSING_ARG:
18590 error ("missing cpu name in %<-mtune=%s%>", str);
18591 break;
18592 case AARCH_PARSE_INVALID_ARG:
18593 error ("unknown value %qs for %<-mtune%>", str);
18594 aarch64_print_hint_for_core (str);
18595 break;
18596 default:
18597 gcc_unreachable ();
18599 return false;
18602 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18604 static poly_uint16
18605 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18607 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18608 on big-endian targets, so we would need to forbid subregs that convert
18609 from one to the other. By default a reinterpret sequence would then
18610 involve a store to memory in one mode and a load back in the other.
18611 Even if we optimize that sequence using reverse instructions,
18612 it would still be a significant potential overhead.
18614 For now, it seems better to generate length-agnostic code for that
18615 case instead. */
18616 if (value == SVE_SCALABLE
18617 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18618 return poly_uint16 (2, 2);
18619 else
18620 return (int) value / 64;
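/* Illustrative worked examples for the conversion above (not from the
   sources): VG is the vector length in 64-bit granules, so a fixed
   -msve-vector-bits=N gives VG = N / 64, while the scalable cases return
   poly_uint16 (2, 2), i.e. 2 + 2x granules for some runtime value x >= 0:

     -msve-vector-bits=128       -> VG 2  (scalable instead on big-endian)
     -msve-vector-bits=256       -> VG 4
     -msve-vector-bits=512       -> VG 8
     -msve-vector-bits=2048      -> VG 32
     -msve-vector-bits=scalable  -> poly_uint16 (2, 2), at least 128 bits.  */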
18623 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18624 aarch64_isa_flags accordingly. */
18626 void
18627 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18629 aarch64_set_asm_isa_flags (&global_options, flags);
18632 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18633 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18634 tuning structs. In particular it must set selected_tune and
18635 aarch64_asm_isa_flags that define the available ISA features and tuning
18636 decisions. It must also set selected_arch as this will be used to
18637 output the .arch asm tags for each function. */
18639 static void
18640 aarch64_override_options (void)
18642 aarch64_feature_flags cpu_isa = 0;
18643 aarch64_feature_flags arch_isa = 0;
18644 aarch64_set_asm_isa_flags (0);
18646 const struct processor *cpu = NULL;
18647 const struct processor *arch = NULL;
18648 const struct processor *tune = NULL;
18650 if (aarch64_harden_sls_string)
18651 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18653 if (aarch64_branch_protection_string)
18654 aarch_validate_mbranch_protection (aarch64_branch_protection_string,
18655 "-mbranch-protection=");
18657 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18658 If either of -march or -mtune is given, they override their
18659 respective component of -mcpu. */
18660 if (aarch64_cpu_string)
18661 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18663 if (aarch64_arch_string)
18664 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18666 if (aarch64_tune_string)
18667 aarch64_validate_mtune (aarch64_tune_string, &tune);
18669 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18670 SUBTARGET_OVERRIDE_OPTIONS;
18671 #endif
18673 auto isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
18674 if (cpu && arch)
18676 /* If both -mcpu and -march are specified, warn if they are not
18677 feature compatible. Feature compatible means that the cpu features are
18678 a superset of the arch features, so that selecting the arch does not
18679 end up enabling an architecture feature that the cpu lacks. Either
18680 way the -march ISA flags are preferred. */
18681 auto full_arch_flags = arch->flags | arch_isa;
18682 auto full_cpu_flags = cpu->flags | cpu_isa;
18683 if (~full_cpu_flags & full_arch_flags)
18685 std::string ext_diff
18686 = aarch64_get_extension_string_for_isa_flags (full_arch_flags,
18687 full_cpu_flags);
18688 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18689 "and resulted in options %<%s%> being added",
18690 aarch64_cpu_string,
18691 aarch64_arch_string,
18692 ext_diff.c_str ());
18695 selected_arch = arch->arch;
18696 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18698 else if (cpu)
18700 selected_arch = cpu->arch;
18701 aarch64_set_asm_isa_flags (cpu_isa | isa_mode);
18703 else if (arch)
18705 cpu = &all_cores[arch->ident];
18706 selected_arch = arch->arch;
18707 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18709 else
18711 /* No -mcpu or -march specified, so use the default CPU. */
18712 cpu = &all_cores[TARGET_CPU_DEFAULT];
18713 selected_arch = cpu->arch;
18714 aarch64_set_asm_isa_flags (cpu->flags | isa_mode);
18717 selected_tune = tune ? tune->ident : cpu->ident;
18719 if (aarch_enable_bti == 2)
18721 #ifdef TARGET_ENABLE_BTI
18722 aarch_enable_bti = 1;
18723 #else
18724 aarch_enable_bti = 0;
18725 #endif
18728 /* Return address signing is currently not supported for ILP32 targets. For
18729 LP64 targets use the configured option in the absence of a command-line
18730 option for -mbranch-protection. */
18731 if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
18733 #ifdef TARGET_ENABLE_PAC_RET
18734 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18735 #else
18736 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18737 #endif
18740 #ifndef HAVE_AS_MABI_OPTION
18741 /* The compiler may have been configured with 2.23.* binutils, which does
18742 not have support for ILP32. */
18743 if (TARGET_ILP32)
18744 error ("assembler does not support %<-mabi=ilp32%>");
18745 #endif
18747 /* Convert -msve-vector-bits to a VG count. */
18748 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18750 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
18751 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18753 /* The pass to insert speculation tracking runs before
18754 shrink-wrapping and the latter does not know how to update the
18755 tracking status. So disable it in this case. */
18756 if (aarch64_track_speculation)
18757 flag_shrink_wrap = 0;
18759 aarch64_override_options_internal (&global_options);
18761 /* Save these options as the default ones in case we push and pop them later
18762 while processing functions with potential target attributes. */
18763 target_option_default_node = target_option_current_node
18764 = build_target_option_node (&global_options, &global_options_set);
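/* Illustrative sketch, not part of the sources: the superset test used
   above when both -mcpu and -march are given.  Any bit set in the arch
   flags but clear in the cpu flags means following the cpu would drop an
   architecture feature, so a warning is issued and the -march flags win.
   The flag values in the example are made up.  */
#if 0
static bool
sketch_cpu_covers_arch (unsigned long long full_cpu_flags,
			unsigned long long full_arch_flags)
{
  return (~full_cpu_flags & full_arch_flags) == 0;
}
/* sketch_cpu_covers_arch (0b0111, 0b0011) -> true  (no warning)
   sketch_cpu_covers_arch (0b0101, 0b0011) -> false (warning: bit 1 added)  */
#endif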
18767 /* Implement targetm.override_options_after_change. */
18769 static void
18770 aarch64_override_options_after_change (void)
18772 aarch64_override_options_after_change_1 (&global_options);
18775 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18776 static char *
18777 aarch64_offload_options (void)
18779 if (TARGET_ILP32)
18780 return xstrdup ("-foffload-abi=ilp32");
18781 else
18782 return xstrdup ("-foffload-abi=lp64");
18785 static struct machine_function *
18786 aarch64_init_machine_status (void)
18788 struct machine_function *machine;
18789 machine = ggc_cleared_alloc<machine_function> ();
18790 return machine;
18793 void
18794 aarch64_init_expanders (void)
18796 init_machine_status = aarch64_init_machine_status;
18799 /* Set aarch64_cmodel from -mcmodel and -fpic/-fPIC, diagnosing unsupported combinations. */
18800 static void
18801 initialize_aarch64_code_model (struct gcc_options *opts)
18803 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18804 switch (opts->x_aarch64_cmodel_var)
18806 case AARCH64_CMODEL_TINY:
18807 if (opts->x_flag_pic)
18808 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18809 break;
18810 case AARCH64_CMODEL_SMALL:
18811 if (opts->x_flag_pic)
18813 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18814 aarch64_cmodel = (flag_pic == 2
18815 ? AARCH64_CMODEL_SMALL_PIC
18816 : AARCH64_CMODEL_SMALL_SPIC);
18817 #else
18818 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18819 #endif
18821 break;
18822 case AARCH64_CMODEL_LARGE:
18823 if (opts->x_flag_pic)
18824 sorry ("code model %qs with %<-f%s%>", "large",
18825 opts->x_flag_pic > 1 ? "PIC" : "pic");
18826 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18827 sorry ("code model %qs not supported in ilp32 mode", "large");
18828 break;
18829 case AARCH64_CMODEL_TINY_PIC:
18830 case AARCH64_CMODEL_SMALL_PIC:
18831 case AARCH64_CMODEL_SMALL_SPIC:
18832 gcc_unreachable ();
18836 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18837 using the information saved in PTR. */
18839 static void
18840 aarch64_option_restore (struct gcc_options *opts,
18841 struct gcc_options * /* opts_set */,
18842 struct cl_target_option * /* ptr */)
18844 aarch64_override_options_internal (opts);
18847 /* Implement TARGET_OPTION_PRINT. */
18849 static void
18850 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18852 const struct processor *cpu
18853 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18854 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18855 std::string extension
18856 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
18857 arch->flags);
18859 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18860 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18861 arch->name, extension.c_str ());
18864 static GTY(()) tree aarch64_previous_fndecl;
18866 void
18867 aarch64_reset_previous_fndecl (void)
18869 aarch64_previous_fndecl = NULL;
18872 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18873 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18874 make sure optab availability predicates are recomputed when necessary. */
18876 void
18877 aarch64_save_restore_target_globals (tree new_tree)
18879 if (TREE_TARGET_GLOBALS (new_tree))
18880 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18881 else if (new_tree == target_option_default_node)
18882 restore_target_globals (&default_target_globals);
18883 else
18884 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18887 /* Return the target_option_node for FNDECL, or the current options
18888 if FNDECL is null. */
18890 static tree
18891 aarch64_fndecl_options (tree fndecl)
18893 if (!fndecl)
18894 return target_option_current_node;
18896 if (tree options = DECL_FUNCTION_SPECIFIC_TARGET (fndecl))
18897 return options;
18899 return target_option_default_node;
18902 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18903 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18904 of the function, if such exists. This function may be called multiple
18905 times on a single function so use aarch64_previous_fndecl to avoid
18906 setting up identical state. */
18908 static void
18909 aarch64_set_current_function (tree fndecl)
18911 tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
18912 tree new_tree = aarch64_fndecl_options (fndecl);
18914 auto new_isa_mode = (fndecl
18915 ? aarch64_fndecl_isa_mode (fndecl)
18916 : AARCH64_FL_DEFAULT_ISA_MODE);
18917 auto isa_flags = TREE_TARGET_OPTION (new_tree)->x_aarch64_isa_flags;
18919 static bool reported_zt0_p;
18920 if (!reported_zt0_p
18921 && !(isa_flags & AARCH64_FL_SME2)
18922 && fndecl
18923 && aarch64_fndecl_has_state (fndecl, "zt0"))
18925 error ("functions with %qs state require the ISA extension %qs",
18926 "zt0", "sme2");
18927 inform (input_location, "you can enable %qs using the command-line"
18928 " option %<-march%>, or by using the %<target%>"
18929 " attribute or pragma", "sme2");
18930 reported_zt0_p = true;
18933 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18934 the default have been handled by aarch64_save_restore_target_globals from
18935 aarch64_pragma_target_parse. */
18936 if (old_tree == new_tree
18937 && (!fndecl || aarch64_previous_fndecl)
18938 && (isa_flags & AARCH64_FL_ISA_MODES) == new_isa_mode)
18940 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
18941 return;
18944 aarch64_previous_fndecl = fndecl;
18946 /* First set the target options. */
18947 cl_target_option_restore (&global_options, &global_options_set,
18948 TREE_TARGET_OPTION (new_tree));
18950 /* The ISA mode can vary based on function type attributes and
18951 function declaration attributes. Make sure that the target
18952 options correctly reflect these attributes. */
18953 if ((isa_flags & AARCH64_FL_ISA_MODES) != new_isa_mode)
18955 auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
18956 aarch64_set_asm_isa_flags (base_flags | new_isa_mode);
18958 aarch64_override_options_internal (&global_options);
18959 new_tree = build_target_option_node (&global_options,
18960 &global_options_set);
18961 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;
18963 tree new_optimize = build_optimization_node (&global_options,
18964 &global_options_set);
18965 if (new_optimize != optimization_default_node)
18966 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
18969 aarch64_save_restore_target_globals (new_tree);
18971 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
18974 /* Enum describing the various ways we can handle attributes.
18975 In many cases we can reuse the generic option handling machinery. */
18977 enum aarch64_attr_opt_type
18979 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18980 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18981 aarch64_attr_enum, /* Attribute sets an enum variable. */
18982 aarch64_attr_custom /* Attribute requires a custom handling function. */
18985 /* All the information needed to handle a target attribute.
18986 NAME is the name of the attribute.
18987 ATTR_TYPE specifies the type of behavior of the attribute as described
18988 in the definition of enum aarch64_attr_opt_type.
18989 ALLOW_NEG is true if the attribute supports a "no-" form.
18990 HANDLER is the function that takes the attribute string as an argument.
18991 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18992 OPT_NUM is the enum specifying the option that the attribute modifies.
18993 This is needed for attributes that mirror the behavior of a command-line
18994 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18995 aarch64_attr_enum. */
18997 struct aarch64_attribute_info
18999 const char *name;
19000 enum aarch64_attr_opt_type attr_type;
19001 bool allow_neg;
19002 bool (*handler) (const char *);
19003 enum opt_code opt_num;
19006 /* Handle the ARCH_STR argument to the arch= target attribute. */
19008 static bool
19009 aarch64_handle_attr_arch (const char *str)
19011 const struct processor *tmp_arch = NULL;
19012 std::string invalid_extension;
19013 aarch64_feature_flags tmp_flags;
19014 enum aarch_parse_opt_result parse_res
19015 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
19017 if (parse_res == AARCH_PARSE_OK)
19019 gcc_assert (tmp_arch);
19020 selected_arch = tmp_arch->arch;
19021 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19022 return true;
19025 switch (parse_res)
19027 case AARCH_PARSE_MISSING_ARG:
19028 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19029 break;
19030 case AARCH_PARSE_INVALID_ARG:
19031 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
19032 aarch64_print_hint_for_arch (str);
19033 break;
19034 case AARCH_PARSE_INVALID_FEATURE:
19035 error ("invalid feature modifier %s of value %qs in "
19036 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19037 aarch64_print_hint_for_extensions (invalid_extension);
19038 break;
19039 default:
19040 gcc_unreachable ();
19043 return false;
19046 /* Handle the argument CPU_STR to the cpu= target attribute. */
19048 static bool
19049 aarch64_handle_attr_cpu (const char *str)
19051 const struct processor *tmp_cpu = NULL;
19052 std::string invalid_extension;
19053 aarch64_feature_flags tmp_flags;
19054 enum aarch_parse_opt_result parse_res
19055 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
19057 if (parse_res == AARCH_PARSE_OK)
19059 gcc_assert (tmp_cpu);
19060 selected_tune = tmp_cpu->ident;
19061 selected_arch = tmp_cpu->arch;
19062 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19063 return true;
19066 switch (parse_res)
19068 case AARCH_PARSE_MISSING_ARG:
19069 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19070 break;
19071 case AARCH_PARSE_INVALID_ARG:
19072 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
19073 aarch64_print_hint_for_core (str);
19074 break;
19075 case AARCH_PARSE_INVALID_FEATURE:
19076 error ("invalid feature modifier %qs of value %qs in "
19077 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19078 aarch64_print_hint_for_extensions (invalid_extension);
19079 break;
19080 default:
19081 gcc_unreachable ();
19084 return false;
19087 /* Handle the argument STR to the branch-protection= attribute. */
19089 static bool
19090 aarch64_handle_attr_branch_protection (const char* str)
19092 return aarch_validate_mbranch_protection (str,
19093 "target(\"branch-protection=\")");
19096 /* Handle the argument STR to the tune= target attribute. */
19098 static bool
19099 aarch64_handle_attr_tune (const char *str)
19101 const struct processor *tmp_tune = NULL;
19102 enum aarch_parse_opt_result parse_res
19103 = aarch64_parse_tune (str, &tmp_tune);
19105 if (parse_res == AARCH_PARSE_OK)
19107 gcc_assert (tmp_tune);
19108 selected_tune = tmp_tune->ident;
19109 return true;
19112 switch (parse_res)
19114 case AARCH_PARSE_INVALID_ARG:
19115 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
19116 aarch64_print_hint_for_core (str);
19117 break;
19118 default:
19119 gcc_unreachable ();
19122 return false;
19125 /* Parse an architecture extensions target attribute string specified in STR.
19126 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19127 if successful. Update aarch64_isa_flags to reflect the ISA features
19128 modified. */
19130 static bool
19131 aarch64_handle_attr_isa_flags (char *str)
19133 enum aarch_parse_opt_result parse_res;
19134 auto isa_flags = aarch64_asm_isa_flags;
19136 /* We allow "+nothing" in the beginning to clear out all architectural
19137 features if the user wants to handpick specific features. */
19138 if (strncmp ("+nothing", str, 8) == 0)
19140 isa_flags = AARCH64_ISA_MODE;
19141 str += 8;
19144 std::string invalid_extension;
19145 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
19147 if (parse_res == AARCH_PARSE_OK)
19149 aarch64_set_asm_isa_flags (isa_flags);
19150 return true;
19153 switch (parse_res)
19155 case AARCH_PARSE_MISSING_ARG:
19156 error ("missing value in %<target()%> pragma or attribute");
19157 break;
19159 case AARCH_PARSE_INVALID_FEATURE:
19160 error ("invalid feature modifier %qs of value %qs in "
19161 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19162 break;
19164 default:
19165 gcc_unreachable ();
19168 return false;
19171 /* The target attributes that we support. On top of these we also support just
19172 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19173 handled explicitly in aarch64_process_one_target_attr. */
19175 static const struct aarch64_attribute_info aarch64_attributes[] =
19177 { "general-regs-only", aarch64_attr_mask, false, NULL,
19178 OPT_mgeneral_regs_only },
19179 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
19180 OPT_mfix_cortex_a53_835769 },
19181 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
19182 OPT_mfix_cortex_a53_843419 },
19183 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
19184 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
19185 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
19186 OPT_momit_leaf_frame_pointer },
19187 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
19188 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
19189 OPT_march_ },
19190 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
19191 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
19192 OPT_mtune_ },
19193 { "branch-protection", aarch64_attr_custom, false,
19194 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
19195 { "sign-return-address", aarch64_attr_enum, false, NULL,
19196 OPT_msign_return_address_ },
19197 { "outline-atomics", aarch64_attr_bool, true, NULL,
19198 OPT_moutline_atomics},
19199 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
19202 /* Parse ARG_STR which contains the definition of one target attribute.
19203 Show appropriate errors if any or return true if the attribute is valid. */
19205 static bool
19206 aarch64_process_one_target_attr (char *arg_str)
19208 bool invert = false;
19210 size_t len = strlen (arg_str);
19212 if (len == 0)
19214 error ("malformed %<target()%> pragma or attribute");
19215 return false;
19218 char *str_to_check = (char *) alloca (len + 1);
19219 strcpy (str_to_check, arg_str);
19221 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19222 It is easier to detect and handle it explicitly here rather than going
19223 through the machinery for the rest of the target attributes in this
19224 function. */
19225 if (*str_to_check == '+')
19226 return aarch64_handle_attr_isa_flags (str_to_check);
19228 if (len > 3 && startswith (str_to_check, "no-"))
19230 invert = true;
19231 str_to_check += 3;
19233 char *arg = strchr (str_to_check, '=');
19235 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19236 and point ARG to "foo". */
19237 if (arg)
19239 *arg = '\0';
19240 arg++;
19242 const struct aarch64_attribute_info *p_attr;
19243 bool found = false;
19244 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
19246 /* If the names don't match up, or the user has given an argument
19247 to an attribute that doesn't accept one, or didn't give an argument
19248 to an attribute that expects one, fail to match. */
19249 if (strcmp (str_to_check, p_attr->name) != 0)
19250 continue;
19252 found = true;
19253 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
19254 || p_attr->attr_type == aarch64_attr_enum;
19256 if (attr_need_arg_p ^ (arg != NULL))
19258 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
19259 return false;
19262 /* If the name matches but the attribute does not allow "no-" versions
19263 then we can't match. */
19264 if (invert && !p_attr->allow_neg)
19266 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
19267 return false;
19270 switch (p_attr->attr_type)
19272 /* Has a custom handler registered.
19273 For example, cpu=, arch=, tune=. */
19274 case aarch64_attr_custom:
19275 gcc_assert (p_attr->handler);
19276 if (!p_attr->handler (arg))
19277 return false;
19278 break;
19280 /* Either set or unset a boolean option. */
19281 case aarch64_attr_bool:
19283 struct cl_decoded_option decoded;
19285 generate_option (p_attr->opt_num, NULL, !invert,
19286 CL_TARGET, &decoded);
19287 aarch64_handle_option (&global_options, &global_options_set,
19288 &decoded, input_location);
19289 break;
19291 /* Set or unset a bit in the target_flags. aarch64_handle_option
19292 should know what mask to apply given the option number. */
19293 case aarch64_attr_mask:
19295 struct cl_decoded_option decoded;
19296 /* We only need to specify the option number.
19297 aarch64_handle_option will know which mask to apply. */
19298 decoded.opt_index = p_attr->opt_num;
19299 decoded.value = !invert;
19300 aarch64_handle_option (&global_options, &global_options_set,
19301 &decoded, input_location);
19302 break;
19304 /* Use the option setting machinery to set an option to an enum. */
19305 case aarch64_attr_enum:
19307 gcc_assert (arg);
19308 bool valid;
19309 int value;
19310 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19311 &value, CL_TARGET);
19312 if (valid)
19314 set_option (&global_options, NULL, p_attr->opt_num, value,
19315 NULL, DK_UNSPECIFIED, input_location,
19316 global_dc);
19318 else
19320 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19322 break;
19324 default:
19325 gcc_unreachable ();
19329 /* If we reached here we either have found an attribute and validated
19330 it or didn't match any. If we matched an attribute but its arguments
19331 were malformed we will have returned false already. */
19332 return found;
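/* Illustrative sketch, not part of the sources: the token decomposition
   performed above before the aarch64_attributes lookup, i.e. stripping an
   optional "no-" prefix and splitting "name=value" at the '='.  The helper
   name is made up; startswith comes from system.h.  */
#if 0
static void
sketch_split_attr_token (char *token, bool *invert,
			 const char **name, const char **value)
{
  *invert = false;
  *value = NULL;
  if (startswith (token, "no-"))
    {
      *invert = true;
      token += 3;
    }
  *name = token;
  if (char *eq = strchr (token, '='))
    {
      *eq = '\0';
      *value = eq + 1;
    }
}
/* "no-omit-leaf-frame-pointer" -> invert, name "omit-leaf-frame-pointer"
   "cmodel=small"               -> name "cmodel", value "small"  */
#endif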
19335 /* Count how many times the character C appears in
19336 NULL-terminated string STR. */
19338 static unsigned int
19339 num_occurences_in_str (char c, char *str)
19341 unsigned int res = 0;
19342 while (*str != '\0')
19344 if (*str == c)
19345 res++;
19347 str++;
19350 return res;
19353 /* Parse the tree in ARGS that contains the target attribute information
19354 and update the global target options space. */
19356 bool
19357 aarch64_process_target_attr (tree args)
19359 if (TREE_CODE (args) == TREE_LIST)
19363 tree head = TREE_VALUE (args);
19364 if (head)
19366 if (!aarch64_process_target_attr (head))
19367 return false;
19369 args = TREE_CHAIN (args);
19370 } while (args);
19372 return true;
19375 if (TREE_CODE (args) != STRING_CST)
19377 error ("attribute %<target%> argument not a string");
19378 return false;
19381 size_t len = strlen (TREE_STRING_POINTER (args));
19382 char *str_to_check = (char *) alloca (len + 1);
19383 strcpy (str_to_check, TREE_STRING_POINTER (args));
19385 if (len == 0)
19387 error ("malformed %<target()%> pragma or attribute");
19388 return false;
19391 /* Used to catch empty tokens between commas, i.e.
19392 attribute ((target ("attr1,,attr2"))). */
19393 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19395 /* Handle multiple target attributes separated by ','. */
19396 char *token = strtok_r (str_to_check, ",", &str_to_check);
19398 unsigned int num_attrs = 0;
19399 while (token)
19401 num_attrs++;
19402 if (!aarch64_process_one_target_attr (token))
19404 /* Check if token is possibly an arch extension without
19405 leading '+'. */
19406 aarch64_feature_flags isa_temp = 0;
19407 auto with_plus = std::string ("+") + token;
19408 enum aarch_parse_opt_result ext_res
19409 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19411 if (ext_res == AARCH_PARSE_OK)
19412 error ("arch extension %<%s%> should be prefixed by %<+%>",
19413 token);
19414 else
19415 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19416 return false;
19419 token = strtok_r (NULL, ",", &str_to_check);
19422 if (num_attrs != num_commas + 1)
19424 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19425 return false;
19428 return true;
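/* Illustrative sketch, not part of the sources: why the comma count above
   is compared against the number of tokens.  strtok_r silently swallows
   empty tokens, so "attr1,,attr2" produces two tokens but two commas and
   is rejected.  */
#if 0
static bool
sketch_commas_match_tokens (const char *attr_string)
{
  unsigned int commas = 0;
  for (const char *p = attr_string; *p; p++)
    if (*p == ',')
      commas++;

  char *copy = xstrdup (attr_string);
  char *save = NULL;
  unsigned int tokens = 0;
  for (char *tok = strtok_r (copy, ",", &save); tok;
       tok = strtok_r (NULL, ",", &save))
    tokens++;
  free (copy);

  return tokens == commas + 1;	/* False for "attr1,,attr2".  */
}
#endif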
19431 static bool aarch64_process_target_version_attr (tree args);
19433 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19434 process attribute ((target ("..."))). */
19436 static bool
19437 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19439 struct cl_target_option cur_target;
19440 bool ret;
19441 tree old_optimize;
19442 tree new_target, new_optimize;
19443 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19445 /* If what we're processing is the current pragma string then the
19446 target option node is already stored in target_option_current_node
19447 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19448 having to re-parse the string. This is especially useful to keep
19449 arm_neon.h compile times down since that header contains a lot
19450 of intrinsics enclosed in pragmas. */
19451 if (!existing_target && args == current_target_pragma)
19453 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19454 return true;
19456 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19458 old_optimize
19459 = build_optimization_node (&global_options, &global_options_set);
19460 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19462 /* If the function changed the optimization levels as well as setting
19463 target options, start with the optimizations specified. */
19464 if (func_optimize && func_optimize != old_optimize)
19465 cl_optimization_restore (&global_options, &global_options_set,
19466 TREE_OPTIMIZATION (func_optimize));
19468 /* Save the current target options to restore at the end. */
19469 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19471 /* If fndecl already has some target attributes applied to it, unpack
19472 them so that we add this attribute on top of them, rather than
19473 overwriting them. */
19474 if (existing_target)
19476 struct cl_target_option *existing_options
19477 = TREE_TARGET_OPTION (existing_target);
19479 if (existing_options)
19480 cl_target_option_restore (&global_options, &global_options_set,
19481 existing_options);
19483 else
19484 cl_target_option_restore (&global_options, &global_options_set,
19485 TREE_TARGET_OPTION (target_option_current_node));
19487 ret = aarch64_process_target_attr (args);
19489 if (ret)
19491 tree version_attr = lookup_attribute ("target_version",
19492 DECL_ATTRIBUTES (fndecl));
19493 if (version_attr != NULL_TREE)
19495 /* Reapply any target_version attribute after target attribute.
19496 This should be equivalent to applying the target_version once
19497 after processing all target attributes. */
19498 tree version_args = TREE_VALUE (version_attr);
19499 ret = aarch64_process_target_version_attr (version_args);
19503 /* Set up any additional state. */
19504 if (ret)
19506 aarch64_override_options_internal (&global_options);
19507 new_target = build_target_option_node (&global_options,
19508 &global_options_set);
19510 else
19511 new_target = NULL;
19513 new_optimize = build_optimization_node (&global_options,
19514 &global_options_set);
19516 if (fndecl && ret)
19518 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19520 if (old_optimize != new_optimize)
19521 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19524 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19526 if (old_optimize != new_optimize)
19527 cl_optimization_restore (&global_options, &global_options_set,
19528 TREE_OPTIMIZATION (old_optimize));
19529 return ret;
19532 typedef unsigned long long aarch64_fmv_feature_mask;
19534 typedef struct
19536 const char *name;
19537 aarch64_fmv_feature_mask feature_mask;
19538 aarch64_feature_flags opt_flags;
19539 } aarch64_fmv_feature_datum;
19541 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19542 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19544 /* FMV features are listed in priority order, to make it easier to sort target
19545 strings. */
19546 static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
19547 #include "config/aarch64/aarch64-option-extensions.def"
19550 /* Parse a function multiversioning feature string STR, as found in a
19551 target_version or target_clones attribute.
19553 If ISA_FLAGS is nonnull, then update it with the specified architecture
19554 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19555 representing the set of features explicitly specified in the feature string.
19556 Return an aarch_parse_opt_result describing the result.
19558    When STR contains an invalid or duplicate extension, a copy of
19559    the extension string is created and stored in INVALID_EXTENSION. */
19561 static enum aarch_parse_opt_result
19562 aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
19563 aarch64_fmv_feature_mask *feature_mask,
19564 std::string *invalid_extension)
19566 if (feature_mask)
19567 *feature_mask = 0ULL;
19569 if (strcmp (str, "default") == 0)
19570 return AARCH_PARSE_OK;
19572 while (str != NULL && *str != 0)
19574 const char *ext;
19575 size_t len;
19577 ext = strchr (str, '+');
19579 if (ext != NULL)
19580 len = ext - str;
19581 else
19582 len = strlen (str);
19584 if (len == 0)
19585 return AARCH_PARSE_MISSING_ARG;
19587 static const int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19588 int i;
19589 for (i = 0; i < num_features; i++)
19591 if (strlen (aarch64_fmv_feature_data[i].name) == len
19592 && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
19594 if (isa_flags)
19595 *isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
19596 if (feature_mask)
19598 auto old_feature_mask = *feature_mask;
19599 *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
19600 if (*feature_mask == old_feature_mask)
19602 /* Duplicate feature. */
19603 if (invalid_extension)
19604 *invalid_extension = std::string (str, len);
19605 return AARCH_PARSE_DUPLICATE_FEATURE;
19608 break;
19612 if (i == num_features)
19614 /* Feature not found in list. */
19615 if (invalid_extension)
19616 *invalid_extension = std::string (str, len);
19617 return AARCH_PARSE_INVALID_FEATURE;
19620 str = ext;
19621 if (str)
19622 /* Skip over the next '+'. */
19623 str++;
19626 return AARCH_PARSE_OK;
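/* Illustrative use of the parser above (the feature names are examples
   only):

     aarch64_feature_flags flags = 0;
     aarch64_fmv_feature_mask mask = 0;
     std::string bad;
     if (aarch64_parse_fmv_features ("sve2+dotprod", &flags, &mask, &bad)
	 == AARCH_PARSE_OK)
       gcc_checking_assert (popcount_hwi (mask) == 2);

   A string such as "sve2+sve2" would instead yield
   AARCH_PARSE_DUPLICATE_FEATURE with "sve2" copied into BAD, and an
   unknown name yields AARCH_PARSE_INVALID_FEATURE.  */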
19629 /* Parse the tree in ARGS that contains the target_version attribute
19630 information and update the global target options space. */
19632 static bool
19633 aarch64_process_target_version_attr (tree args)
19635 if (TREE_CODE (args) == TREE_LIST)
19637 if (TREE_CHAIN (args))
19639 error ("attribute %<target_version%> has multiple values");
19640 return false;
19642 args = TREE_VALUE (args);
19645 if (!args || TREE_CODE (args) != STRING_CST)
19647 error ("attribute %<target_version%> argument not a string");
19648 return false;
19651 const char *str = TREE_STRING_POINTER (args);
19653 enum aarch_parse_opt_result parse_res;
19654 auto isa_flags = aarch64_asm_isa_flags;
19656 std::string invalid_extension;
19657 parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
19658 &invalid_extension);
19660 if (parse_res == AARCH_PARSE_OK)
19662 aarch64_set_asm_isa_flags (isa_flags);
19663 return true;
19666 switch (parse_res)
19668 case AARCH_PARSE_MISSING_ARG:
19669 error ("missing value in %<target_version%> attribute");
19670 break;
19672 case AARCH_PARSE_INVALID_FEATURE:
19673 error ("invalid feature modifier %qs of value %qs in "
19674 "%<target_version%> attribute", invalid_extension.c_str (),
19675 str);
19676 break;
19678 case AARCH_PARSE_DUPLICATE_FEATURE:
19679 error ("duplicate feature modifier %qs of value %qs in "
19680 "%<target_version%> attribute", invalid_extension.c_str (),
19681 str);
19682 break;
19684 default:
19685 gcc_unreachable ();
19688 return false;
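/* For example, a declaration such as

     __attribute__ ((target_version ("sve2"))) int foo (void);

   reaches the function above with ARGS holding the STRING_CST "sve2";
   on success the parsed features are merged into the global ISA flags
   via aarch64_set_asm_isa_flags.  ("sve2" is only an illustrative
   feature name.)  */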
19691 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
19692 process attribute ((target_version ("..."))). */
19694 static bool
19695 aarch64_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
19697 struct cl_target_option cur_target;
19698 bool ret;
19699 tree new_target;
19700 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19702 /* Save the current target options to restore at the end. */
19703 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19705 /* If fndecl already has some target attributes applied to it, unpack
19706 them so that we add this attribute on top of them, rather than
19707 overwriting them. */
19708 if (existing_target)
19710 struct cl_target_option *existing_options
19711 = TREE_TARGET_OPTION (existing_target);
19713 if (existing_options)
19714 cl_target_option_restore (&global_options, &global_options_set,
19715 existing_options);
19717 else
19718 cl_target_option_restore (&global_options, &global_options_set,
19719 TREE_TARGET_OPTION (target_option_current_node));
19721 ret = aarch64_process_target_version_attr (args);
19723 /* Set up any additional state. */
19724 if (ret)
19726 aarch64_override_options_internal (&global_options);
19727 new_target = build_target_option_node (&global_options,
19728 &global_options_set);
19730 else
19731 new_target = NULL;
19733 if (fndecl && ret)
19734 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19736 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19738 return ret;
19741 /* This parses the attribute arguments to target_version in DECL and returns
19742    the feature mask required to select those targets.  No adjustments are made to
19743 add or remove redundant feature requirements. */
19745 static aarch64_fmv_feature_mask
19746 get_feature_mask_for_version (tree decl)
19748 tree version_attr = lookup_attribute ("target_version",
19749 DECL_ATTRIBUTES (decl));
19750 if (version_attr == NULL)
19751 return 0;
19753 const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
19754 (version_attr)));
19755 enum aarch_parse_opt_result parse_res;
19756 aarch64_fmv_feature_mask feature_mask;
19758 parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
19759 NULL);
19761 /* We should have detected any errors before getting here. */
19762 gcc_assert (parse_res == AARCH_PARSE_OK);
19764 return feature_mask;
19767 /* Compare priorities of two feature masks. Return:
19768 1: mask1 is higher priority
19769 -1: mask2 is higher priority
19770 0: masks are equal. */
19772 static int
19773 compare_feature_masks (aarch64_fmv_feature_mask mask1,
19774 aarch64_fmv_feature_mask mask2)
19776 int pop1 = popcount_hwi (mask1);
19777 int pop2 = popcount_hwi (mask2);
19778 if (pop1 > pop2)
19779 return 1;
19780 if (pop2 > pop1)
19781 return -1;
19783 auto diff_mask = mask1 ^ mask2;
19784 if (diff_mask == 0ULL)
19785 return 0;
19786 for (int i = FEAT_MAX - 1; i > 0; i--)
19788 auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
19789 if (diff_mask & bit_mask)
19790 return (mask1 & bit_mask) ? 1 : -1;
19792 gcc_unreachable();
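/* Worked example of the ordering above: masks are ranked first by the
   number of feature bits set; ties are broken by the highest-indexed
   entry of aarch64_fmv_feature_data on which the masks differ, since
   the loop scans from FEAT_MAX - 1 downwards and stops at the first
   differing bit.  Schematically:

     popcount (mask1) > popcount (mask2)   -> 1
     popcount (mask1) < popcount (mask2)   -> -1
     equal popcount, mask1 == mask2        -> 0
     equal popcount, masks differ          -> sign given by which mask
					      owns the differing bit
					      found first in the scan  */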
19795 /* Compare priorities of two version decls. */
19798 aarch64_compare_version_priority (tree decl1, tree decl2)
19800 auto mask1 = get_feature_mask_for_version (decl1);
19801 auto mask2 = get_feature_mask_for_version (decl2);
19803 return compare_feature_masks (mask1, mask2);
19806 /* Build the struct __ifunc_arg_t type:
19808 struct __ifunc_arg_t
19810 unsigned long _size; // Size of the struct, so it can grow.
19811 unsigned long _hwcap;
19812 unsigned long _hwcap2;
19816 static tree
19817 build_ifunc_arg_type ()
19819 tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
19820 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19821 get_identifier ("_size"),
19822 long_unsigned_type_node);
19823 tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19824 get_identifier ("_hwcap"),
19825 long_unsigned_type_node);
19826 tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19827 get_identifier ("_hwcap2"),
19828 long_unsigned_type_node);
19830 DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
19831 DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
19832 DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
19834 TYPE_FIELDS (ifunc_arg_type) = field1;
19835 DECL_CHAIN (field1) = field2;
19836 DECL_CHAIN (field2) = field3;
19838 layout_type (ifunc_arg_type);
19840 tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
19841 tree pointer_type = build_pointer_type (const_type);
19843 return pointer_type;
19846 /* Make the resolver function decl to dispatch the versions of
19847 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
19848    the ifunc alias that will point to the created resolver.  Create an
19849 empty basic block in the resolver and store the pointer in
19850 EMPTY_BB. Return the decl of the resolver function. */
19852 static tree
19853 make_resolver_func (const tree default_decl,
19854 const tree ifunc_alias_decl,
19855 basic_block *empty_bb)
19857 tree decl, type, t;
19859 /* Create resolver function name based on default_decl. */
19860 tree decl_name = clone_function_name (default_decl, "resolver");
19861 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
19863 /* The resolver function should have signature
19864 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
19865 type = build_function_type_list (ptr_type_node,
19866 uint64_type_node,
19867 build_ifunc_arg_type (),
19868 NULL_TREE);
19870 decl = build_fn_decl (resolver_name, type);
19871 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
19873 DECL_NAME (decl) = decl_name;
19874 TREE_USED (decl) = 1;
19875 DECL_ARTIFICIAL (decl) = 1;
19876 DECL_IGNORED_P (decl) = 1;
19877 TREE_PUBLIC (decl) = 0;
19878 DECL_UNINLINABLE (decl) = 1;
19880 /* Resolver is not external, body is generated. */
19881 DECL_EXTERNAL (decl) = 0;
19882 DECL_EXTERNAL (ifunc_alias_decl) = 0;
19884 DECL_CONTEXT (decl) = NULL_TREE;
19885 DECL_INITIAL (decl) = make_node (BLOCK);
19886 DECL_STATIC_CONSTRUCTOR (decl) = 0;
19888 if (DECL_COMDAT_GROUP (default_decl)
19889 || TREE_PUBLIC (default_decl))
19891 /* In this case, each translation unit with a call to this
19892 versioned function will put out a resolver. Ensure it
19893 is comdat to keep just one copy. */
19894 DECL_COMDAT (decl) = 1;
19895 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
19897 else
19898 TREE_PUBLIC (ifunc_alias_decl) = 0;
19900 /* Build result decl and add to function_decl. */
19901 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
19902 DECL_CONTEXT (t) = decl;
19903 DECL_ARTIFICIAL (t) = 1;
19904 DECL_IGNORED_P (t) = 1;
19905 DECL_RESULT (decl) = t;
19907 /* Build parameter decls and add to function_decl. */
19908 tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
19909 get_identifier ("hwcap"),
19910 uint64_type_node);
19911 tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
19912 get_identifier ("arg"),
19913 build_ifunc_arg_type());
19914 DECL_CONTEXT (arg1) = decl;
19915 DECL_CONTEXT (arg2) = decl;
19916 DECL_ARTIFICIAL (arg1) = 1;
19917 DECL_ARTIFICIAL (arg2) = 1;
19918 DECL_IGNORED_P (arg1) = 1;
19919 DECL_IGNORED_P (arg2) = 1;
19920 DECL_ARG_TYPE (arg1) = uint64_type_node;
19921 DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
19922 DECL_ARGUMENTS (decl) = arg1;
19923 TREE_CHAIN (arg1) = arg2;
19925 gimplify_function_tree (decl);
19926 push_cfun (DECL_STRUCT_FUNCTION (decl));
19927 *empty_bb = init_lowered_empty_function (decl, false,
19928 profile_count::uninitialized ());
19930 cgraph_node::add_new_function (decl, true);
19931 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
19933 pop_cfun ();
19935 gcc_assert (ifunc_alias_decl != NULL);
19936 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
19937 DECL_ATTRIBUTES (ifunc_alias_decl)
19938 = make_attribute ("ifunc", resolver_name,
19939 DECL_ATTRIBUTES (ifunc_alias_decl));
19941 /* Create the alias for dispatch to resolver here. */
19942 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
19943 return decl;
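/* Sketch of the result (not emitted verbatim): for a default version
   "foo", the code above creates a resolver whose C-level shape is

     void *foo.resolver (uint64_t hwcap, const __ifunc_arg_t *arg);

   (the exact name comes from clone_function_name), marks it comdat when
   the default version is public, and turns IFUNC_ALIAS_DECL into

     __attribute__ ((ifunc ("foo.resolver")))

   so that calls to the multi-versioned function go through the resolver
   chosen at load time.  The body is filled in later by
   dispatch_function_versions.  */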
19946 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
19947    to return a pointer to VERSION_DECL if none of the feature bits specified in
19948    FEATURE_MASK are set in MASK_VAR.  This function will be called during
19949 version dispatch to decide which function version to execute. It returns
19950 the basic block at the end, to which more conditions can be added. */
19951 static basic_block
19952 add_condition_to_bb (tree function_decl, tree version_decl,
19953 aarch64_fmv_feature_mask feature_mask,
19954 tree mask_var, basic_block new_bb)
19956 gimple *return_stmt;
19957 tree convert_expr, result_var;
19958 gimple *convert_stmt;
19959 gimple *if_else_stmt;
19961 basic_block bb1, bb2, bb3;
19962 edge e12, e23;
19964 gimple_seq gseq;
19966 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
19968 gcc_assert (new_bb != NULL);
19969 gseq = bb_seq (new_bb);
19971 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
19972 build_fold_addr_expr (version_decl));
19973 result_var = create_tmp_var (ptr_type_node);
19974 convert_stmt = gimple_build_assign (result_var, convert_expr);
19975 return_stmt = gimple_build_return (result_var);
19977 if (feature_mask == 0ULL)
19979 /* Default version. */
19980 gimple_seq_add_stmt (&gseq, convert_stmt);
19981 gimple_seq_add_stmt (&gseq, return_stmt);
19982 set_bb_seq (new_bb, gseq);
19983 gimple_set_bb (convert_stmt, new_bb);
19984 gimple_set_bb (return_stmt, new_bb);
19985 pop_cfun ();
19986 return new_bb;
19989 tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
19990 tree and_expr = build2 (BIT_AND_EXPR,
19991 long_long_unsigned_type_node,
19992 mask_var,
19993 build_int_cst (long_long_unsigned_type_node,
19994 feature_mask));
19995 gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
19996 gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
19997 gimple_set_bb (and_stmt, new_bb);
19998 gimple_seq_add_stmt (&gseq, and_stmt);
20000 tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
20001 if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
20002 NULL_TREE, NULL_TREE);
20003 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
20004 gimple_set_bb (if_else_stmt, new_bb);
20005 gimple_seq_add_stmt (&gseq, if_else_stmt);
20007 gimple_seq_add_stmt (&gseq, convert_stmt);
20008 gimple_seq_add_stmt (&gseq, return_stmt);
20009 set_bb_seq (new_bb, gseq);
20011 bb1 = new_bb;
20012 e12 = split_block (bb1, if_else_stmt);
20013 bb2 = e12->dest;
20014 e12->flags &= ~EDGE_FALLTHRU;
20015 e12->flags |= EDGE_TRUE_VALUE;
20017 e23 = split_block (bb2, return_stmt);
20019 gimple_set_bb (convert_stmt, bb2);
20020 gimple_set_bb (return_stmt, bb2);
20022 bb3 = e23->dest;
20023 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
20025 remove_edge (e23);
20026 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
20028 pop_cfun ();
20030 return bb3;
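/* In C-like pseudocode, each call for a non-default version appends

     tmp = MASK_VAR & FEATURE_MASK;    // MASK_VAR is ~__aarch64_cpu_features.features
     if (tmp == 0)                     // every required feature is present
       return (void *) &VERSION_DECL;

   to the resolver and returns the fall-through block, so that the next
   (lower-priority) version can append its own test.  For the default
   version (FEATURE_MASK == 0) the pointer is returned unconditionally.  */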
20033 /* This function generates the dispatch function for
20034 multi-versioned functions. DISPATCH_DECL is the function which will
20035 contain the dispatch logic. FNDECLS are the function choices for
20036 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20037 in DISPATCH_DECL in which the dispatch code is generated. */
20039 static int
20040 dispatch_function_versions (tree dispatch_decl,
20041 void *fndecls_p,
20042 basic_block *empty_bb)
20044 gimple *ifunc_cpu_init_stmt;
20045 gimple_seq gseq;
20046 vec<tree> *fndecls;
20048 gcc_assert (dispatch_decl != NULL
20049 && fndecls_p != NULL
20050 && empty_bb != NULL);
20052 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
20054 gseq = bb_seq (*empty_bb);
20055 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20056    constructors, so explicitly call __init_cpu_features_resolver here. */
20057 tree init_fn_type = build_function_type_list (void_type_node,
20058 long_unsigned_type_node,
20059 build_ifunc_arg_type(),
20060 NULL);
20061 tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
20062 tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
20063 init_fn_id, init_fn_type);
20064 tree arg1 = DECL_ARGUMENTS (dispatch_decl);
20065 tree arg2 = TREE_CHAIN (arg1);
20066 ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
20067 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
20068 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
20070 /* Build the struct type for __aarch64_cpu_features. */
20071 tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
20072 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20073 get_identifier ("features"),
20074 long_long_unsigned_type_node);
20075 DECL_FIELD_CONTEXT (field1) = global_type;
20076 TYPE_FIELDS (global_type) = field1;
20077 layout_type (global_type);
20079 tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
20080 get_identifier ("__aarch64_cpu_features"),
20081 global_type);
20082 DECL_EXTERNAL (global_var) = 1;
20083 tree mask_var = create_tmp_var (long_long_unsigned_type_node);
20085 tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
20086 global_var, field1, NULL_TREE);
20087 gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
20088 gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
20089 gimple_set_bb (component_stmt, *empty_bb);
20090 gimple_seq_add_stmt (&gseq, component_stmt);
20092 tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
20093 gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
20094 gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
20095 gimple_set_bb (not_stmt, *empty_bb);
20096 gimple_seq_add_stmt (&gseq, not_stmt);
20098 set_bb_seq (*empty_bb, gseq);
20100 pop_cfun ();
20102 /* fndecls_p is actually a vector. */
20103 fndecls = static_cast<vec<tree> *> (fndecls_p);
20105 /* At least one more version other than the default. */
20106 unsigned int num_versions = fndecls->length ();
20107 gcc_assert (num_versions >= 2);
20109 struct function_version_info
20111 tree version_decl;
20112 aarch64_fmv_feature_mask feature_mask;
20113 } *function_versions;
20115 function_versions = (struct function_version_info *)
20116 XNEWVEC (struct function_version_info, (num_versions));
20118 unsigned int actual_versions = 0;
20120 for (tree version_decl : *fndecls)
20122 aarch64_fmv_feature_mask feature_mask;
20123 /* Get attribute string, parse it and find the right features. */
20124 feature_mask = get_feature_mask_for_version (version_decl);
20125 function_versions [actual_versions].version_decl = version_decl;
20126 function_versions [actual_versions].feature_mask = feature_mask;
20127 actual_versions++;
20130 auto compare_feature_version_info = [](const void *p1, const void *p2) {
20131 const function_version_info v1 = *(const function_version_info *)p1;
20132 const function_version_info v2 = *(const function_version_info *)p2;
20133 return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
20136 /* Sort the versions according to descending order of dispatch priority. */
20137 qsort (function_versions, actual_versions,
20138 sizeof (struct function_version_info), compare_feature_version_info);
20140 for (unsigned int i = 0; i < actual_versions; ++i)
20141 *empty_bb = add_condition_to_bb (dispatch_decl,
20142 function_versions[i].version_decl,
20143 function_versions[i].feature_mask,
20144 mask_var,
20145 *empty_bb);
20147 free (function_versions);
20148 return 0;
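/* Putting the pieces together, the generated resolver behaves like this
   C sketch (names illustrative):

     __init_cpu_features_resolver (hwcap, arg);
     unsigned long long m = ~__aarch64_cpu_features.features;
     if ((m & mask_of_highest_priority_version) == 0)
       return &that_version;
     ...                               // one test per remaining version
     return &default_version;          // feature mask 0 always matches

   Versions are tried in decreasing dispatch priority because of the
   qsort above, and the default version (empty mask) sorts last.  */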
20151 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20153 tree
20154 aarch64_generate_version_dispatcher_body (void *node_p)
20156 tree resolver_decl;
20157 basic_block empty_bb;
20158 tree default_ver_decl;
20159 struct cgraph_node *versn;
20160 struct cgraph_node *node;
20162 struct cgraph_function_version_info *node_version_info = NULL;
20163 struct cgraph_function_version_info *versn_info = NULL;
20165 node = (cgraph_node *)node_p;
20167 node_version_info = node->function_version ();
20168 gcc_assert (node->dispatcher_function
20169 && node_version_info != NULL);
20171 if (node_version_info->dispatcher_resolver)
20172 return node_version_info->dispatcher_resolver;
20174 /* The first version in the chain corresponds to the default version. */
20175 default_ver_decl = node_version_info->next->this_node->decl;
20177 /* node is going to be an alias, so remove the finalized bit. */
20178 node->definition = false;
20180 resolver_decl = make_resolver_func (default_ver_decl,
20181 node->decl, &empty_bb);
20183 node_version_info->dispatcher_resolver = resolver_decl;
20185 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
20187 auto_vec<tree, 2> fn_ver_vec;
20189 for (versn_info = node_version_info->next; versn_info;
20190 versn_info = versn_info->next)
20192 versn = versn_info->this_node;
20193 /* Check for virtual functions here again, as by this time it should
20194 have been determined if this function needs a vtable index or
20195 not. This happens for methods in derived classes that override
20196 virtual methods in base classes but are not explicitly marked as
20197 virtual. */
20198 if (DECL_VINDEX (versn->decl))
20199 sorry ("virtual function multiversioning not supported");
20201 fn_ver_vec.safe_push (versn->decl);
20204 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
20205 cgraph_edge::rebuild_edges ();
20206 pop_cfun ();
20207 return resolver_decl;
20210 /* Make a dispatcher declaration for the multi-versioned function DECL.
20211 Calls to DECL function will be replaced with calls to the dispatcher
20212 by the front-end. Returns the decl of the dispatcher function. */
20214 tree
20215 aarch64_get_function_versions_dispatcher (void *decl)
20217 tree fn = (tree) decl;
20218 struct cgraph_node *node = NULL;
20219 struct cgraph_node *default_node = NULL;
20220 struct cgraph_function_version_info *node_v = NULL;
20221 struct cgraph_function_version_info *first_v = NULL;
20223 tree dispatch_decl = NULL;
20225 struct cgraph_function_version_info *default_version_info = NULL;
20227 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
20229 node = cgraph_node::get (fn);
20230 gcc_assert (node != NULL);
20232 node_v = node->function_version ();
20233 gcc_assert (node_v != NULL);
20235 if (node_v->dispatcher_resolver != NULL)
20236 return node_v->dispatcher_resolver;
20238 /* Find the default version and make it the first node. */
20239 first_v = node_v;
20240 /* Go to the beginning of the chain. */
20241 while (first_v->prev != NULL)
20242 first_v = first_v->prev;
20243 default_version_info = first_v;
20244 while (default_version_info != NULL)
20246 if (get_feature_mask_for_version
20247 (default_version_info->this_node->decl) == 0ULL)
20248 break;
20249 default_version_info = default_version_info->next;
20252 /* If there is no default node, just return NULL. */
20253 if (default_version_info == NULL)
20254 return NULL;
20256 /* Make default info the first node. */
20257 if (first_v != default_version_info)
20259 default_version_info->prev->next = default_version_info->next;
20260 if (default_version_info->next)
20261 default_version_info->next->prev = default_version_info->prev;
20262 first_v->prev = default_version_info;
20263 default_version_info->next = first_v;
20264 default_version_info->prev = NULL;
20267 default_node = default_version_info->this_node;
20269 if (targetm.has_ifunc_p ())
20271 struct cgraph_function_version_info *it_v = NULL;
20272 struct cgraph_node *dispatcher_node = NULL;
20273 struct cgraph_function_version_info *dispatcher_version_info = NULL;
20275 /* Right now, the dispatching is done via ifunc. */
20276 dispatch_decl = make_dispatcher_decl (default_node->decl);
20277 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
20279 dispatcher_node = cgraph_node::get_create (dispatch_decl);
20280 gcc_assert (dispatcher_node != NULL);
20281 dispatcher_node->dispatcher_function = 1;
20282 dispatcher_version_info
20283 = dispatcher_node->insert_new_function_version ();
20284 dispatcher_version_info->next = default_version_info;
20285 dispatcher_node->definition = 1;
20287 /* Set the dispatcher for all the versions. */
20288 it_v = default_version_info;
20289 while (it_v != NULL)
20291 it_v->dispatcher_resolver = dispatch_decl;
20292 it_v = it_v->next;
20295 else
20297 error_at (DECL_SOURCE_LOCATION (default_node->decl),
20298 "multiversioning needs %<ifunc%> which is not supported "
20299 "on this target");
20302 return dispatch_decl;
20305 /* This function returns true if FN1 and FN2 are versions of the same function,
20306 that is, the target_version attributes of the function decls are different.
20307 This assumes that FN1 and FN2 have the same signature. */
20309 bool
20310 aarch64_common_function_versions (tree fn1, tree fn2)
20312 if (TREE_CODE (fn1) != FUNCTION_DECL
20313 || TREE_CODE (fn2) != FUNCTION_DECL)
20314 return false;
20316 return (aarch64_compare_version_priority (fn1, fn2) != 0);
20319 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20320 suffixes. */
20322 tree
20323 aarch64_mangle_decl_assembler_name (tree decl, tree id)
20325 /* For function version, add the target suffix to the assembler name. */
20326 if (TREE_CODE (decl) == FUNCTION_DECL
20327 && DECL_FUNCTION_VERSIONED (decl))
20329 aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);
20331 /* No suffix for the default version. */
20332 if (feature_mask == 0ULL)
20333 return id;
20335 std::string name = IDENTIFIER_POINTER (id);
20336 name += "._";
20338 for (int i = 0; i < FEAT_MAX; i++)
20340 if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
20342 name += "M";
20343 name += aarch64_fmv_feature_data[i].name;
20347 if (DECL_ASSEMBLER_NAME_SET_P (decl))
20348 SET_DECL_RTL (decl, NULL);
20350 id = get_identifier (name.c_str());
20352 return id;
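/* For example (illustrative feature names), a version declared with
   target_version ("dotprod+sve2") and base assembler name "foo" gets a
   suffix of the form

     foo._MdotprodMsve2

   with one "M<feature>" group per selected feature, emitted in the
   order the features appear in aarch64_fmv_feature_data (so the exact
   suffix follows aarch64-option-extensions.def, not the attribute
   string).  The default version keeps the plain name "foo".  */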
20355 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20356 rather than an opt-in list. */
20358 static bool
20359 aarch64_function_attribute_inlinable_p (const_tree fndecl)
20361 /* A function that has local SME state cannot be inlined into its caller,
20362 since we only support managing PSTATE.ZA switches at function scope. */
20363 return (!aarch64_fndecl_has_new_state (fndecl, "za")
20364 && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
20367 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20368 tri-bool options (yes, no, don't care) and the default value is
20369 DEF, determine whether to reject inlining. */
20371 static bool
20372 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
20373 int dont_care, int def)
20375 /* If the callee doesn't care, always allow inlining. */
20376 if (callee == dont_care)
20377 return true;
20379 /* If the caller doesn't care, always allow inlining. */
20380 if (caller == dont_care)
20381 return true;
20383 /* Otherwise, allow inlining if either the callee and caller values
20384 agree, or if the callee is using the default value. */
20385 return (callee == caller || callee == def);
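/* For example, with DONT_CARE == 2 and DEF == 1 (the values used for
   -momit-leaf-frame-pointer below):

     aarch64_tribools_ok_for_inlining_p (0, 2, 2, 1) -> true   (callee doesn't care)
     aarch64_tribools_ok_for_inlining_p (2, 0, 2, 1) -> true   (caller doesn't care)
     aarch64_tribools_ok_for_inlining_p (1, 1, 2, 1) -> true   (settings agree)
     aarch64_tribools_ok_for_inlining_p (0, 1, 2, 1) -> true   (callee uses the default)
     aarch64_tribools_ok_for_inlining_p (1, 0, 2, 1) -> false  (explicit mismatch)  */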
20388 /* Bit allocations for ipa_fn_summary::target_info. */
20390 /* Set if the function contains a stmt that relies on the function's
20391 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20392 Not meaningful for streaming-compatible functions. */
20393 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
20395 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20396 have ZA state. */
20397 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
20398 constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20400 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20402 static bool
20403 aarch64_need_ipa_fn_target_info (const_tree, unsigned int &)
20405 /* We could in principle skip this for streaming-compatible functions
20406 that have ZA state, but that's a rare combination. */
20407 return true;
20410 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20412 static bool
20413 aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
20415 if (auto *ga = dyn_cast<const gasm *> (stmt))
20417 /* We don't know what the asm does, so conservatively assume that
20418 it requires the function's current SM mode. */
20419 info |= AARCH64_IPA_SM_FIXED;
20420 for (unsigned int i = 0; i < gimple_asm_nclobbers (ga); ++i)
20422 tree op = gimple_asm_clobber_op (ga, i);
20423 const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
20424 if (strcmp (clobber, "za") == 0)
20425 info |= AARCH64_IPA_CLOBBERS_ZA;
20426 if (strcmp (clobber, "zt0") == 0)
20427 info |= AARCH64_IPA_CLOBBERS_ZT0;
20430 if (auto *call = dyn_cast<const gcall *> (stmt))
20432 if (gimple_call_builtin_p (call, BUILT_IN_MD))
20434 /* The attributes on AArch64 builtins are supposed to be accurate.
20435 If the function isn't marked streaming-compatible then it
20436 needs whichever SM mode it selects. */
20437 tree decl = gimple_call_fndecl (call);
20438 if (aarch64_fndecl_pstate_sm (decl) != 0)
20439 info |= AARCH64_IPA_SM_FIXED;
20442 return true;
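/* For instance, a statement such as

     asm volatile ("" ::: "za");

   marks the containing function as both AARCH64_IPA_SM_FIXED and
   AARCH64_IPA_CLOBBERS_ZA, while a call to an AArch64 builtin that is
   not streaming-compatible only adds AARCH64_IPA_SM_FIXED.  */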
20445 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20446 to inline CALLEE into CALLER based on target-specific info.
20447 Make sure that the caller and callee have compatible architectural
20448 features. Then go through the other possible target attributes
20449 and see if they can block inlining. Try not to reject always_inline
20450 callees unless they are incompatible architecturally. */
20452 static bool
20453 aarch64_can_inline_p (tree caller, tree callee)
20455 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
20456 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
20458 struct cl_target_option *caller_opts
20459 = TREE_TARGET_OPTION (caller_tree ? caller_tree
20460 : target_option_default_node);
20462 struct cl_target_option *callee_opts
20463 = TREE_TARGET_OPTION (callee_tree ? callee_tree
20464 : target_option_default_node);
20466 /* Callee's ISA flags should be a subset of the caller's. */
20467 auto caller_asm_isa = (caller_opts->x_aarch64_asm_isa_flags
20468 & ~AARCH64_FL_ISA_MODES);
20469 auto callee_asm_isa = (callee_opts->x_aarch64_asm_isa_flags
20470 & ~AARCH64_FL_ISA_MODES);
20471 if (callee_asm_isa & ~caller_asm_isa)
20472 return false;
20474 auto caller_isa = (caller_opts->x_aarch64_isa_flags
20475 & ~AARCH64_FL_ISA_MODES);
20476 auto callee_isa = (callee_opts->x_aarch64_isa_flags
20477 & ~AARCH64_FL_ISA_MODES);
20478 if (callee_isa & ~caller_isa)
20479 return false;
20481 /* Return true if the callee might have target_info property PROPERTY.
20482 The answer must be true unless we have positive proof to the contrary. */
20483 auto callee_has_property = [&](unsigned int property)
20485 if (ipa_fn_summaries)
20486 if (auto *summary = ipa_fn_summaries->get (cgraph_node::get (callee)))
20487 if (!(summary->target_info & property))
20488 return false;
20489 return true;
20492 /* Streaming-compatible code can be inlined into functions with any
20493 PSTATE.SM mode. Otherwise the caller and callee must agree on
20494 PSTATE.SM mode, unless we can prove that the callee is naturally
20495 streaming-compatible. */
20496 auto caller_sm = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20497 auto callee_sm = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20498 if (callee_sm
20499 && caller_sm != callee_sm
20500 && callee_has_property (AARCH64_IPA_SM_FIXED))
20501 return false;
20503 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20504 functions from being inlined into others. We also need to prevent
20505 inlining of shared-ZA functions into functions without ZA state,
20506 since this is an error condition.
20508 The only other problematic case for ZA is inlining a function that
20509 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20510 auto caller_za = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20511 auto callee_za = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20512 if (!caller_za && callee_za)
20513 return false;
20514 if (!callee_za
20515 && aarch64_fndecl_has_state (caller, "za")
20516 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
20517 return false;
20518 if (!callee_za
20519 && aarch64_fndecl_has_state (caller, "zt0")
20520 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
20521 return false;
20523 /* Allow non-strict aligned functions inlining into strict
20524 aligned ones. */
20525 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
20526 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
20527 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
20528 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
20529 return false;
20531 bool always_inline = lookup_attribute ("always_inline",
20532 DECL_ATTRIBUTES (callee));
20534 /* If the architectural features match up and the callee is always_inline
20535 then the other attributes don't matter. */
20536 if (always_inline)
20537 return true;
20539 if (caller_opts->x_aarch64_cmodel_var
20540 != callee_opts->x_aarch64_cmodel_var)
20541 return false;
20543 if (caller_opts->x_aarch64_tls_dialect
20544 != callee_opts->x_aarch64_tls_dialect)
20545 return false;
20547  /* Honour explicit requests to work around errata. */
20548 if (!aarch64_tribools_ok_for_inlining_p (
20549 caller_opts->x_aarch64_fix_a53_err835769,
20550 callee_opts->x_aarch64_fix_a53_err835769,
20551 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
20552 return false;
20554 if (!aarch64_tribools_ok_for_inlining_p (
20555 caller_opts->x_aarch64_fix_a53_err843419,
20556 callee_opts->x_aarch64_fix_a53_err843419,
20557 2, TARGET_FIX_ERR_A53_843419))
20558 return false;
20560 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20561     caller and callee and they don't match up, reject inlining. */
20562 if (!aarch64_tribools_ok_for_inlining_p (
20563 caller_opts->x_flag_omit_leaf_frame_pointer,
20564 callee_opts->x_flag_omit_leaf_frame_pointer,
20565 2, 1))
20566 return false;
20568 /* If the callee has specific tuning overrides, respect them. */
20569 if (callee_opts->x_aarch64_override_tune_string != NULL
20570 && caller_opts->x_aarch64_override_tune_string == NULL)
20571 return false;
20573 /* If the user specified tuning override strings for the
20574 caller and callee and they don't match up, reject inlining.
20575 We just do a string compare here, we don't analyze the meaning
20576 of the string, as it would be too costly for little gain. */
20577 if (callee_opts->x_aarch64_override_tune_string
20578 && caller_opts->x_aarch64_override_tune_string
20579 && (strcmp (callee_opts->x_aarch64_override_tune_string,
20580 caller_opts->x_aarch64_override_tune_string) != 0))
20581 return false;
20583 return true;
20586 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
20587    hasn't been already. */
20589 arm_pcs
20590 aarch64_tlsdesc_abi_id ()
20592 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
20593 if (!tlsdesc_abi.initialized_p ())
20595 HARD_REG_SET full_reg_clobbers;
20596 CLEAR_HARD_REG_SET (full_reg_clobbers);
20597 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
20598 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
20599 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
20600 SET_HARD_REG_BIT (full_reg_clobbers, regno);
20601 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
20603 return ARM_PCS_TLSDESC;
20606 /* Return true if SYMBOL_REF X binds locally. */
20608 static bool
20609 aarch64_symbol_binds_local_p (const_rtx x)
20611 return (SYMBOL_REF_DECL (x)
20612 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
20613 : SYMBOL_REF_LOCAL_P (x));
20616 /* Return true if SYMBOL_REF X is thread local */
20617 static bool
20618 aarch64_tls_symbol_p (rtx x)
20620 if (! TARGET_HAVE_TLS)
20621 return false;
20623 x = strip_salt (x);
20624 if (!SYMBOL_REF_P (x))
20625 return false;
20627 return SYMBOL_REF_TLS_MODEL (x) != 0;
20630 /* Classify a TLS symbol into one of the TLS kinds. */
20631 enum aarch64_symbol_type
20632 aarch64_classify_tls_symbol (rtx x)
20634 enum tls_model tls_kind = tls_symbolic_operand_type (x);
20636 switch (tls_kind)
20638 case TLS_MODEL_GLOBAL_DYNAMIC:
20639 case TLS_MODEL_LOCAL_DYNAMIC:
20640 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
20642 case TLS_MODEL_INITIAL_EXEC:
20643 switch (aarch64_cmodel)
20645 case AARCH64_CMODEL_TINY:
20646 case AARCH64_CMODEL_TINY_PIC:
20647 return SYMBOL_TINY_TLSIE;
20648 default:
20649 return SYMBOL_SMALL_TLSIE;
20652 case TLS_MODEL_LOCAL_EXEC:
20653 if (aarch64_tls_size == 12)
20654 return SYMBOL_TLSLE12;
20655 else if (aarch64_tls_size == 24)
20656 return SYMBOL_TLSLE24;
20657 else if (aarch64_tls_size == 32)
20658 return SYMBOL_TLSLE32;
20659 else if (aarch64_tls_size == 48)
20660 return SYMBOL_TLSLE48;
20661 else
20662 gcc_unreachable ();
20664 case TLS_MODEL_EMULATED:
20665 case TLS_MODEL_NONE:
20666 return SYMBOL_FORCE_TO_MEM;
20668 default:
20669 gcc_unreachable ();
20673 /* Return the correct method for accessing X + OFFSET, where X is either
20674 a SYMBOL_REF or LABEL_REF. */
20676 enum aarch64_symbol_type
20677 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
20679 x = strip_salt (x);
20681 if (LABEL_REF_P (x))
20683 switch (aarch64_cmodel)
20685 case AARCH64_CMODEL_LARGE:
20686 return SYMBOL_FORCE_TO_MEM;
20688 case AARCH64_CMODEL_TINY_PIC:
20689 case AARCH64_CMODEL_TINY:
20690 return SYMBOL_TINY_ABSOLUTE;
20692 case AARCH64_CMODEL_SMALL_SPIC:
20693 case AARCH64_CMODEL_SMALL_PIC:
20694 case AARCH64_CMODEL_SMALL:
20695 return SYMBOL_SMALL_ABSOLUTE;
20697 default:
20698 gcc_unreachable ();
20702 if (SYMBOL_REF_P (x))
20704 if (aarch64_tls_symbol_p (x))
20705 return aarch64_classify_tls_symbol (x);
20707 switch (aarch64_cmodel)
20709 case AARCH64_CMODEL_TINY_PIC:
20710 case AARCH64_CMODEL_TINY:
20711 /* With -fPIC non-local symbols use the GOT. For orthogonality
20712 always use the GOT for extern weak symbols. */
20713 if ((flag_pic || SYMBOL_REF_WEAK (x))
20714 && !aarch64_symbol_binds_local_p (x))
20715 return SYMBOL_TINY_GOT;
20717 /* When we retrieve symbol + offset address, we have to make sure
20718 the offset does not cause overflow of the final address. But
20719 we have no way of knowing the address of symbol at compile time
20720 so we can't accurately say if the distance between the PC and
20721      symbol + offset is outside the addressable range of +/-1MB in the
20722 TINY code model. So we limit the maximum offset to +/-64KB and
20723 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
20724 If offset_within_block_p is true we allow larger offsets. */
20725 if (!(IN_RANGE (offset, -0x10000, 0x10000)
20726 || offset_within_block_p (x, offset)))
20727 return SYMBOL_FORCE_TO_MEM;
20729 return SYMBOL_TINY_ABSOLUTE;
20732 case AARCH64_CMODEL_SMALL_SPIC:
20733 case AARCH64_CMODEL_SMALL_PIC:
20734 case AARCH64_CMODEL_SMALL:
20735 if ((flag_pic || SYMBOL_REF_WEAK (x))
20736 && !aarch64_symbol_binds_local_p (x))
20737 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
20738 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
20740 /* Same reasoning as the tiny code model, but the offset cap here is
20741 1MB, allowing +/-3.9GB for the offset to the symbol. */
20742 if (!(IN_RANGE (offset, -0x100000, 0x100000)
20743 || offset_within_block_p (x, offset)))
20744 return SYMBOL_FORCE_TO_MEM;
20746 return SYMBOL_SMALL_ABSOLUTE;
20748 case AARCH64_CMODEL_LARGE:
20749 /* This is alright even in PIC code as the constant
20750 pool reference is always PC relative and within
20751 the same translation unit. */
20752 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
20753 return SYMBOL_SMALL_ABSOLUTE;
20754 else
20755 return SYMBOL_FORCE_TO_MEM;
20757 default:
20758 gcc_unreachable ();
20762 /* By default push everything into the constant pool. */
20763 return SYMBOL_FORCE_TO_MEM;
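/* Example for the small code model: an address like SYM + 0x80000 for a
   locally binding symbol is classified SYMBOL_SMALL_ABSOLUTE (the offset
   is within the +/-1MB cap), whereas SYM + 0x200000 is forced to the
   constant pool unless offset_within_block_p can prove the address stays
   inside SYM's own object.  The tiny model applies the same logic with a
   +/-64KB cap.  */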
20766 bool
20767 aarch64_constant_address_p (rtx x)
20769 return (CONSTANT_P (x) && memory_address_p (DImode, x));
20772 bool
20773 aarch64_legitimate_pic_operand_p (rtx x)
20775 poly_int64 offset;
20776 x = strip_offset_and_salt (x, &offset);
20777 if (SYMBOL_REF_P (x))
20778 return false;
20780 return true;
20783 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
20784 that should be rematerialized rather than spilled. */
20786 static bool
20787 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
20789 /* Support CSE and rematerialization of common constants. */
20790 if (CONST_INT_P (x)
20791 || CONST_DOUBLE_P (x))
20792 return true;
20794 /* Only accept variable-length vector constants if they can be
20795 handled directly.
20797 ??? It would be possible (but complex) to handle rematerialization
20798 of other constants via secondary reloads. */
20799 if (!GET_MODE_SIZE (mode).is_constant ())
20800 return aarch64_simd_valid_immediate (x, NULL);
20802 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
20803 least be forced to memory and loaded from there. */
20804 if (CONST_VECTOR_P (x))
20805 return !targetm.cannot_force_const_mem (mode, x);
20807 /* Do not allow vector struct mode constants for Advanced SIMD.
20808 We could support 0 and -1 easily, but they need support in
20809 aarch64-simd.md. */
20810 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20811 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
20812 return false;
20814 if (GET_CODE (x) == HIGH)
20815 x = XEXP (x, 0);
20817 /* Accept polynomial constants that can be calculated by using the
20818 destination of a move as the sole temporary. Constants that
20819 require a second temporary cannot be rematerialized (they can't be
20820 forced to memory and also aren't legitimate constants). */
20821 poly_int64 offset;
20822 if (poly_int_rtx_p (x, &offset))
20823 return aarch64_offset_temporaries (false, offset) <= 1;
20825 /* If an offset is being added to something else, we need to allow the
20826 base to be moved into the destination register, meaning that there
20827 are no free temporaries for the offset. */
20828 x = strip_offset_and_salt (x, &offset);
20829 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
20830 return false;
20832 /* Do not allow const (plus (anchor_symbol, const_int)). */
20833 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
20834 return false;
20836 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
20837 so spilling them is better than rematerialization. */
20838 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
20839 return true;
20841 /* Label references are always constant. */
20842 if (LABEL_REF_P (x))
20843 return true;
20845 return false;
20849 aarch64_load_tp (rtx target)
20851 if (!target
20852 || GET_MODE (target) != Pmode
20853 || !register_operand (target, Pmode))
20854 target = gen_reg_rtx (Pmode);
20856 /* Can return in any reg. */
20857 emit_insn (gen_aarch64_load_tp_hard (target));
20858 return target;
20861 /* On AAPCS systems, this is the "struct __va_list". */
20862 static GTY(()) tree va_list_type;
20864 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
20865 Return the type to use as __builtin_va_list.
20867 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
20869 struct __va_list
20871 void *__stack;
20872 void *__gr_top;
20873 void *__vr_top;
20874 int __gr_offs;
20875 int __vr_offs;
20876 }; */
20878 static tree
20879 aarch64_build_builtin_va_list (void)
20881 tree va_list_name;
20882 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
20884 /* Create the type. */
20885 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
20886 /* Give it the required name. */
20887 va_list_name = build_decl (BUILTINS_LOCATION,
20888 TYPE_DECL,
20889 get_identifier ("__va_list"),
20890 va_list_type);
20891 DECL_ARTIFICIAL (va_list_name) = 1;
20892 TYPE_NAME (va_list_type) = va_list_name;
20893 TYPE_STUB_DECL (va_list_type) = va_list_name;
20895 /* Create the fields. */
20896 f_stack = build_decl (BUILTINS_LOCATION,
20897 FIELD_DECL, get_identifier ("__stack"),
20898 ptr_type_node);
20899 f_grtop = build_decl (BUILTINS_LOCATION,
20900 FIELD_DECL, get_identifier ("__gr_top"),
20901 ptr_type_node);
20902 f_vrtop = build_decl (BUILTINS_LOCATION,
20903 FIELD_DECL, get_identifier ("__vr_top"),
20904 ptr_type_node);
20905 f_groff = build_decl (BUILTINS_LOCATION,
20906 FIELD_DECL, get_identifier ("__gr_offs"),
20907 integer_type_node);
20908 f_vroff = build_decl (BUILTINS_LOCATION,
20909 FIELD_DECL, get_identifier ("__vr_offs"),
20910 integer_type_node);
20912 /* Tell tree-stdarg pass about our internal offset fields.
20913      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
20914      purposes, to identify whether the code is updating the va_list internal
20915      offset fields in an irregular way. */
20916 va_list_gpr_counter_field = f_groff;
20917 va_list_fpr_counter_field = f_vroff;
20919 DECL_ARTIFICIAL (f_stack) = 1;
20920 DECL_ARTIFICIAL (f_grtop) = 1;
20921 DECL_ARTIFICIAL (f_vrtop) = 1;
20922 DECL_ARTIFICIAL (f_groff) = 1;
20923 DECL_ARTIFICIAL (f_vroff) = 1;
20925 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
20926 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
20927 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
20928 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
20929 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
20931 TYPE_FIELDS (va_list_type) = f_stack;
20932 DECL_CHAIN (f_stack) = f_grtop;
20933 DECL_CHAIN (f_grtop) = f_vrtop;
20934 DECL_CHAIN (f_vrtop) = f_groff;
20935 DECL_CHAIN (f_groff) = f_vroff;
20937 /* Compute its layout. */
20938 layout_type (va_list_type);
20940 return va_list_type;
20943 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
20944 static void
20945 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
20947 const CUMULATIVE_ARGS *cum;
20948 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
20949 tree stack, grtop, vrtop, groff, vroff;
20950 tree t;
20951 int gr_save_area_size = cfun->va_list_gpr_size;
20952 int vr_save_area_size = cfun->va_list_fpr_size;
20953 int vr_offset;
20955 cum = &crtl->args.info;
20956 if (cfun->va_list_gpr_size)
20957 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
20958 cfun->va_list_gpr_size);
20959 if (cfun->va_list_fpr_size)
20960 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
20961 * UNITS_PER_VREG, cfun->va_list_fpr_size);
20963 if (!TARGET_FLOAT)
20965 gcc_assert (cum->aapcs_nvrn == 0);
20966 vr_save_area_size = 0;
20969 f_stack = TYPE_FIELDS (va_list_type_node);
20970 f_grtop = DECL_CHAIN (f_stack);
20971 f_vrtop = DECL_CHAIN (f_grtop);
20972 f_groff = DECL_CHAIN (f_vrtop);
20973 f_vroff = DECL_CHAIN (f_groff);
20975 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
20976 NULL_TREE);
20977 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
20978 NULL_TREE);
20979 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
20980 NULL_TREE);
20981 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
20982 NULL_TREE);
20983 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
20984 NULL_TREE);
20986 /* Emit code to initialize STACK, which points to the next varargs stack
20987 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
20988 by named arguments. STACK is 8-byte aligned. */
20989 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
20990 if (cum->aapcs_stack_size > 0)
20991 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
20992 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
20993 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
20995 /* Emit code to initialize GRTOP, the top of the GR save area.
20996 virtual_incoming_args_rtx should have been 16 byte aligned. */
20997 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
20998 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
20999 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21001 /* Emit code to initialize VRTOP, the top of the VR save area.
21002 This address is gr_save_area_bytes below GRTOP, rounded
21003 down to the next 16-byte boundary. */
21004 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
21005 vr_offset = ROUND_UP (gr_save_area_size,
21006 STACK_BOUNDARY / BITS_PER_UNIT);
21008 if (vr_offset)
21009 t = fold_build_pointer_plus_hwi (t, -vr_offset);
21010 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
21011 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21013 /* Emit code to initialize GROFF, the offset from GRTOP of the
21014 next GPR argument. */
21015 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
21016 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
21017 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21019  /* Likewise emit code to initialize VROFF, the offset from VRTOP
21020 of the next VR argument. */
21021 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
21022 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
21023 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
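/* Net effect of the code above, as a C-level sketch (AP is the va_list
   being initialized, INCOMING stands for virtual_incoming_args_rtx):

     ap.__stack   = INCOMING + aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = INCOMING;
     ap.__vr_top  = INCOMING - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;

   so both offsets count upwards towards zero as registers are consumed
   by va_arg.  */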
21026 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21028 static tree
21029 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
21030 gimple_seq *post_p ATTRIBUTE_UNUSED)
21032 tree addr;
21033 bool indirect_p;
21034 bool is_ha; /* is HFA or HVA. */
21035 bool dw_align; /* double-word align. */
21036 machine_mode ag_mode = VOIDmode;
21037 int nregs;
21038 machine_mode mode;
21040 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21041 tree stack, f_top, f_off, off, arg, roundup, on_stack;
21042 HOST_WIDE_INT size, rsize, adjust, align;
21043 tree t, u, cond1, cond2;
21045 indirect_p = pass_va_arg_by_reference (type);
21046 if (indirect_p)
21047 type = build_pointer_type (type);
21049 mode = TYPE_MODE (type);
21051 f_stack = TYPE_FIELDS (va_list_type_node);
21052 f_grtop = DECL_CHAIN (f_stack);
21053 f_vrtop = DECL_CHAIN (f_grtop);
21054 f_groff = DECL_CHAIN (f_vrtop);
21055 f_vroff = DECL_CHAIN (f_groff);
21057 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
21058 f_stack, NULL_TREE);
21059 size = int_size_in_bytes (type);
21061 unsigned int abi_break_gcc_9;
21062 unsigned int abi_break_gcc_13;
21063 unsigned int abi_break_gcc_14;
21064 align
21065 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
21066 &abi_break_gcc_13, &abi_break_gcc_14)
21067 / BITS_PER_UNIT;
21069 dw_align = false;
21070 adjust = 0;
21071 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
21072 &is_ha, false))
21074 /* No frontends can create types with variable-sized modes, so we
21075 shouldn't be asked to pass or return them. */
21076 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
21078 /* TYPE passed in fp/simd registers. */
21079 if (!TARGET_FLOAT)
21080 aarch64_err_no_fpadvsimd (mode);
21082 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
21083 unshare_expr (valist), f_vrtop, NULL_TREE);
21084 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
21085 unshare_expr (valist), f_vroff, NULL_TREE);
21087 rsize = nregs * UNITS_PER_VREG;
21089 if (is_ha)
21091 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
21092 adjust = UNITS_PER_VREG - ag_size;
21094 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21095 && size < UNITS_PER_VREG)
21097 adjust = UNITS_PER_VREG - size;
21100 else
21102 /* TYPE passed in general registers. */
21103 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
21104 unshare_expr (valist), f_grtop, NULL_TREE);
21105 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
21106 unshare_expr (valist), f_groff, NULL_TREE);
21107 rsize = ROUND_UP (size, UNITS_PER_WORD);
21108 nregs = rsize / UNITS_PER_WORD;
21110 if (align <= 8 && abi_break_gcc_13 && warn_psabi)
21111 inform (input_location, "parameter passing for argument of type "
21112 "%qT changed in GCC 13.1", type);
21114 if (warn_psabi
21115 && abi_break_gcc_14
21116 && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8))
21117 inform (input_location, "parameter passing for argument of type "
21118 "%qT changed in GCC 14.1", type);
21120 if (align > 8)
21122 if (abi_break_gcc_9 && warn_psabi)
21123 inform (input_location, "parameter passing for argument of type "
21124 "%qT changed in GCC 9.1", type);
21125 dw_align = true;
21128 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21129 && size < UNITS_PER_WORD)
21131 adjust = UNITS_PER_WORD - size;
21135 /* Get a local temporary for the field value. */
21136 off = get_initialized_tmp_var (f_off, pre_p, NULL);
21138 /* Emit code to branch if off >= 0. */
21139 t = build2 (GE_EXPR, boolean_type_node, off,
21140 build_int_cst (TREE_TYPE (off), 0));
21141 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
21143 if (dw_align)
21145 /* Emit: offs = (offs + 15) & -16. */
21146 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21147 build_int_cst (TREE_TYPE (off), 15));
21148 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
21149 build_int_cst (TREE_TYPE (off), -16));
21150 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
21152 else
21153 roundup = NULL;
21155 /* Update ap.__[g|v]r_offs */
21156 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21157 build_int_cst (TREE_TYPE (off), rsize));
21158 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
21160 /* String up. */
21161 if (roundup)
21162 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21164 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21165 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
21166 build_int_cst (TREE_TYPE (f_off), 0));
21167 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
21169 /* String up: make sure the assignment happens before the use. */
21170 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
21171 COND_EXPR_ELSE (cond1) = t;
21173 /* Prepare the trees handling the argument that is passed on the stack;
21174     the top-level node will be stored in ON_STACK. */
21175 arg = get_initialized_tmp_var (stack, pre_p, NULL);
21176 if (align > 8)
21178 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21179 t = fold_build_pointer_plus_hwi (arg, 15);
21180 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21181 build_int_cst (TREE_TYPE (t), -16));
21182 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
21184 else
21185 roundup = NULL;
21186 /* Advance ap.__stack */
21187 t = fold_build_pointer_plus_hwi (arg, size + 7);
21188 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21189 build_int_cst (TREE_TYPE (t), -8));
21190 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
21191 /* String up roundup and advance. */
21192 if (roundup)
21193 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21194 /* String up with arg */
21195 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
21196 /* Big-endianness related address adjustment. */
21197 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21198 && size < UNITS_PER_WORD)
21200 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
21201 size_int (UNITS_PER_WORD - size));
21202 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
21205 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
21206 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
21208 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21209 t = off;
21210 if (adjust)
21211 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
21212 build_int_cst (TREE_TYPE (off), adjust));
21214 t = fold_convert (sizetype, t);
21215 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
21217 if (is_ha)
21219 /* type ha; // treat as "struct {ftype field[n];}"
21220 ... [computing offs]
21221 for (i = 0; i <nregs; ++i, offs += 16)
21222 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21223 return ha; */
21224 int i;
21225 tree tmp_ha, field_t, field_ptr_t;
21227 /* Declare a local variable. */
21228 tmp_ha = create_tmp_var_raw (type, "ha");
21229 gimple_add_tmp_var (tmp_ha);
21231 /* Establish the base type. */
21232 switch (ag_mode)
21234 case E_SFmode:
21235 field_t = float_type_node;
21236 field_ptr_t = float_ptr_type_node;
21237 break;
21238 case E_DFmode:
21239 field_t = double_type_node;
21240 field_ptr_t = double_ptr_type_node;
21241 break;
21242 case E_TFmode:
21243 field_t = long_double_type_node;
21244 field_ptr_t = long_double_ptr_type_node;
21245 break;
21246 case E_SDmode:
21247 field_t = dfloat32_type_node;
21248 field_ptr_t = build_pointer_type (dfloat32_type_node);
21249 break;
21250 case E_DDmode:
21251 field_t = dfloat64_type_node;
21252 field_ptr_t = build_pointer_type (dfloat64_type_node);
21253 break;
21254 case E_TDmode:
21255 field_t = dfloat128_type_node;
21256 field_ptr_t = build_pointer_type (dfloat128_type_node);
21257 break;
21258 case E_HFmode:
21259 field_t = aarch64_fp16_type_node;
21260 field_ptr_t = aarch64_fp16_ptr_type_node;
21261 break;
21262 case E_BFmode:
21263 field_t = bfloat16_type_node;
21264 field_ptr_t = aarch64_bf16_ptr_type_node;
21265 break;
21266 case E_V2SImode:
21267 case E_V4SImode:
21269 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
21270 field_t = build_vector_type_for_mode (innertype, ag_mode);
21271 field_ptr_t = build_pointer_type (field_t);
21273 break;
21274 default:
21275 gcc_assert (0);
21278      /* *((field_ptr_t)&ha) = *((field_ptr_t)vr_saved_area) */
21279 TREE_ADDRESSABLE (tmp_ha) = 1;
21280 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
21281 addr = t;
21282 t = fold_convert (field_ptr_t, addr);
21283 t = build2 (MODIFY_EXPR, field_t,
21284 build1 (INDIRECT_REF, field_t, tmp_ha),
21285 build1 (INDIRECT_REF, field_t, t));
21287 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21288 for (i = 1; i < nregs; ++i)
21290 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
21291 u = fold_convert (field_ptr_t, addr);
21292 u = build2 (MODIFY_EXPR, field_t,
21293 build2 (MEM_REF, field_t, tmp_ha,
21294 build_int_cst (field_ptr_t,
21295 (i *
21296 int_size_in_bytes (field_t)))),
21297 build1 (INDIRECT_REF, field_t, u));
21298 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
21301 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
21302 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
21305 COND_EXPR_ELSE (cond2) = t;
21306 addr = fold_convert (build_pointer_type (type), cond1);
21307 addr = build_va_arg_indirect_ref (addr);
21309 if (indirect_p)
21310 addr = build_va_arg_indirect_ref (addr);
21312 return addr;
21315 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21317 static void
21318 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
21319 const function_arg_info &arg,
21320 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
21322 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
21323 CUMULATIVE_ARGS local_cum;
21324 int gr_saved = cfun->va_list_gpr_size;
21325 int vr_saved = cfun->va_list_fpr_size;
21327 /* The caller has advanced CUM up to, but not beyond, the last named
21328 argument. Advance a local copy of CUM past the last "real" named
21329 argument, to find out how many registers are left over. */
21330 local_cum = *cum;
21331 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
21332 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
21334 /* Find out how many registers we need to save.
21335 Honor tree-stdarg analysis results. */
21336 if (cfun->va_list_gpr_size)
21337 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
21338 cfun->va_list_gpr_size / UNITS_PER_WORD);
21339 if (cfun->va_list_fpr_size)
21340 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
21341 cfun->va_list_fpr_size / UNITS_PER_VREG);
21343 if (!TARGET_FLOAT)
21345 gcc_assert (local_cum.aapcs_nvrn == 0);
21346 vr_saved = 0;
21349 if (!no_rtl)
21351 if (gr_saved > 0)
21353 rtx ptr, mem;
21355 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21356 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
21357 - gr_saved * UNITS_PER_WORD);
21358 mem = gen_frame_mem (BLKmode, ptr);
21359 set_mem_alias_set (mem, get_varargs_alias_set ());
21361 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
21362 mem, gr_saved);
21364 if (vr_saved > 0)
21366 /* We can't use move_block_from_reg, because it will use
21367 the wrong mode, storing D regs only. */
21368 machine_mode mode = TImode;
21369 int off, i, vr_start;
21371 /* Set OFF to the offset from virtual_incoming_args_rtx of
21372 the first vector register. The VR save area lies below
21373 the GR one, and is aligned to 16 bytes. */
21374 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
21375 STACK_BOUNDARY / BITS_PER_UNIT);
21376 off -= vr_saved * UNITS_PER_VREG;
21378 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
21379 for (i = 0; i < vr_saved; ++i)
21381 rtx ptr, mem;
21383 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
21384 mem = gen_frame_mem (mode, ptr);
21385 set_mem_alias_set (mem, get_varargs_alias_set ());
21386 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
21387 off += UNITS_PER_VREG;
21392 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21393 the complication of having crtl->args.pretend_args_size change. */
21394 cfun->machine->frame.saved_varargs_size
21395 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
21396 STACK_BOUNDARY / BITS_PER_UNIT)
21397 + vr_saved * UNITS_PER_VREG);
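/* As a worked illustration with hypothetical counts: if three GP argument
   registers and two FP/SIMD argument registers remain unnamed, then
   GR_SAVED == 3 and VR_SAVED == 2.  With the usual AArch64 values
   (UNITS_PER_WORD == 8, UNITS_PER_VREG == 16, 16-byte STACK_BOUNDARY)
   the GR save area takes ROUND_UP (3 * 8, 16) == 32 bytes, the VR save
   area takes 2 * 16 == 32 bytes, and saved_varargs_size is 64.  */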
21400 static void
21401 aarch64_conditional_register_usage (void)
21403 int i;
21404 if (!TARGET_FLOAT)
21406 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
21408 fixed_regs[i] = 1;
21409 call_used_regs[i] = 1;
21410 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21413 if (!TARGET_SVE)
21414 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
21416 fixed_regs[i] = 1;
21417 call_used_regs[i] = 1;
21420 /* Only allow these registers to be accessed via special patterns. */
21421 CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
21422 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
21423 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
21424 for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
21425 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21427 /* When tracking speculation, we need a couple of call-clobbered registers
21428 to track the speculation state. It would be nice to just use
21429 IP0 and IP1, but currently there are numerous places that just
21430 assume these registers are free for other uses (e.g. pointer
21431 authentication). */
21432 if (aarch64_track_speculation)
21434 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
21435 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
21436 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21437 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21441 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21443 bool
21444 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
21446 /* For records we're passed a FIELD_DECL, for arrays we're passed
21447 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21448 const_tree type = TREE_TYPE (field_or_array);
21450 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21451 For structures, the "multiple" case is indicated by MODE being
21452 VOIDmode. */
21453 unsigned int num_zr, num_pr;
21454 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
21456 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
21457 return !simple_cst_equal (TYPE_SIZE (field_or_array),
21458 TYPE_SIZE (type));
21459 return mode == VOIDmode;
21462 return default_member_type_forces_blk (field_or_array, mode);
21465 /* Bitmasks that indicate whether earlier versions of GCC would have
21466 taken a different path through the ABI logic. This should result in
21467 a -Wpsabi warning if the earlier path led to a different ABI decision.
21469 WARN_PSABI_EMPTY_CXX17_BASE
21470 Indicates that the type includes an artificial empty C++17 base field
21471 that, prior to GCC 10.1, would prevent the type from being treated as
21472 a HFA or HVA. See PR94383 for details.
21474 WARN_PSABI_NO_UNIQUE_ADDRESS
21475 Indicates that the type includes an empty [[no_unique_address]] field
21476 that, prior to GCC 10.1, would prevent the type from being treated as
21477 a HFA or HVA. */
21478 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
21479 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
21480 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
21482 /* Walk down the type tree of TYPE counting consecutive base elements.
21483 If *MODEP is VOIDmode, then set it to the first valid floating point
21484 type. If a non-floating point type is found, or if a floating point
21485 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21486 otherwise return the count in the sub-tree.
21488 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21489 function has changed its behavior relative to earlier versions of GCC.
21490 Normally the argument should be nonnull and point to a zero-initialized
21491 variable. The function then records whether the ABI decision might
21492 be affected by a known fix to the ABI logic, setting the associated
21493 WARN_PSABI_* bits if so.
21495 When the argument is instead a null pointer, the function tries to
21496 simulate the behavior of GCC before all such ABI fixes were made.
21497 This is useful to check whether the function returns something
21498 different after the ABI fixes. */
21499 static int
21500 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
21501 unsigned int *warn_psabi_flags)
21503 machine_mode mode;
21504 HOST_WIDE_INT size;
21506 if (aarch64_sve::builtin_type_p (type))
21507 return -1;
21509 switch (TREE_CODE (type))
21511 case REAL_TYPE:
21512 mode = TYPE_MODE (type);
21513 if (mode != DFmode && mode != SFmode
21514 && mode != TFmode && mode != HFmode
21515 && mode != SDmode && mode != DDmode && mode != TDmode)
21516 return -1;
21518 if (*modep == VOIDmode)
21519 *modep = mode;
21521 if (*modep == mode)
21522 return 1;
21524 break;
21526 case COMPLEX_TYPE:
21527 mode = TYPE_MODE (TREE_TYPE (type));
21528 if (mode != DFmode && mode != SFmode
21529 && mode != TFmode && mode != HFmode)
21530 return -1;
21532 if (*modep == VOIDmode)
21533 *modep = mode;
21535 if (*modep == mode)
21536 return 2;
21538 break;
21540 case VECTOR_TYPE:
21541 /* Use V2SImode and V4SImode as representatives of all 64-bit
21542 and 128-bit vector types. */
21543 size = int_size_in_bytes (type);
21544 switch (size)
21546 case 8:
21547 mode = V2SImode;
21548 break;
21549 case 16:
21550 mode = V4SImode;
21551 break;
21552 default:
21553 return -1;
21556 if (*modep == VOIDmode)
21557 *modep = mode;
21559 /* Vector modes are considered to be opaque: two vectors are
21560 equivalent for the purposes of being homogeneous aggregates
21561 if they are the same size. */
21562 if (*modep == mode)
21563 return 1;
21565 break;
21567 case ARRAY_TYPE:
21569 int count;
21570 tree index = TYPE_DOMAIN (type);
21572 /* Can't handle incomplete types nor sizes that are not
21573 fixed. */
21574 if (!COMPLETE_TYPE_P (type)
21575 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21576 return -1;
21578 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
21579 warn_psabi_flags);
21580 if (count == -1
21581 || !index
21582 || !TYPE_MAX_VALUE (index)
21583 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
21584 || !TYPE_MIN_VALUE (index)
21585 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
21586 || count < 0)
21587 return -1;
21589 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
21590 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
21592 /* There must be no padding. */
21593 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21594 count * GET_MODE_BITSIZE (*modep)))
21595 return -1;
21597 return count;
21600 case RECORD_TYPE:
21602 int count = 0;
21603 int sub_count;
21604 tree field;
21606 /* Can't handle incomplete types nor sizes that are not
21607 fixed. */
21608 if (!COMPLETE_TYPE_P (type)
21609 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21610 return -1;
21612 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21614 if (TREE_CODE (field) != FIELD_DECL)
21615 continue;
21617 if (DECL_FIELD_ABI_IGNORED (field))
21619 /* See whether this is something that earlier versions of
21620 GCC failed to ignore. */
21621 unsigned int flag;
21622 if (lookup_attribute ("no_unique_address",
21623 DECL_ATTRIBUTES (field)))
21624 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
21625 else if (cxx17_empty_base_field_p (field))
21626 flag = WARN_PSABI_EMPTY_CXX17_BASE;
21627 else
21628 /* No compatibility problem. */
21629 continue;
21631 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
21632 if (warn_psabi_flags)
21634 *warn_psabi_flags |= flag;
21635 continue;
21638 /* A zero-width bitfield may affect layout in some
21639 circumstances, but adds no members. The determination
21640 of whether or not a type is an HFA is performed after
21641 layout is complete, so if the type still looks like an
21642 HFA afterwards, it is still classed as one. This is
21643 potentially an ABI break for the hard-float ABI. */
21644 else if (DECL_BIT_FIELD (field)
21645 && integer_zerop (DECL_SIZE (field)))
21647 /* Prior to GCC 12 these fields were stripped early,
21648 hiding them from the back-end entirely and
21649 resulting in the correct behaviour for argument
21650 passing. Simulate that old behaviour without
21651 generating a warning. */
21652 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
21653 continue;
21654 if (warn_psabi_flags)
21656 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
21657 continue;
21661 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21662 warn_psabi_flags);
21663 if (sub_count < 0)
21664 return -1;
21665 count += sub_count;
21668 /* There must be no padding. */
21669 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21670 count * GET_MODE_BITSIZE (*modep)))
21671 return -1;
21673 return count;
21676 case UNION_TYPE:
21677 case QUAL_UNION_TYPE:
21679 /* These aren't very interesting except in a degenerate case. */
21680 int count = 0;
21681 int sub_count;
21682 tree field;
21684 /* Can't handle incomplete types nor sizes that are not
21685 fixed. */
21686 if (!COMPLETE_TYPE_P (type)
21687 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21688 return -1;
21690 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21692 if (TREE_CODE (field) != FIELD_DECL)
21693 continue;
21695 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21696 warn_psabi_flags);
21697 if (sub_count < 0)
21698 return -1;
21699 count = count > sub_count ? count : sub_count;
21702 /* There must be no padding. */
21703 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21704 count * GET_MODE_BITSIZE (*modep)))
21705 return -1;
21707 return count;
21710 default:
21711 break;
21714 return -1;
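/* As an illustrative sketch (hypothetical C type): for

     struct hfa { double a; double b; double c; };

   the RECORD_TYPE case above visits three consecutive REAL_TYPE fields,
   leaves *MODEP as DFmode and returns 3, so the struct is an HFA
   candidate.  Adding an "int" member, or mixing "float" and "double"
   fields, makes one of the recursive calls fail the *MODEP check and the
   whole walk returns -1.  */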
21717 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
21718 type as described in AAPCS64 \S 4.1.2.
21720 See the comment above aarch64_composite_type_p for the notes on MODE. */
21722 static bool
21723 aarch64_short_vector_p (const_tree type,
21724 machine_mode mode)
21726 poly_int64 size = -1;
21728 if (type && VECTOR_TYPE_P (type))
21730 if (aarch64_sve::builtin_type_p (type))
21731 return false;
21732 size = int_size_in_bytes (type);
21734 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
21735 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
21737 /* The containing "else if" is too loose: it means that we look at TYPE
21738 if the type is a vector type (good), but that we otherwise ignore TYPE
21739 and look only at the mode. This is wrong because the type describes
21740 the language-level information whereas the mode is purely an internal
21741 GCC concept. We can therefore reach here for types that are not
21742 vectors in the AAPCS64 sense.
21744 We can't "fix" that for the traditional Advanced SIMD vector modes
21745 without breaking backwards compatibility. However, there's no such
21746 baggage for the structure modes, which were introduced in GCC 12. */
21747 if (aarch64_advsimd_struct_mode_p (mode))
21748 return false;
21750 /* For similar reasons, rely only on the type, not the mode, when
21751 processing SVE types. */
21752 if (type && aarch64_some_values_include_pst_objects_p (type))
21753 /* Leave later code to report an error if SVE is disabled. */
21754 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
21755 else
21756 size = GET_MODE_SIZE (mode);
21758 if (known_eq (size, 8) || known_eq (size, 16))
21760 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
21761 they are being treated as scalable AAPCS64 types. */
21762 gcc_assert (!aarch64_sve_mode_p (mode)
21763 && !aarch64_advsimd_struct_mode_p (mode));
21764 return true;
21766 return false;
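/* Illustrative example (hypothetical typedef): a GNU vector type such as

     typedef int v4si __attribute__ ((vector_size (16)));

   has a 16-byte size and mode V4SImode, so it counts as a short vector
   here, whereas an SVE ACLE type such as svint32_t is rejected by the
   builtin_type_p check above even though its mode is also a vector
   mode.  */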
21769 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
21770 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
21771 array types. The C99 floating-point complex types are also considered
21772 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
21773 types, which are GCC extensions and out of the scope of AAPCS64, are
21774 treated as composite types here as well.
21776 Note that MODE itself is not sufficient in determining whether a type
21777 is such a composite type or not. This is because
21778 stor-layout.cc:compute_record_mode may have already changed the MODE
21779 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
21780 structure with only one field may have its MODE set to the mode of the
21781 field. Also an integer mode whose size matches the size of the
21782 RECORD_TYPE type may be used to substitute the original mode
21783 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
21784 solely relied on. */
21786 static bool
21787 aarch64_composite_type_p (const_tree type,
21788 machine_mode mode)
21790 if (aarch64_short_vector_p (type, mode))
21791 return false;
21793 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
21794 return true;
21796 if (mode == BLKmode
21797 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
21798 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
21799 return true;
21801 return false;
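/* For instance (hypothetical types): "struct s { float f; }" may be given
   SFmode by compute_record_mode, but it is still AGGREGATE_TYPE_P and so
   composite here, while a plain "float" argument with the same SFmode is
   not.  This is why the TYPE check comes before the fallback on MODE.  */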
21804 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
21805 shall be passed or returned in simd/fp register(s) (providing these
21806 parameter passing registers are available).
21808 Upon successful return, *COUNT returns the number of needed registers,
21809 *BASE_MODE returns the mode of the individual register and when IS_HA
21810 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
21811 floating-point aggregate or a homogeneous short-vector aggregate.
21813 SILENT_P is true if the function should refrain from reporting any
21814 diagnostics. This should only be used if the caller is certain that
21815 any ABI decisions would eventually come through this function with
21816 SILENT_P set to false. */
21818 static bool
21819 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
21820 const_tree type,
21821 machine_mode *base_mode,
21822 int *count,
21823 bool *is_ha,
21824 bool silent_p)
21826 if (is_ha != NULL) *is_ha = false;
21828 machine_mode new_mode = VOIDmode;
21829 bool composite_p = aarch64_composite_type_p (type, mode);
21831 if ((!composite_p
21832 && (GET_MODE_CLASS (mode) == MODE_FLOAT
21833 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
21834 || aarch64_short_vector_p (type, mode))
21836 *count = 1;
21837 new_mode = mode;
21839 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
21841 if (is_ha != NULL) *is_ha = true;
21842 *count = 2;
21843 new_mode = GET_MODE_INNER (mode);
21845 else if (type && composite_p)
21847 unsigned int warn_psabi_flags = 0;
21848 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
21849 &warn_psabi_flags);
21850 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
21852 static unsigned last_reported_type_uid;
21853 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
21854 int alt;
21855 if (!silent_p
21856 && warn_psabi
21857 && warn_psabi_flags
21858 && uid != last_reported_type_uid
21859 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
21860 != ag_count))
21862 const char *url10
21863 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
21864 const char *url12
21865 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
21866 gcc_assert (alt == -1);
21867 last_reported_type_uid = uid;
21868 /* Use TYPE_MAIN_VARIANT to strip any redundant const
21869 qualification. */
21870 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
21871 inform (input_location, "parameter passing for argument of "
21872 "type %qT with %<[[no_unique_address]]%> members "
21873 "changed %{in GCC 10.1%}",
21874 TYPE_MAIN_VARIANT (type), url10);
21875 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
21876 inform (input_location, "parameter passing for argument of "
21877 "type %qT when C++17 is enabled changed to match "
21878 "C++14 %{in GCC 10.1%}",
21879 TYPE_MAIN_VARIANT (type), url10);
21880 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
21881 inform (input_location, "parameter passing for argument of "
21882 "type %qT changed %{in GCC 12.1%}",
21883 TYPE_MAIN_VARIANT (type), url12);
21886 if (is_ha != NULL) *is_ha = true;
21887 *count = ag_count;
21889 else
21890 return false;
21892 else
21893 return false;
21895 gcc_assert (!aarch64_sve_mode_p (new_mode));
21896 *base_mode = new_mode;
21897 return true;
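/* A minimal usage sketch (the real callers are elsewhere in this file):

     machine_mode base_mode;
     int count;
     bool is_ha;
     if (aarch64_vfp_is_call_or_return_candidate (mode, type, &base_mode,
                                                  &count, &is_ha, false))
       ... allocate COUNT consecutive FP/SIMD registers of BASE_MODE ...

   For the HFA "struct { double a, b; }" this gives BASE_MODE == DFmode,
   COUNT == 2 and *IS_HA set to true.  */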
21900 /* Implement TARGET_STRUCT_VALUE_RTX. */
21902 static rtx
21903 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
21904 int incoming ATTRIBUTE_UNUSED)
21906 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
21909 /* Implements target hook vector_mode_supported_p. */
21910 static bool
21911 aarch64_vector_mode_supported_p (machine_mode mode)
21913 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21914 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
21917 /* Implements target hook vector_mode_supported_any_target_p. */
21918 static bool
21919 aarch64_vector_mode_supported_any_target_p (machine_mode mode)
21921 unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
21922 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
21925 /* Return the full-width SVE vector mode for element mode MODE, if one
21926 exists. */
21927 opt_machine_mode
21928 aarch64_full_sve_mode (scalar_mode mode)
21930 switch (mode)
21932 case E_DFmode:
21933 return VNx2DFmode;
21934 case E_SFmode:
21935 return VNx4SFmode;
21936 case E_HFmode:
21937 return VNx8HFmode;
21938 case E_BFmode:
21939 return VNx8BFmode;
21940 case E_DImode:
21941 return VNx2DImode;
21942 case E_SImode:
21943 return VNx4SImode;
21944 case E_HImode:
21945 return VNx8HImode;
21946 case E_QImode:
21947 return VNx16QImode;
21948 default:
21949 return opt_machine_mode ();
21953 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
21954 if it exists. */
21955 opt_machine_mode
21956 aarch64_vq_mode (scalar_mode mode)
21958 switch (mode)
21960 case E_DFmode:
21961 return V2DFmode;
21962 case E_SFmode:
21963 return V4SFmode;
21964 case E_HFmode:
21965 return V8HFmode;
21966 case E_BFmode:
21967 return V8BFmode;
21968 case E_SImode:
21969 return V4SImode;
21970 case E_HImode:
21971 return V8HImode;
21972 case E_QImode:
21973 return V16QImode;
21974 case E_DImode:
21975 return V2DImode;
21976 default:
21977 return opt_machine_mode ();
21981 /* Return appropriate SIMD container
21982 for MODE within a vector of WIDTH bits. */
21983 static machine_mode
21984 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
21986 if (TARGET_SVE
21987 && maybe_ne (width, 128)
21988 && known_eq (width, BITS_PER_SVE_VECTOR))
21989 return aarch64_full_sve_mode (mode).else_mode (word_mode);
21991 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
21992 if (TARGET_BASE_SIMD)
21994 if (known_eq (width, 128))
21995 return aarch64_vq_mode (mode).else_mode (word_mode);
21996 else
21997 switch (mode)
21999 case E_SFmode:
22000 return V2SFmode;
22001 case E_HFmode:
22002 return V4HFmode;
22003 case E_BFmode:
22004 return V4BFmode;
22005 case E_SImode:
22006 return V2SImode;
22007 case E_HImode:
22008 return V4HImode;
22009 case E_QImode:
22010 return V8QImode;
22011 default:
22012 break;
22015 return word_mode;
22018 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22019 and return whether the SVE mode should be preferred over the
22020 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22021 static bool
22022 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
22024 /* Take into account the aarch64-autovec-preference param if non-zero. */
22025 bool only_asimd_p = aarch64_autovec_preference == 1;
22026 bool only_sve_p = aarch64_autovec_preference == 2;
22028 if (only_asimd_p)
22029 return false;
22030 if (only_sve_p)
22031 return true;
22033 /* The preference in case of a tie in costs. */
22034 bool prefer_asimd = aarch64_autovec_preference == 3;
22035 bool prefer_sve = aarch64_autovec_preference == 4;
22037 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
22038 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
22039 /* If the CPU information does not have an SVE width registered use the
22040 generic poly_int comparison that prefers SVE. If a preference is
22041 explicitly requested avoid this path. */
22042 if (aarch64_tune_params.sve_width == SVE_SCALABLE
22043 && !prefer_asimd
22044 && !prefer_sve)
22045 return maybe_gt (nunits_sve, nunits_asimd);
22047 /* Otherwise estimate the runtime width of the modes involved. */
22048 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
22049 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
22051 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22052 is clearly wider. */
22053 if (prefer_sve)
22054 return est_sve >= est_asimd;
22055 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22056 is clearly wider. */
22057 if (prefer_asimd)
22058 return est_sve > est_asimd;
22060 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22061 return est_sve > est_asimd;
22064 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22065 static machine_mode
22066 aarch64_preferred_simd_mode (scalar_mode mode)
22068 /* Take into account explicit auto-vectorization ISA preferences through
22069 aarch64_cmp_autovec_modes. */
22070 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
22071 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22072 if (TARGET_SIMD)
22073 return aarch64_vq_mode (mode).else_mode (word_mode);
22074 return word_mode;
22077 /* Return a list of possible vector sizes for the vectorizer
22078 to iterate over. */
22079 static unsigned int
22080 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
22082 static const machine_mode sve_modes[] = {
22083 /* Try using full vectors for all element types. */
22084 VNx16QImode,
22086 /* Try using 16-bit containers for 8-bit elements and full vectors
22087 for wider elements. */
22088 VNx8QImode,
22090 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22091 full vectors for wider elements. */
22092 VNx4QImode,
22094 /* Try using 64-bit containers for all element types. */
22095 VNx2QImode
22098 static const machine_mode advsimd_modes[] = {
22099 /* Try using 128-bit vectors for all element types. */
22100 V16QImode,
22102 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22103 for wider elements. */
22104 V8QImode,
22106 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22107 for wider elements.
22109 TODO: We could support a limited form of V4QImode too, so that
22110 we use 32-bit vectors for 8-bit elements. */
22111 V4HImode,
22113 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22114 for 64-bit elements.
22116 TODO: We could similarly support limited forms of V2QImode and V2HImode
22117 for this case. */
22118 V2SImode
22121 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22122 This is because:
22124 - If we can't use N-byte Advanced SIMD vectors then the placement
22125 doesn't matter; we'll just continue as though the Advanced SIMD
22126 entry didn't exist.
22128 - If an SVE main loop with N bytes ends up being cheaper than an
22129 Advanced SIMD main loop with N bytes then by default we'll replace
22130 the Advanced SIMD version with the SVE one.
22132 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22133 than an SVE main loop with N bytes then by default we'll try to
22134 use the SVE loop to vectorize the epilogue instead. */
22136 bool only_asimd_p = aarch64_autovec_preference == 1;
22137 bool only_sve_p = aarch64_autovec_preference == 2;
22139 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
22140 unsigned int advsimd_i = 0;
22142 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
22144 if (sve_i < ARRAY_SIZE (sve_modes)
22145 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
22146 advsimd_modes[advsimd_i]))
22147 modes->safe_push (sve_modes[sve_i++]);
22148 else
22149 modes->safe_push (advsimd_modes[advsimd_i++]);
22151 while (sve_i < ARRAY_SIZE (sve_modes))
22152 modes->safe_push (sve_modes[sve_i++]);
22154 unsigned int flags = 0;
22155 if (aarch64_vect_compare_costs)
22156 flags |= VECT_COMPARE_COSTS;
22157 return flags;
22160 /* Implement TARGET_MANGLE_TYPE. */
22162 static const char *
22163 aarch64_mangle_type (const_tree type)
22165 /* The AArch64 ABI documents say that "__va_list" has to be
22166 mangled as if it is in the "std" namespace. */
22167 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
22168 return "St9__va_list";
22170 /* Half-precision floating point types. */
22171 if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
22173 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
22174 return NULL;
22175 if (TYPE_MODE (type) == BFmode)
22176 return "u6__bf16";
22177 else
22178 return "Dh";
22181 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22182 builtin types. */
22183 if (TYPE_NAME (type) != NULL)
22185 const char *res;
22186 if ((res = aarch64_general_mangle_builtin_type (type))
22187 || (res = aarch64_sve::mangle_builtin_type (type)))
22188 return res;
22191 /* Use the default mangling. */
22192 return NULL;
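/* For example (illustrative): __fp16 has 16-bit precision but is neither
   float16_type_node nor of BFmode, so it mangles as "Dh"; __bf16 mangles
   as "u6__bf16"; _Float16 (float16_type_node) falls through to the
   default mangling; and __builtin_va_list mangles as "St9__va_list",
   i.e. as if it were std::__va_list.  */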
22195 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22197 static bool
22198 aarch64_verify_type_context (location_t loc, type_context_kind context,
22199 const_tree type, bool silent_p)
22201 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
22204 /* Find the first rtx_insn before insn that will generate an assembly
22205 instruction. */
22207 static rtx_insn *
22208 aarch64_prev_real_insn (rtx_insn *insn)
22210 if (!insn)
22211 return NULL;
22215 insn = prev_real_insn (insn);
22217 while (insn && recog_memoized (insn) < 0);
22219 return insn;
22222 static bool
22223 is_madd_op (enum attr_type t1)
22225 unsigned int i;
22226 /* A number of these may be AArch32 only. */
22227 enum attr_type mlatypes[] = {
22228 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
22229 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
22230 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
22233 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
22235 if (t1 == mlatypes[i])
22236 return true;
22239 return false;
22242 /* Check if there is a register dependency between a load and the insn
22243 for which we hold recog_data. */
22245 static bool
22246 dep_between_memop_and_curr (rtx memop)
22248 rtx load_reg;
22249 int opno;
22251 gcc_assert (GET_CODE (memop) == SET);
22253 if (!REG_P (SET_DEST (memop)))
22254 return false;
22256 load_reg = SET_DEST (memop);
22257 for (opno = 1; opno < recog_data.n_operands; opno++)
22259 rtx operand = recog_data.operand[opno];
22260 if (REG_P (operand)
22261 && reg_overlap_mentioned_p (load_reg, operand))
22262 return true;
22265 return false;
22269 /* When working around the Cortex-A53 erratum 835769,
22270 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22271 instruction and has a preceding memory instruction such that a NOP
22272 should be inserted between them. */
22274 bool
22275 aarch64_madd_needs_nop (rtx_insn* insn)
22277 enum attr_type attr_type;
22278 rtx_insn *prev;
22279 rtx body;
22281 if (!TARGET_FIX_ERR_A53_835769)
22282 return false;
22284 if (!INSN_P (insn) || recog_memoized (insn) < 0)
22285 return false;
22287 attr_type = get_attr_type (insn);
22288 if (!is_madd_op (attr_type))
22289 return false;
22291 prev = aarch64_prev_real_insn (insn);
22292 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22293 Restore recog state to INSN to avoid state corruption. */
22294 extract_constrain_insn_cached (insn);
22296 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
22297 return false;
22299 body = single_set (prev);
22301 /* If the previous insn is a memory op and there is no dependency between
22302 it and the DImode madd, emit a NOP between them. If body is NULL then we
22303 have a complex memory operation, probably a load/store pair.
22304 Be conservative for now and emit a NOP. */
22305 if (GET_MODE (recog_data.operand[0]) == DImode
22306 && (!body || !dep_between_memop_and_curr (body)))
22307 return true;
22309 return false;
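/* Illustrative sketch (registers chosen arbitrarily): with
   -mfix-cortex-a53-835769 enabled, a sequence such as

     ldr  x1, [x2]
     madd x3, x4, x5, x6

   has no register dependency between the load and the 64-bit
   multiply-accumulate, so this function returns true and the
   FINAL_PRESCAN_INSN hook below emits a NOP between the two
   instructions.  */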
22314 /* Implement FINAL_PRESCAN_INSN. */
22316 void
22317 aarch64_final_prescan_insn (rtx_insn *insn)
22319 if (aarch64_madd_needs_nop (insn))
22320 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
22324 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22325 instruction. */
22327 bool
22328 aarch64_sve_index_immediate_p (rtx base_or_step)
22330 return (CONST_INT_P (base_or_step)
22331 && IN_RANGE (INTVAL (base_or_step), -16, 15));
22334 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22335 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22337 bool
22338 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
22340 rtx elt = unwrap_const_vec_duplicate (x);
22341 if (!CONST_INT_P (elt))
22342 return false;
22344 HOST_WIDE_INT val = INTVAL (elt);
22345 if (negate_p)
22346 val = -val;
22347 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
22349 if (val & 0xff)
22350 return IN_RANGE (val, 0, 0xff);
22351 return IN_RANGE (val, 0, 0xff00);
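/* Worked example (arbitrary values): after masking, 0x1200 is accepted
   because its low byte is clear and it is no larger than 0xff00 (it can
   be encoded as 0x12 shifted left by 8), while 0x101 is rejected because
   its low byte is set and the value exceeds 0xff.  */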
22354 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22355 instructions when applied to mode MODE. Negate X first if NEGATE_P
22356 is true. */
22358 bool
22359 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
22361 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
22362 return false;
22364 /* After the optional negation, the immediate must be nonnegative.
22365 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22366 instead of SQADD Zn.B, Zn.B, #129. */
22367 rtx elt = unwrap_const_vec_duplicate (x);
22368 return negate_p == (INTVAL (elt) < 0);
22371 /* Return true if X is a valid immediate operand for an SVE logical
22372 instruction such as AND. */
22374 bool
22375 aarch64_sve_bitmask_immediate_p (rtx x)
22377 rtx elt;
22379 return (const_vec_duplicate_p (x, &elt)
22380 && CONST_INT_P (elt)
22381 && aarch64_bitmask_imm (INTVAL (elt),
22382 GET_MODE_INNER (GET_MODE (x))));
22385 /* Return true if X is a valid immediate for the SVE DUP and CPY
22386 instructions. */
22388 bool
22389 aarch64_sve_dup_immediate_p (rtx x)
22391 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
22392 if (!CONST_INT_P (x))
22393 return false;
22395 HOST_WIDE_INT val = INTVAL (x);
22396 if (val & 0xff)
22397 return IN_RANGE (val, -0x80, 0x7f);
22398 return IN_RANGE (val, -0x8000, 0x7f00);
22401 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22402 SIGNED_P says whether the operand is signed rather than unsigned. */
22404 bool
22405 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
22407 x = unwrap_const_vec_duplicate (x);
22408 return (CONST_INT_P (x)
22409 && (signed_p
22410 ? IN_RANGE (INTVAL (x), -16, 15)
22411 : IN_RANGE (INTVAL (x), 0, 127)));
22414 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22415 instruction. Negate X first if NEGATE_P is true. */
22417 bool
22418 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
22420 rtx elt;
22421 REAL_VALUE_TYPE r;
22423 if (!const_vec_duplicate_p (x, &elt)
22424 || !CONST_DOUBLE_P (elt))
22425 return false;
22427 r = *CONST_DOUBLE_REAL_VALUE (elt);
22429 if (negate_p)
22430 r = real_value_negate (&r);
22432 if (real_equal (&r, &dconst1))
22433 return true;
22434 if (real_equal (&r, &dconsthalf))
22435 return true;
22436 return false;
22439 /* Return true if X is a valid immediate operand for an SVE FMUL
22440 instruction. */
22442 bool
22443 aarch64_sve_float_mul_immediate_p (rtx x)
22445 rtx elt;
22447 return (const_vec_duplicate_p (x, &elt)
22448 && CONST_DOUBLE_P (elt)
22449 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
22450 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
22453 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22454 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22455 is nonnull, use it to describe valid immediates. */
22456 static bool
22457 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
22458 simd_immediate_info *info,
22459 enum simd_immediate_check which,
22460 simd_immediate_info::insn_type insn)
22462 /* Try a 4-byte immediate with LSL. */
22463 for (unsigned int shift = 0; shift < 32; shift += 8)
22464 if ((val32 & (0xff << shift)) == val32)
22466 if (info)
22467 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22468 simd_immediate_info::LSL, shift);
22469 return true;
22472 /* Try a 2-byte immediate with LSL. */
22473 unsigned int imm16 = val32 & 0xffff;
22474 if (imm16 == (val32 >> 16))
22475 for (unsigned int shift = 0; shift < 16; shift += 8)
22476 if ((imm16 & (0xff << shift)) == imm16)
22478 if (info)
22479 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
22480 simd_immediate_info::LSL, shift);
22481 return true;
22484 /* Try a 4-byte immediate with MSL, except for cases that MVN
22485 can handle. */
22486 if (which == AARCH64_CHECK_MOV)
22487 for (unsigned int shift = 8; shift < 24; shift += 8)
22489 unsigned int low = (1 << shift) - 1;
22490 if (((val32 & (0xff << shift)) | low) == val32)
22492 if (info)
22493 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22494 simd_immediate_info::MSL, shift);
22495 return true;
22499 return false;
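/* Worked examples (arbitrary values): VAL32 == 0x00ab0000 is matched by
   the 4-byte loop as 0xab with LSL #16; VAL32 == 0x00ab00ab is matched by
   the 2-byte loop as 0xab with LSL #0; and VAL32 == 0x0000abff is only
   matched by the MSL case, which is tried for AARCH64_CHECK_MOV.  */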
22502 /* Return true if replicating VAL64 is a valid immediate for the
22503 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22504 use it to describe valid immediates. */
22505 static bool
22506 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
22507 simd_immediate_info *info,
22508 enum simd_immediate_check which)
22510 unsigned int val32 = val64 & 0xffffffff;
22511 unsigned int val16 = val64 & 0xffff;
22512 unsigned int val8 = val64 & 0xff;
22514 if (val32 == (val64 >> 32))
22516 if ((which & AARCH64_CHECK_ORR) != 0
22517 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
22518 simd_immediate_info::MOV))
22519 return true;
22521 if ((which & AARCH64_CHECK_BIC) != 0
22522 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
22523 simd_immediate_info::MVN))
22524 return true;
22526 /* Try using a replicated byte. */
22527 if (which == AARCH64_CHECK_MOV
22528 && val16 == (val32 >> 16)
22529 && val8 == (val16 >> 8))
22531 if (info)
22532 *info = simd_immediate_info (QImode, val8);
22533 return true;
22537 /* Try using a bit-to-bytemask. */
22538 if (which == AARCH64_CHECK_MOV)
22540 unsigned int i;
22541 for (i = 0; i < 64; i += 8)
22543 unsigned char byte = (val64 >> i) & 0xff;
22544 if (byte != 0 && byte != 0xff)
22545 break;
22547 if (i == 64)
22549 if (info)
22550 *info = simd_immediate_info (DImode, val64);
22551 return true;
22554 return false;
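/* As a concrete illustration (arbitrary value): VAL64 ==
   0xff0000ffff0000ff is not a shifted 8-bit immediate in either its
   2-byte or 4-byte replicated form, and MVN cannot form it either, but
   every byte is either 0 or 0xff, so it is accepted by the
   bit-to-bytemask case above.  */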
22557 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22558 instruction. If INFO is nonnull, use it to describe valid immediates. */
22560 static bool
22561 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
22562 simd_immediate_info *info)
22564 scalar_int_mode mode = DImode;
22565 unsigned int val32 = val64 & 0xffffffff;
22566 if (val32 == (val64 >> 32))
22568 mode = SImode;
22569 unsigned int val16 = val32 & 0xffff;
22570 if (val16 == (val32 >> 16))
22572 mode = HImode;
22573 unsigned int val8 = val16 & 0xff;
22574 if (val8 == (val16 >> 8))
22575 mode = QImode;
22578 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
22579 if (IN_RANGE (val, -0x80, 0x7f))
22581 /* DUP with no shift. */
22582 if (info)
22583 *info = simd_immediate_info (mode, val);
22584 return true;
22586 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
22588 /* DUP with LSL #8. */
22589 if (info)
22590 *info = simd_immediate_info (mode, val);
22591 return true;
22593 if (aarch64_bitmask_imm (val64, mode))
22595 /* DUPM. */
22596 if (info)
22597 *info = simd_immediate_info (mode, val);
22598 return true;
22600 return false;
22603 /* Return true if X is an UNSPEC_PTRUE constant of the form:
22605 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
22607 where PATTERN is the svpattern as a CONST_INT and where ZERO
22608 is a zero constant of the required PTRUE mode (which can have
22609 fewer elements than X's mode, if zero bits are significant).
22611 If so, and if INFO is nonnull, describe the immediate in INFO. */
22612 bool
22613 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
22615 if (GET_CODE (x) != CONST)
22616 return false;
22618 x = XEXP (x, 0);
22619 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
22620 return false;
22622 if (info)
22624 aarch64_svpattern pattern
22625 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
22626 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
22627 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
22628 *info = simd_immediate_info (int_mode, pattern);
22630 return true;
22633 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
22634 it to describe valid immediates. */
22636 static bool
22637 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
22639 if (aarch64_sve_ptrue_svpattern_p (x, info))
22640 return true;
22642 if (x == CONST0_RTX (GET_MODE (x)))
22644 if (info)
22645 *info = simd_immediate_info (DImode, 0);
22646 return true;
22649 /* Analyze the value as a VNx16BImode. This should be relatively
22650 efficient, since rtx_vector_builder has enough built-in capacity
22651 to store all VLA predicate constants without needing the heap. */
22652 rtx_vector_builder builder;
22653 if (!aarch64_get_sve_pred_bits (builder, x))
22654 return false;
22656 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
22657 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
22659 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
22660 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
22661 if (pattern != AARCH64_NUM_SVPATTERNS)
22663 if (info)
22665 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
22666 *info = simd_immediate_info (int_mode, pattern);
22668 return true;
22671 return false;
22674 /* Return true if OP is a valid SIMD immediate for the operation
22675 described by WHICH. If INFO is nonnull, use it to describe valid
22676 immediates. */
22677 bool
22678 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
22679 enum simd_immediate_check which)
22681 machine_mode mode = GET_MODE (op);
22682 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22683 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
22684 return false;
22686 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
22687 return false;
22689 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
22690 return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
22692 if (vec_flags & VEC_SVE_PRED)
22693 return aarch64_sve_pred_valid_immediate (op, info);
22695 scalar_mode elt_mode = GET_MODE_INNER (mode);
22696 rtx base, step;
22697 unsigned int n_elts;
22698 if (CONST_VECTOR_P (op)
22699 && CONST_VECTOR_DUPLICATE_P (op))
22700 n_elts = CONST_VECTOR_NPATTERNS (op);
22701 else if ((vec_flags & VEC_SVE_DATA)
22702 && const_vec_series_p (op, &base, &step))
22704 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
22705 if (!aarch64_sve_index_immediate_p (base)
22706 || !aarch64_sve_index_immediate_p (step))
22707 return false;
22709 if (info)
22711 /* Get the corresponding container mode. E.g. an INDEX on V2SI
22712 should yield two integer values per 128-bit block, meaning
22713 that we need to treat it in the same way as V2DI and then
22714 ignore the upper 32 bits of each element. */
22715 elt_mode = aarch64_sve_container_int_mode (mode);
22716 *info = simd_immediate_info (elt_mode, base, step);
22718 return true;
22720 else if (CONST_VECTOR_P (op)
22721 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
22722 /* N_ELTS set above. */;
22723 else
22724 return false;
22726 scalar_float_mode elt_float_mode;
22727 if (n_elts == 1
22728 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
22730 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
22731 if (aarch64_float_const_zero_rtx_p (elt)
22732 || aarch64_float_const_representable_p (elt))
22734 if (info)
22735 *info = simd_immediate_info (elt_float_mode, elt);
22736 return true;
22740 /* If all elements in an SVE vector have the same value, we have a free
22741 choice between using the element mode and using the container mode.
22742 Using the element mode means that unused parts of the vector are
22743 duplicates of the used elements, while using the container mode means
22744 that the unused parts are an extension of the used elements. Using the
22745 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
22746 for its container mode VNx4SI while 0x00000101 isn't.
22748 If not all elements in an SVE vector have the same value, we need the
22749 transition from one element to the next to occur at container boundaries.
22750 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
22751 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
22752 scalar_int_mode elt_int_mode;
22753 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
22754 elt_int_mode = aarch64_sve_container_int_mode (mode);
22755 else
22756 elt_int_mode = int_mode_for_mode (elt_mode).require ();
22758 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
22759 if (elt_size > 8)
22760 return false;
22762 /* Expand the vector constant out into a byte vector, with the least
22763 significant byte of the register first. */
22764 auto_vec<unsigned char, 16> bytes;
22765 bytes.reserve (n_elts * elt_size);
22766 for (unsigned int i = 0; i < n_elts; i++)
22768 /* The vector is provided in gcc endian-neutral fashion.
22769 For aarch64_be Advanced SIMD, it must be laid out in the vector
22770 register in reverse order. */
22771 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
22772 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
22774 if (elt_mode != elt_int_mode)
22775 elt = gen_lowpart (elt_int_mode, elt);
22777 if (!CONST_INT_P (elt))
22778 return false;
22780 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
22781 for (unsigned int byte = 0; byte < elt_size; byte++)
22783 bytes.quick_push (elt_val & 0xff);
22784 elt_val >>= BITS_PER_UNIT;
22788 /* The immediate must repeat every eight bytes. */
22789 unsigned int nbytes = bytes.length ();
22790 for (unsigned i = 8; i < nbytes; ++i)
22791 if (bytes[i] != bytes[i - 8])
22792 return false;
22794 /* Get the repeating 8-byte value as an integer. No endian correction
22795 is needed here because bytes is already in lsb-first order. */
22796 unsigned HOST_WIDE_INT val64 = 0;
22797 for (unsigned int i = 0; i < 8; i++)
22798 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
22799 << (i * BITS_PER_UNIT));
22801 if (vec_flags & VEC_SVE_DATA)
22802 return aarch64_sve_valid_immediate (val64, info);
22803 else
22804 return aarch64_advsimd_valid_immediate (val64, info, which);
22807 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
22808 has a step in the range of INDEX. Return the index expression if so,
22809 otherwise return null. */
22811 aarch64_check_zero_based_sve_index_immediate (rtx x)
22813 rtx base, step;
22814 if (const_vec_series_p (x, &base, &step)
22815 && base == const0_rtx
22816 && aarch64_sve_index_immediate_p (step))
22817 return step;
22818 return NULL_RTX;
22821 /* Check that immediate shift constants are within range. */
22822 bool
22823 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
22825 x = unwrap_const_vec_duplicate (x);
22826 if (!CONST_INT_P (x))
22827 return false;
22828 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
22829 if (left)
22830 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
22831 else
22832 return IN_RANGE (INTVAL (x), 1, bit_width);
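/* For example, for V4SImode the unit is 32 bits, so a left-shift
   immediate must lie in [0, 31] while a right-shift immediate must lie
   in [1, 32], matching the ranges accepted by the SHL and SSHR/USHR
   encodings.  */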
22835 /* Return the bitmask CONST_INT to select the bits required by a zero extract
22836 operation of width WIDTH at bit position POS. */
22839 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
22841 gcc_assert (CONST_INT_P (width));
22842 gcc_assert (CONST_INT_P (pos));
22844 unsigned HOST_WIDE_INT mask
22845 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
22846 return GEN_INT (mask << UINTVAL (pos));
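/* Worked example (arbitrary operands): WIDTH == 8 and POS == 16 give
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. a mask selecting bits 16-23.  */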
22849 bool
22850 aarch64_mov_operand_p (rtx x, machine_mode mode)
22852 if (GET_CODE (x) == HIGH
22853 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
22854 return true;
22856 if (CONST_INT_P (x))
22857 return true;
22859 if (VECTOR_MODE_P (GET_MODE (x)))
22861 /* Require predicate constants to be VNx16BI before RA, so that we
22862 force everything to have a canonical form. */
22863 if (!lra_in_progress
22864 && !reload_completed
22865 && aarch64_sve_pred_mode_p (GET_MODE (x))
22866 && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
22867 && GET_MODE (x) != VNx16BImode)
22868 return false;
22870 return aarch64_simd_valid_immediate (x, NULL);
22873 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
22874 x = strip_salt (x);
22876 /* GOT accesses are valid moves. */
22877 if (SYMBOL_REF_P (x)
22878 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
22879 return true;
22881 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
22882 return true;
22884 if (TARGET_SVE
22885 && (aarch64_sve_cnt_immediate_p (x)
22886 || aarch64_sve_rdvl_immediate_p (x)))
22887 return true;
22889 if (aarch64_rdsvl_immediate_p (x))
22890 return true;
22892 return aarch64_classify_symbolic_expression (x)
22893 == SYMBOL_TINY_ABSOLUTE;
22896 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
22897 caches instructions that set up such registers, so that they can be
22898 reused by future calls. */
22900 static rtx
22901 aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
22903 rtx_insn *insn = *cached_insn;
22904 if (insn && INSN_P (insn) && !insn->deleted ())
22906 rtx pat = PATTERN (insn);
22907 if (GET_CODE (pat) == SET)
22909 rtx dest = SET_DEST (pat);
22910 if (REG_P (dest)
22911 && !HARD_REGISTER_P (dest)
22912 && rtx_equal_p (SET_SRC (pat), value))
22913 return dest;
22916 rtx reg = gen_reg_rtx (GET_MODE (value));
22917 *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
22918 function_beg_insn);
22919 return reg;
22922 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
22923 the constant creation. */
22926 aarch64_gen_shareable_zero (machine_mode mode)
22928 rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
22929 CONST0_RTX (V4SImode));
22930 return lowpart_subreg (mode, reg, GET_MODE (reg));
22933 /* INSN is some form of extension or shift that can be split into a
22934 permutation involving a shared zero. Return true if we should
22935 perform such a split.
22937 ??? For now, make sure that the split instruction executes more
22938 frequently than the zero that feeds it. In future it would be good
22939 to split without that restriction and instead recombine shared zeros
22940 if they turn out not to be worthwhile. This would allow splits in
22941 single-block functions and would also cope more naturally with
22942 rematerialization. */
22944 bool
22945 aarch64_split_simd_shift_p (rtx_insn *insn)
22947 return (can_create_pseudo_p ()
22948 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
22949 && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
22950 < BLOCK_FOR_INSN (insn)->count));
22953 /* Return a const_int vector of VAL. */
22955 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
22957 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
22958 return gen_const_vec_duplicate (mode, c);
22961 /* Check OP is a legal scalar immediate for the MOVI instruction. */
22963 bool
22964 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
22966 machine_mode vmode;
22968 vmode = aarch64_simd_container_mode (mode, 64);
22969 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
22970 return aarch64_simd_valid_immediate (op_v, NULL);
22973 /* Construct and return a PARALLEL RTX vector with elements numbering the
22974 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
22975 the vector - from the perspective of the architecture. This does not
22976 line up with GCC's perspective on lane numbers, so we end up with
22977 different masks depending on our target endian-ness. The diagram
22978 below may help. We must draw the distinction when building masks
22979 which select one half of the vector. An instruction selecting
22980 architectural low-lanes for a big-endian target, must be described using
22981 a mask selecting GCC high-lanes.
22983 Big-Endian Little-Endian
22985 GCC 0 1 2 3 3 2 1 0
22986 | x | x | x | x | | x | x | x | x |
22987 Architecture 3 2 1 0 3 2 1 0
22989 Low Mask: { 2, 3 } { 0, 1 }
22990 High Mask: { 0, 1 } { 2, 3 }
22992 MODE Is the mode of the vector and NUNITS is the number of units in it. */
22995 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
22997 rtvec v = rtvec_alloc (nunits / 2);
22998 int high_base = nunits / 2;
22999 int low_base = 0;
23000 int base;
23001 rtx t1;
23002 int i;
23004 if (BYTES_BIG_ENDIAN)
23005 base = high ? low_base : high_base;
23006 else
23007 base = high ? high_base : low_base;
23009 for (i = 0; i < nunits / 2; i++)
23010 RTVEC_ELT (v, i) = GEN_INT (base + i);
23012 t1 = gen_rtx_PARALLEL (mode, v);
23013 return t1;
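/* Concrete illustration for V4SImode with NUNITS == 4 and HIGH == true:
   on little-endian this returns (parallel [2 3]), on big-endian it
   returns (parallel [0 1]), matching the "High Mask" row in the diagram
   above.  */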
23016 /* Check OP for validity as a PARALLEL RTX vector with elements
23017 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
23018 from the perspective of the architecture. See the diagram above
23019 aarch64_simd_vect_par_cnst_half for more details. */
23021 bool
23022 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
23023 bool high)
23025 int nelts;
23026 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
23027 return false;
23029 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
23030 HOST_WIDE_INT count_op = XVECLEN (op, 0);
23031 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
23032 int i = 0;
23034 if (count_op != count_ideal)
23035 return false;
23037 for (i = 0; i < count_ideal; i++)
23039 rtx elt_op = XVECEXP (op, 0, i);
23040 rtx elt_ideal = XVECEXP (ideal, 0, i);
23042 if (!CONST_INT_P (elt_op)
23043 || INTVAL (elt_ideal) != INTVAL (elt_op))
23044 return false;
23046 return true;
23049 /* Return a PARALLEL containing NELTS elements, with element I equal
23050 to BASE + I * STEP. */
23053 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
23055 rtvec vec = rtvec_alloc (nelts);
23056 for (unsigned int i = 0; i < nelts; ++i)
23057 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
23058 return gen_rtx_PARALLEL (VOIDmode, vec);
23061 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23062 series with step STEP. */
23064 bool
23065 aarch64_stepped_int_parallel_p (rtx op, int step)
23067 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
23068 return false;
23070 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
23071 for (int i = 1; i < XVECLEN (op, 0); ++i)
23072 if (!CONST_INT_P (XVECEXP (op, 0, i))
23073 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
23074 return false;
23076 return true;
23079 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23080 sequence of strided registers, with the stride being equal STRIDE.
23081 The operands are already known to be FPRs. */
23082 bool
23083 aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
23084 unsigned int stride)
23086 for (unsigned int i = 1; i < num_operands; ++i)
23087 if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
23088 return false;
23089 return true;
23092 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23093 HIGH (exclusive). */
23094 void
23095 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
23096 const_tree exp)
23098 HOST_WIDE_INT lane;
23099 gcc_assert (CONST_INT_P (operand));
23100 lane = INTVAL (operand);
23102 if (lane < low || lane >= high)
23104 if (exp)
23105 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
23106 lane, low, high - 1);
23107 else
23108 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
23112 /* Perform endian correction on lane number N, which indexes a vector
23113 of mode MODE, and return the result as an SImode rtx. */
23116 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
23118 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
23121 /* Return TRUE if OP is a valid vector addressing mode. */
23123 bool
23124 aarch64_simd_mem_operand_p (rtx op)
23126 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
23127 || REG_P (XEXP (op, 0)));
23130 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23132 bool
23133 aarch64_sve_ld1r_operand_p (rtx op)
23135 struct aarch64_address_info addr;
23136 scalar_mode mode;
23138 return (MEM_P (op)
23139 && is_a <scalar_mode> (GET_MODE (op), &mode)
23140 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
23141 && addr.type == ADDRESS_REG_IMM
23142 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
23145 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23146 where the size of the read data is specified by `mode` and the size of the
23147 vector elements is specified by `elem_mode`. */
23148 bool
23149 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
23150 scalar_mode elem_mode)
23152 struct aarch64_address_info addr;
23153 if (!MEM_P (op)
23154 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
23155 return false;
23157 if (addr.type == ADDRESS_REG_IMM)
23158 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
23160 if (addr.type == ADDRESS_REG_REG)
23161 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
23163 return false;
23166 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23167 bool
23168 aarch64_sve_ld1rq_operand_p (rtx op)
23170 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
23171 GET_MODE_INNER (GET_MODE (op)));
23174 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23175 accessing a vector where the element size is specified by `elem_mode`. */
23176 bool
23177 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
23179 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
23182 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23183 bool
23184 aarch64_sve_ldff1_operand_p (rtx op)
23186 if (!MEM_P (op))
23187 return false;
23189 struct aarch64_address_info addr;
23190 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
23191 return false;
23193 if (addr.type == ADDRESS_REG_IMM)
23194 return known_eq (addr.const_offset, 0);
23196 return addr.type == ADDRESS_REG_REG;
23199 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23200 bool
23201 aarch64_sve_ldnf1_operand_p (rtx op)
23203 struct aarch64_address_info addr;
23205 return (MEM_P (op)
23206 && aarch64_classify_address (&addr, XEXP (op, 0),
23207 GET_MODE (op), false)
23208 && addr.type == ADDRESS_REG_IMM);
23211 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23212 The conditions for STR are the same. */
23213 bool
23214 aarch64_sve_ldr_operand_p (rtx op)
23216 struct aarch64_address_info addr;
23218 return (MEM_P (op)
23219 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
23220 false, ADDR_QUERY_ANY)
23221 && addr.type == ADDRESS_REG_IMM);
23224 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23225 addressing memory of mode MODE. */
23226 bool
23227 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
23229 struct aarch64_address_info addr;
23230 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
23231 return false;
23233 if (addr.type == ADDRESS_REG_IMM)
23234 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
23236 return addr.type == ADDRESS_REG_REG;
23239 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23240 We need to be able to access the individual pieces, so the range
23241 is different from LD[234] and ST[234]. */
23242 bool
23243 aarch64_sve_struct_memory_operand_p (rtx op)
23245 if (!MEM_P (op))
23246 return false;
23248 machine_mode mode = GET_MODE (op);
23249 struct aarch64_address_info addr;
23250 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
23251 ADDR_QUERY_ANY)
23252 || addr.type != ADDRESS_REG_IMM)
23253 return false;
23255 poly_int64 first = addr.const_offset;
23256 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
23257 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
23258 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
23261 /* Return true if OFFSET is a constant integer and if VNUM is
23262 OFFSET * the number of bytes in an SVE vector. This is the requirement
23263 that exists in SME LDR and STR instructions, where the VL offset must
23264 equal the ZA slice offset. */
23265 bool
23266 aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
23268 if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
23269 return false;
23271 if (TARGET_STREAMING)
23273 poly_int64 const_vnum;
23274 return (poly_int_rtx_p (vnum, &const_vnum)
23275 && known_eq (const_vnum,
23276 INTVAL (offset) * BYTES_PER_SVE_VECTOR));
23278 else
23280 HOST_WIDE_INT factor;
23281 return (aarch64_sme_vq_unspec_p (vnum, &factor)
23282 && factor == INTVAL (offset) * 16);
23286 /* Emit a register copy from operand to operand, taking care not to
23287 early-clobber source registers in the process.
23289 COUNT is the number of components into which the copy needs to be
23290 decomposed. */
23291 void
23292 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
23293 unsigned int count)
23295 unsigned int i;
23296 int rdest = REGNO (operands[0]);
23297 int rsrc = REGNO (operands[1]);
23299 if (!reg_overlap_mentioned_p (operands[0], operands[1])
23300 || rdest < rsrc)
23301 for (i = 0; i < count; i++)
23302 emit_move_insn (gen_rtx_REG (mode, rdest + i),
23303 gen_rtx_REG (mode, rsrc + i));
23304 else
23305 for (i = 0; i < count; i++)
23306 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
23307 gen_rtx_REG (mode, rsrc + count - i - 1));
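/* Illustrative note (added for exposition, not part of the original
   sources): suppose COUNT = 2 and the destination list starts one
   register above the source, e.g. copying {V0, V1} to {V1, V2}.  The
   lists overlap and RDEST > RSRC, so the backwards loop above emits
   V2 <- V1 first and then V1 <- V0, reading V1 before it is
   overwritten; a forwards copy would have clobbered V1 first.  */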
23310 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23311 one of VSTRUCT modes: OI, CI, or XI. */
23313 aarch64_simd_attr_length_rglist (machine_mode mode)
23315 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23316 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
23319 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23320 alignment of a vector to 128 bits. SVE predicates have an alignment of
23321 16 bits. */
23322 static HOST_WIDE_INT
23323 aarch64_simd_vector_alignment (const_tree type)
23325 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23326 be set for non-predicate vectors of booleans. Modes are the most
23327 direct way we have of identifying real SVE predicate types. */
23328 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
23329 return 16;
23330 widest_int min_size
23331 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
23332 return wi::umin (min_size, 128).to_uhwi ();
23335 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23336 static poly_uint64
23337 aarch64_vectorize_preferred_vector_alignment (const_tree type)
23339 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
23341 /* If the length of the vector is a fixed power of 2, try to align
23342 to that length, otherwise don't try to align at all. */
23343 HOST_WIDE_INT result;
23344 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
23345 || !pow2p_hwi (result))
23346 result = TYPE_ALIGN (TREE_TYPE (type));
23347 return result;
23349 return TYPE_ALIGN (type);
23352 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23353 static bool
23354 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
23356 if (is_packed)
23357 return false;
23359 /* For fixed-length vectors, check that the vectorizer will aim for
23360 full-vector alignment. This isn't true for generic GCC vectors
23361 that are wider than the ABI maximum of 128 bits. */
23362 poly_uint64 preferred_alignment =
23363 aarch64_vectorize_preferred_vector_alignment (type);
23364 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23365 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
23366 preferred_alignment))
23367 return false;
23369 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23370 return true;
23373 /* Return true if the vector misalignment factor is supported by the
23374 target. */
23375 static bool
23376 aarch64_builtin_support_vector_misalignment (machine_mode mode,
23377 const_tree type, int misalignment,
23378 bool is_packed)
23380 if (TARGET_SIMD && STRICT_ALIGNMENT)
23382 /* Return false if the movmisalign pattern is not supported for this mode. */
23383 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
23384 return false;
23386 /* Misalignment factor is unknown at compile time. */
23387 if (misalignment == -1)
23388 return false;
23390 return default_builtin_support_vector_misalignment (mode, type, misalignment,
23391 is_packed);
23394 /* If VALS is a vector constant that can be loaded into a register
23395 using DUP, generate instructions to do so and return an RTX to
23396 assign to the register. Otherwise return NULL_RTX. */
23397 static rtx
23398 aarch64_simd_dup_constant (rtx vals)
23400 machine_mode mode = GET_MODE (vals);
23401 machine_mode inner_mode = GET_MODE_INNER (mode);
23402 rtx x;
23404 if (!const_vec_duplicate_p (vals, &x))
23405 return NULL_RTX;
23407 /* We can load this constant by using DUP and a constant in a
23408 single ARM register. This will be cheaper than a vector
23409 load. */
23410 x = force_reg (inner_mode, x);
23411 return gen_vec_duplicate (mode, x);
23415 /* Generate code to load VALS, which is a PARALLEL containing only
23416 constants (for vec_init) or CONST_VECTOR, efficiently into a
23417 register. Returns an RTX to copy into the register, or NULL_RTX
23418 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23419 static rtx
23420 aarch64_simd_make_constant (rtx vals)
23422 machine_mode mode = GET_MODE (vals);
23423 rtx const_dup;
23424 rtx const_vec = NULL_RTX;
23425 int n_const = 0;
23426 int i;
23428 if (CONST_VECTOR_P (vals))
23429 const_vec = vals;
23430 else if (GET_CODE (vals) == PARALLEL)
23432 /* A CONST_VECTOR must contain only CONST_INTs and
23433 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
23434 Only store valid constants in a CONST_VECTOR. */
23435 int n_elts = XVECLEN (vals, 0);
23436 for (i = 0; i < n_elts; ++i)
23438 rtx x = XVECEXP (vals, 0, i);
23439 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23440 n_const++;
23442 if (n_const == n_elts)
23443 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
23445 else
23446 gcc_unreachable ();
23448 if (const_vec != NULL_RTX
23449 && aarch64_simd_valid_immediate (const_vec, NULL))
23450 /* Load using MOVI/MVNI. */
23451 return const_vec;
23452 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
23453 /* Loaded using DUP. */
23454 return const_dup;
23455 else if (const_vec != NULL_RTX)
23456 /* Load from constant pool. We cannot take advantage of single-cycle
23457 LD1 because we need a PC-relative addressing mode. */
23458 return const_vec;
23459 else
23460 /* A PARALLEL containing something not valid inside CONST_VECTOR.
23461 We cannot construct an initializer. */
23462 return NULL_RTX;
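/* Illustrative note (added for exposition; the exact instruction chosen
   depends on aarch64_simd_valid_immediate): for V4SImode,
   {1, 1, 1, 1} is a valid SIMD immediate and can be materialised with a
   single MOVI; {0x12345678, 0x12345678, 0x12345678, 0x12345678} is not
   an immediate but is a duplicate, so it is built with a scalar
   register move followed by DUP; and {1, 2, 3, 4} falls through to a
   literal-pool load.  */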
23465 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23466 The caller has already tried a divide-and-conquer approach, so do
23467 not consider that case here. */
23469 void
23470 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
23472 machine_mode mode = GET_MODE (target);
23473 scalar_mode inner_mode = GET_MODE_INNER (mode);
23474 /* The number of vector elements. */
23475 int n_elts = XVECLEN (vals, 0);
23476 /* The number of vector elements which are not constant. */
23477 int n_var = 0;
23478 rtx any_const = NULL_RTX;
23479 /* The first element of vals. */
23480 rtx v0 = XVECEXP (vals, 0, 0);
23481 bool all_same = true;
23483 /* This is a special vec_init<M><N> where N is not an element mode but a
23484 vector mode with half the elements of M. We expect to find two entries
23485 of mode N in VALS and we must put their concatenation into TARGET. */
23486 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
23488 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
23489 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
23490 && known_eq (GET_MODE_SIZE (mode),
23491 2 * GET_MODE_SIZE (narrow_mode)));
23492 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
23493 XVECEXP (vals, 0, 0),
23494 XVECEXP (vals, 0, 1)));
23495 return;
23498 /* Count the number of variable elements to initialise. */
23499 for (int i = 0; i < n_elts; ++i)
23501 rtx x = XVECEXP (vals, 0, i);
23502 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
23503 ++n_var;
23504 else
23505 any_const = x;
23507 all_same &= rtx_equal_p (x, v0);
23510 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23511 how best to handle this. */
23512 if (n_var == 0)
23514 rtx constant = aarch64_simd_make_constant (vals);
23515 if (constant != NULL_RTX)
23517 emit_move_insn (target, constant);
23518 return;
23522 /* Splat a single non-constant element if we can. */
23523 if (all_same)
23525 rtx x = force_reg (inner_mode, v0);
23526 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23527 return;
23530 enum insn_code icode = optab_handler (vec_set_optab, mode);
23531 gcc_assert (icode != CODE_FOR_nothing);
23533 /* If there are only variable elements, try to optimize
23534 the insertion using dup for the most common element
23535 followed by insertions. */
23537 /* The algorithm will fill matches[*][0] with the earliest matching element,
23538 and matches[X][1] with the count of duplicate elements (if X is the
23539 earliest element which has duplicates). */
23541 if (n_var >= n_elts - 1 && n_elts <= 16)
23543 int matches[16][2] = {0};
23544 for (int i = 0; i < n_elts; i++)
23546 for (int j = 0; j <= i; j++)
23548 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
23550 matches[i][0] = j;
23551 matches[j][1]++;
23552 break;
23556 int maxelement = 0;
23557 int maxv = 0;
23558 rtx const_elem = NULL_RTX;
23559 int const_elem_pos = 0;
23561 for (int i = 0; i < n_elts; i++)
23563 if (matches[i][1] > maxv)
23565 maxelement = i;
23566 maxv = matches[i][1];
23568 if (CONST_INT_P (XVECEXP (vals, 0, i))
23569 || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
23571 const_elem_pos = i;
23572 const_elem = XVECEXP (vals, 0, i);
23576 /* Create a duplicate of the most common element, unless all elements
23577 are equally useless to us, in which case just immediately set the
23578 vector register using the first element. */
23580 if (maxv == 1)
23582 /* For vectors of two 64-bit elements, we can do even better. */
23583 if (n_elts == 2
23584 && (inner_mode == E_DImode
23585 || inner_mode == E_DFmode))
23588 rtx x0 = XVECEXP (vals, 0, 0);
23589 rtx x1 = XVECEXP (vals, 0, 1);
23590 /* Combine can pick up this case, but handling it directly
23591 here leaves clearer RTL.
23593 This is load_pair_lanes<mode>, and also gives us a clean-up
23594 for store_pair_lanes<mode>. */
23595 if (memory_operand (x0, inner_mode)
23596 && memory_operand (x1, inner_mode)
23597 && aarch64_mergeable_load_pair_p (mode, x0, x1))
23599 rtx t;
23600 if (inner_mode == DFmode)
23601 t = gen_load_pair_lanesdf (target, x0, x1);
23602 else
23603 t = gen_load_pair_lanesdi (target, x0, x1);
23604 emit_insn (t);
23605 return;
23608 /* The subreg-move sequence below will move into lane zero of the
23609 vector register. For big-endian we want that position to hold
23610 the last element of VALS. */
23611 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
23613 /* If we have a single constant element, use that for duplicating
23614 instead. */
23615 if (const_elem)
23617 maxelement = const_elem_pos;
23618 aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
23620 else
23622 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23623 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
23626 else
23628 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23629 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23632 /* Insert the rest. */
23633 for (int i = 0; i < n_elts; i++)
23635 rtx x = XVECEXP (vals, 0, i);
23636 if (matches[i][0] == maxelement)
23637 continue;
23638 x = force_reg (inner_mode, x);
23639 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23641 return;
23644 /* Initialise a vector which is part-variable. We want to first try
23645 to build those lanes which are constant in the most efficient way we
23646 can. */
23647 if (n_var != n_elts)
23649 rtx copy = copy_rtx (vals);
23651 /* Load constant part of vector. We really don't care what goes into the
23652 parts we will overwrite, but we're more likely to be able to load the
23653 constant efficiently if it has fewer, larger, repeating parts
23654 (see aarch64_simd_valid_immediate). */
23655 for (int i = 0; i < n_elts; i++)
23657 rtx x = XVECEXP (vals, 0, i);
23658 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23659 continue;
23660 rtx subst = any_const;
23661 for (int bit = n_elts / 2; bit > 0; bit /= 2)
23663 /* Look in the copied vector, as more elements are const. */
23664 rtx test = XVECEXP (copy, 0, i ^ bit);
23665 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
23667 subst = test;
23668 break;
23671 XVECEXP (copy, 0, i) = subst;
23673 aarch64_expand_vector_init_fallback (target, copy);
23676 /* Insert the variable lanes directly. */
23677 for (int i = 0; i < n_elts; i++)
23679 rtx x = XVECEXP (vals, 0, i);
23680 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23681 continue;
23682 x = force_reg (inner_mode, x);
23683 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23687 /* Return even or odd half of VALS depending on EVEN_P. */
23689 static rtx
23690 aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
23692 int n = XVECLEN (vals, 0);
23693 machine_mode new_mode
23694 = aarch64_simd_container_mode (GET_MODE_INNER (mode),
23695 GET_MODE_BITSIZE (mode).to_constant () / 2);
23696 rtvec vec = rtvec_alloc (n / 2);
23697 for (int i = 0; i < n / 2; i++)
23698 RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
23699 : XVECEXP (vals, 0, 2 * i + 1);
23700 return gen_rtx_PARALLEL (new_mode, vec);
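/* Illustrative note (added for exposition, not part of the original
   sources): for a V4SImode PARALLEL {a, b, c, d},
   aarch64_unzip_vector_init returns the V2SImode PARALLEL {a, c} when
   EVEN_P is true and {b, d} when EVEN_P is false; the element mode is
   unchanged and only the container is halved.  */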
23703 /* Return true if SET is a scalar move. */
23705 static bool
23706 scalar_move_insn_p (rtx set)
23708 rtx src = SET_SRC (set);
23709 rtx dest = SET_DEST (set);
23710 return (is_a<scalar_mode> (GET_MODE (dest))
23711 && aarch64_mov_operand (src, GET_MODE (dest)));
23714 /* Similar to seq_cost, but ignore cost for scalar moves. */
23716 static unsigned
23717 seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed)
23719 unsigned cost = 0;
23721 for (; seq; seq = NEXT_INSN (seq))
23722 if (NONDEBUG_INSN_P (seq))
23724 if (rtx set = single_set (seq))
23726 if (!scalar_move_insn_p (set))
23727 cost += set_rtx_cost (set, speed);
23729 else
23731 int this_cost = insn_cost (CONST_CAST_RTX_INSN (seq), speed);
23732 if (this_cost > 0)
23733 cost += this_cost;
23734 else
23735 cost++;
23739 return cost;
23742 /* Expand a vector initialization sequence, such that TARGET is
23743 initialized to contain VALS. */
23745 void
23746 aarch64_expand_vector_init (rtx target, rtx vals)
23748 /* Try decomposing the initializer into even and odd halves and
23749 then ZIP them together. Use the resulting sequence if it is
23750 strictly cheaper than loading VALS directly.
23752 Prefer the fallback sequence in the event of a tie, since it
23753 will tend to use fewer registers. */
23755 machine_mode mode = GET_MODE (target);
23756 int n_elts = XVECLEN (vals, 0);
23758 if (n_elts < 4
23759 || maybe_ne (GET_MODE_BITSIZE (mode), 128))
23761 aarch64_expand_vector_init_fallback (target, vals);
23762 return;
23765 start_sequence ();
23766 rtx halves[2];
23767 unsigned costs[2];
23768 for (int i = 0; i < 2; i++)
23770 start_sequence ();
23771 rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
23772 rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
23773 aarch64_expand_vector_init (tmp_reg, new_vals);
23774 halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
23775 rtx_insn *rec_seq = get_insns ();
23776 end_sequence ();
23777 costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size);
23778 emit_insn (rec_seq);
23781 rtvec v = gen_rtvec (2, halves[0], halves[1]);
23782 rtx_insn *zip1_insn
23783 = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
23784 unsigned seq_total_cost
23785 = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
23786 seq_total_cost += insn_cost (zip1_insn, !optimize_size);
23788 rtx_insn *seq = get_insns ();
23789 end_sequence ();
23791 start_sequence ();
23792 aarch64_expand_vector_init_fallback (target, vals);
23793 rtx_insn *fallback_seq = get_insns ();
23794 unsigned fallback_seq_cost
23795 = seq_cost_ignoring_scalar_moves (fallback_seq, !optimize_size);
23796 end_sequence ();
23798 emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
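/* Illustrative note (added for exposition, not part of the original
   sources): for a 128-bit V4SImode initialiser {a, b, c, d}, the code
   above builds the even half {a, c} and the odd half {b, d} as 64-bit
   vectors and interleaves them with ZIP1 to recreate {a, b, c, d}.
   That candidate is emitted only when its cost (ignoring scalar moves)
   is strictly lower than the cost of the direct fallback expansion;
   ties go to the fallback, which tends to use fewer registers.  */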
23801 /* Emit RTL corresponding to:
23802 insr TARGET, ELEM. */
23804 static void
23805 emit_insr (rtx target, rtx elem)
23807 machine_mode mode = GET_MODE (target);
23808 scalar_mode elem_mode = GET_MODE_INNER (mode);
23809 elem = force_reg (elem_mode, elem);
23811 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
23812 gcc_assert (icode != CODE_FOR_nothing);
23813 emit_insn (GEN_FCN (icode) (target, target, elem));
23816 /* Subroutine of aarch64_sve_expand_vector_init for handling
23817 trailing constants.
23818 This function works as follows:
23819 (a) Create a new vector consisting of trailing constants.
23820 (b) Initialize TARGET with the constant vector using emit_move_insn.
23821 (c) Insert remaining elements in TARGET using insr.
23822 NELTS is the total number of elements in the original vector, while
23823 NELTS_REQD is the number of elements that are actually
23824 significant.
23826 ??? The heuristic used is to do the above only if the number of constants
23827 is at least half the total number of elements. May need fine tuning. */
23829 static bool
23830 aarch64_sve_expand_vector_init_handle_trailing_constants
23831 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
23833 machine_mode mode = GET_MODE (target);
23834 scalar_mode elem_mode = GET_MODE_INNER (mode);
23835 int n_trailing_constants = 0;
23837 for (int i = nelts_reqd - 1;
23838 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
23839 i--)
23840 n_trailing_constants++;
23842 if (n_trailing_constants >= nelts_reqd / 2)
23844 /* Try to use the natural pattern of BUILDER to extend the trailing
23845 constant elements to a full vector. Replace any variables in the
23846 extra elements with zeros.
23848 ??? It would be better if the builders supported "don't care"
23849 elements, with the builder filling in whichever elements
23850 give the most compact encoding. */
23851 rtx_vector_builder v (mode, nelts, 1);
23852 for (int i = 0; i < nelts; i++)
23854 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
23855 if (!valid_for_const_vector_p (elem_mode, x))
23856 x = CONST0_RTX (elem_mode);
23857 v.quick_push (x);
23859 rtx const_vec = v.build ();
23860 emit_move_insn (target, const_vec);
23862 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
23863 emit_insr (target, builder.elt (i));
23865 return true;
23868 return false;
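/* Illustrative sketch (added for exposition; the padding details follow
   from the code above): for BUILDER = {a, b, 1, 2} with
   NELTS = NELTS_REQD = 4, the two trailing constants meet the
   "at least half" threshold.  TARGET is first loaded with a constant
   vector starting {1, 2, 0, 0, ...} (the trailing constants, with
   variable positions replaced by zero), and the leading variable
   elements are then inserted in reverse order:

     insr  target, b        (significant elements now {b, 1, 2, 0})
     insr  target, a        (significant elements now {a, b, 1, 2})

   Each INSR shifts the vector up by one element and writes the scalar
   into element 0.  */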
23871 /* Subroutine of aarch64_sve_expand_vector_init.
23872 Works as follows:
23873 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
23874 (b) Skip trailing elements from BUILDER, which are the same as
23875 element NELTS_REQD - 1.
23876 (c) Insert earlier elements in reverse order in TARGET using insr. */
23878 static void
23879 aarch64_sve_expand_vector_init_insert_elems (rtx target,
23880 const rtx_vector_builder &builder,
23881 int nelts_reqd)
23883 machine_mode mode = GET_MODE (target);
23884 scalar_mode elem_mode = GET_MODE_INNER (mode);
23886 struct expand_operand ops[2];
23887 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
23888 gcc_assert (icode != CODE_FOR_nothing);
23890 create_output_operand (&ops[0], target, mode);
23891 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
23892 expand_insn (icode, 2, ops);
23894 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
23895 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
23896 emit_insr (target, builder.elt (i));
23899 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
23900 when all trailing elements of BUILDER are the same.
23901 This works as follows:
23902 (a) Use expand_insn interface to broadcast last vector element in TARGET.
23903 (b) Insert remaining elements in TARGET using insr.
23905 ??? The heuristic used is to do the above if the number of identical
23906 trailing elements is at least 3/4 of the total number of elements,
23907 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
23909 static bool
23910 aarch64_sve_expand_vector_init_handle_trailing_same_elem
23911 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
23913 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
23914 if (ndups >= (3 * nelts_reqd) / 4)
23916 aarch64_sve_expand_vector_init_insert_elems (target, builder,
23917 nelts_reqd - ndups + 1);
23918 return true;
23921 return false;
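/* Illustrative note (added for exposition, not part of the original
   sources): for BUILDER = {a, b, c, c, c, c, c, c} with NELTS_REQD = 8,
   the six trailing copies of c meet the 3/4 threshold, so the expansion
   is

     dup   target, c
     insr  target, b
     insr  target, a

   which yields {a, b, c, c, c, c, c, c}, mirroring the worked example
   in the comment before aarch64_sve_expand_vector_init below.  */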
23924 /* Initialize register TARGET from BUILDER. NELTS is the constant number
23925 of elements in BUILDER.
23927 The function tries to initialize TARGET from BUILDER if it fits one
23928 of the special cases outlined below.
23930 Failing that, the function divides BUILDER into two sub-vectors:
23931 v_even = even elements of BUILDER;
23932 v_odd = odd elements of BUILDER;
23934 and recursively calls itself with v_even and v_odd.
23936 if (recursive call succeeded for v_even or v_odd)
23937 TARGET = zip (v_even, v_odd)
23939 The function returns true if it managed to build TARGET from BUILDER
23940 with one of the special cases, false otherwise.
23942 Example: {a, 1, b, 2, c, 3, d, 4}
23944 The vector gets divided into:
23945 v_even = {a, b, c, d}
23946 v_odd = {1, 2, 3, 4}
23948 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
23949 initializes tmp2 from the constant vector v_odd using emit_move_insn.
23951 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
23952 4 elements, so we construct tmp1 from v_even using insr:
23953 tmp1 = dup(d)
23954 insr tmp1, c
23955 insr tmp1, b
23956 insr tmp1, a
23958 And finally:
23959 TARGET = zip (tmp1, tmp2)
23960 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
23962 static bool
23963 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
23964 int nelts, int nelts_reqd)
23966 machine_mode mode = GET_MODE (target);
23968 /* Case 1: Vector contains trailing constants. */
23970 if (aarch64_sve_expand_vector_init_handle_trailing_constants
23971 (target, builder, nelts, nelts_reqd))
23972 return true;
23974 /* Case 2: Vector contains leading constants. */
23976 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
23977 for (int i = 0; i < nelts_reqd; i++)
23978 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
23979 rev_builder.finalize ();
23981 if (aarch64_sve_expand_vector_init_handle_trailing_constants
23982 (target, rev_builder, nelts, nelts_reqd))
23984 emit_insn (gen_aarch64_sve_rev (mode, target, target));
23985 return true;
23988 /* Case 3: Vector contains trailing same element. */
23990 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
23991 (target, builder, nelts_reqd))
23992 return true;
23994 /* Case 4: Vector contains leading same element. */
23996 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
23997 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
23999 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24000 return true;
24003 /* Avoid recursing below 4 elements.
24004 ??? The threshold 4 may need fine-tuning. */
24006 if (nelts_reqd <= 4)
24007 return false;
24009 rtx_vector_builder v_even (mode, nelts, 1);
24010 rtx_vector_builder v_odd (mode, nelts, 1);
24012 for (int i = 0; i < nelts * 2; i += 2)
24014 v_even.quick_push (builder.elt (i));
24015 v_odd.quick_push (builder.elt (i + 1));
24018 v_even.finalize ();
24019 v_odd.finalize ();
24021 rtx tmp1 = gen_reg_rtx (mode);
24022 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
24023 nelts, nelts_reqd / 2);
24025 rtx tmp2 = gen_reg_rtx (mode);
24026 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
24027 nelts, nelts_reqd / 2);
24029 if (!did_even_p && !did_odd_p)
24030 return false;
24032 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24033 special cases and zip v_even, v_odd. */
24035 if (!did_even_p)
24036 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
24038 if (!did_odd_p)
24039 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
24041 rtvec v = gen_rtvec (2, tmp1, tmp2);
24042 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24043 return true;
24046 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24048 void
24049 aarch64_sve_expand_vector_init (rtx target, rtx vals)
24051 machine_mode mode = GET_MODE (target);
24052 int nelts = XVECLEN (vals, 0);
24054 rtx_vector_builder v (mode, nelts, 1);
24055 for (int i = 0; i < nelts; i++)
24056 v.quick_push (XVECEXP (vals, 0, i));
24057 v.finalize ();
24059 /* If neither sub-vector of v could be initialized specially,
24060 then use INSR to insert all elements from v into TARGET.
24061 ??? This might not be optimal for vectors with large
24062 initializers of 16 elements or more.
24063 For nelts < 4, it probably isn't useful to handle specially. */
24065 if (nelts < 4
24066 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
24067 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
24070 /* Check whether VALUE is a vector constant in which every element
24071 is either a power of 2 or a negated power of 2. If so, return
24072 a constant vector of log2s, and flip CODE between PLUS and MINUS
24073 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24075 static rtx
24076 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
24078 if (!CONST_VECTOR_P (value))
24079 return NULL_RTX;
24081 rtx_vector_builder builder;
24082 if (!builder.new_unary_operation (GET_MODE (value), value, false))
24083 return NULL_RTX;
24085 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
24086 /* 1 if the result of the multiplication must be negated,
24087 0 if it mustn't, or -1 if we don't yet care. */
24088 int negate = -1;
24089 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
24090 for (unsigned int i = 0; i < encoded_nelts; ++i)
24092 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
24093 if (!CONST_SCALAR_INT_P (elt))
24094 return NULL_RTX;
24095 rtx_mode_t val (elt, int_mode);
24096 wide_int pow2 = wi::neg (val);
24097 if (val != pow2)
24099 /* It matters whether we negate or not. Make that choice,
24100 and make sure that it's consistent with previous elements. */
24101 if (negate == !wi::neg_p (val))
24102 return NULL_RTX;
24103 negate = wi::neg_p (val);
24104 if (!negate)
24105 pow2 = val;
24107 /* POW2 is now the value that we want to be a power of 2. */
24108 int shift = wi::exact_log2 (pow2);
24109 if (shift < 0)
24110 return NULL_RTX;
24111 builder.quick_push (gen_int_mode (shift, int_mode));
24113 if (negate == -1)
24114 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24115 code = PLUS;
24116 else if (negate == 1)
24117 code = code == PLUS ? MINUS : PLUS;
24118 return builder.build ();
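/* Illustrative note (added for exposition, not part of the original
   sources): if the multiplier vector is {4, 4, 4, 4}, every element is
   2^2, so the function returns the shift vector {2, 2, 2, 2} and leaves
   CODE unchanged; the callers below then turn x * 4 + z into
   z + (x << 2).  If the multiplier is {-8, -8, -8, -8}, the returned
   shifts are {3, 3, 3, 3} and CODE is flipped from PLUS to MINUS,
   giving z - (x << 3).  */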
24121 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24122 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24123 operands array, in the same order as for fma_optab. Return true if
24124 the function emitted all the necessary instructions, false if the caller
24125 should generate the pattern normally with the new OPERANDS array. */
24127 bool
24128 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
24130 machine_mode mode = GET_MODE (operands[0]);
24131 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
24133 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
24134 NULL_RTX, true, OPTAB_DIRECT);
24135 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
24136 operands[3], product, operands[0], true,
24137 OPTAB_DIRECT);
24138 return true;
24140 operands[2] = force_reg (mode, operands[2]);
24141 return false;
24144 /* Likewise, but for a conditional pattern. */
24146 bool
24147 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
24149 machine_mode mode = GET_MODE (operands[0]);
24150 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
24152 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
24153 NULL_RTX, true, OPTAB_DIRECT);
24154 emit_insn (gen_cond (code, mode, operands[0], operands[1],
24155 operands[4], product, operands[5]));
24156 return true;
24158 operands[3] = force_reg (mode, operands[3]);
24159 return false;
24162 static unsigned HOST_WIDE_INT
24163 aarch64_shift_truncation_mask (machine_mode mode)
24165 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
24166 return 0;
24167 return GET_MODE_UNIT_BITSIZE (mode) - 1;
24170 /* Select a format to encode pointers in exception handling data. */
24172 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
24174 int type;
24175 switch (aarch64_cmodel)
24177 case AARCH64_CMODEL_TINY:
24178 case AARCH64_CMODEL_TINY_PIC:
24179 case AARCH64_CMODEL_SMALL:
24180 case AARCH64_CMODEL_SMALL_PIC:
24181 case AARCH64_CMODEL_SMALL_SPIC:
24182 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24183 for everything. */
24184 type = DW_EH_PE_sdata4;
24185 break;
24186 default:
24187 /* No assumptions here. 8-byte relocs required. */
24188 type = DW_EH_PE_sdata8;
24189 break;
24191 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
24194 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24196 static void
24197 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
24199 if (TREE_CODE (decl) == FUNCTION_DECL)
24201 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
24202 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
24204 fprintf (stream, "\t.variant_pcs\t");
24205 assemble_name (stream, name);
24206 fprintf (stream, "\n");
24211 /* The last .arch and .tune assembly strings that we printed. */
24212 static std::string aarch64_last_printed_arch_string;
24213 static std::string aarch64_last_printed_tune_string;
24215 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24216 by the function fndecl. */
24218 void
24219 aarch64_declare_function_name (FILE *stream, const char* name,
24220 tree fndecl)
24222 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
24224 struct cl_target_option *targ_options;
24225 if (target_parts)
24226 targ_options = TREE_TARGET_OPTION (target_parts);
24227 else
24228 targ_options = TREE_TARGET_OPTION (target_option_current_node);
24229 gcc_assert (targ_options);
24231 const struct processor *this_arch
24232 = aarch64_get_arch (targ_options->x_selected_arch);
24234 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
24235 std::string extension
24236 = aarch64_get_extension_string_for_isa_flags (isa_flags,
24237 this_arch->flags);
24238 /* Only update the assembler .arch string if it is distinct from the last
24239 such string we printed. */
24240 std::string to_print = this_arch->name + extension;
24241 if (to_print != aarch64_last_printed_arch_string)
24243 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
24244 aarch64_last_printed_arch_string = to_print;
24247 /* Print the cpu name we're tuning for in the comments; this might be
24248 useful to readers of the generated asm. Do it only when it changes
24249 from function to function and verbose assembly is requested. */
24250 const struct processor *this_tune
24251 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
24253 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
24255 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
24256 this_tune->name);
24257 aarch64_last_printed_tune_string = this_tune->name;
24260 aarch64_asm_output_variant_pcs (stream, fndecl, name);
24262 /* Don't forget the type directive for ELF. */
24263 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
24264 ASM_OUTPUT_FUNCTION_LABEL (stream, name, fndecl);
24266 cfun->machine->label_is_assembled = true;
24269 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24271 void
24272 aarch64_print_patchable_function_entry (FILE *file,
24273 unsigned HOST_WIDE_INT patch_area_size,
24274 bool record_p)
24276 if (!cfun->machine->label_is_assembled)
24278 /* Emit the patching area before the entry label, if any. */
24279 default_print_patchable_function_entry (file, patch_area_size,
24280 record_p);
24281 return;
24284 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
24285 GEN_INT (record_p));
24286 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
24288 if (!aarch_bti_enabled ()
24289 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
24291 /* Emit the patchable_area at the beginning of the function. */
24292 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
24293 INSN_ADDRESSES_NEW (insn, -1);
24294 return;
24297 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
24298 if (!insn
24299 || !INSN_P (insn)
24300 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
24301 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
24303 /* Emit a BTI_C. */
24304 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
24307 /* Emit the patchable_area after BTI_C. */
24308 insn = emit_insn_after (pa, insn);
24309 INSN_ADDRESSES_NEW (insn, -1);
24312 /* Output patchable area. */
24314 void
24315 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
24317 default_print_patchable_function_entry (asm_out_file, patch_area_size,
24318 record_p);
24321 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24323 void
24324 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
24326 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
24327 const char *value = IDENTIFIER_POINTER (target);
24328 aarch64_asm_output_variant_pcs (stream, decl, name);
24329 ASM_OUTPUT_DEF (stream, name, value);
24332 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24333 function symbol references. */
24335 void
24336 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
24338 default_elf_asm_output_external (stream, decl, name);
24339 aarch64_asm_output_variant_pcs (stream, decl, name);
24342 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24343 Used to output the .cfi_b_key_frame directive when signing the current
24344 function with the B key. */
24346 void
24347 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
24349 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
24350 && aarch_ra_sign_key == AARCH_KEY_B)
24351 asm_fprintf (f, "\t.cfi_b_key_frame\n");
24354 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24356 static void
24357 aarch64_start_file (void)
24359 struct cl_target_option *default_options
24360 = TREE_TARGET_OPTION (target_option_default_node);
24362 const struct processor *default_arch
24363 = aarch64_get_arch (default_options->x_selected_arch);
24364 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
24365 std::string extension
24366 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
24367 default_arch->flags);
24369 aarch64_last_printed_arch_string = default_arch->name + extension;
24370 aarch64_last_printed_tune_string = "";
24371 asm_fprintf (asm_out_file, "\t.arch %s\n",
24372 aarch64_last_printed_arch_string.c_str ());
24374 default_file_start ();
24377 /* Emit load exclusive. */
24379 static void
24380 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
24381 rtx mem, rtx model_rtx)
24383 if (mode == TImode)
24384 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
24385 gen_highpart (DImode, rval),
24386 mem, model_rtx));
24387 else
24388 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
24391 /* Emit store exclusive. */
24393 static void
24394 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
24395 rtx mem, rtx rval, rtx model_rtx)
24397 if (mode == TImode)
24398 emit_insn (gen_aarch64_store_exclusive_pair
24399 (bval, mem, operand_subword (rval, 0, 0, TImode),
24400 operand_subword (rval, 1, 0, TImode), model_rtx));
24401 else
24402 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
24405 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
24407 static void
24408 aarch64_emit_unlikely_jump (rtx insn)
24410 rtx_insn *jump = emit_jump_insn (insn);
24411 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
24414 /* We store the names of the various atomic helpers in a 5x5 array.
24415 Return the libcall function given MODE, MODEL and NAMES. */
24418 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
24419 const atomic_ool_names *names)
24421 memmodel model = memmodel_from_int (INTVAL (model_rtx));
24422 int mode_idx, model_idx;
24424 switch (mode)
24426 case E_QImode:
24427 mode_idx = 0;
24428 break;
24429 case E_HImode:
24430 mode_idx = 1;
24431 break;
24432 case E_SImode:
24433 mode_idx = 2;
24434 break;
24435 case E_DImode:
24436 mode_idx = 3;
24437 break;
24438 case E_TImode:
24439 mode_idx = 4;
24440 break;
24441 default:
24442 gcc_unreachable ();
24445 switch (model)
24447 case MEMMODEL_RELAXED:
24448 model_idx = 0;
24449 break;
24450 case MEMMODEL_CONSUME:
24451 case MEMMODEL_ACQUIRE:
24452 model_idx = 1;
24453 break;
24454 case MEMMODEL_RELEASE:
24455 model_idx = 2;
24456 break;
24457 case MEMMODEL_ACQ_REL:
24458 case MEMMODEL_SEQ_CST:
24459 model_idx = 3;
24460 break;
24461 case MEMMODEL_SYNC_ACQUIRE:
24462 case MEMMODEL_SYNC_RELEASE:
24463 case MEMMODEL_SYNC_SEQ_CST:
24464 model_idx = 4;
24465 break;
24466 default:
24467 gcc_unreachable ();
24470 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
24471 VISIBILITY_HIDDEN);
24474 #define DEF0(B, N) \
24475 { "__aarch64_" #B #N "_relax", \
24476 "__aarch64_" #B #N "_acq", \
24477 "__aarch64_" #B #N "_rel", \
24478 "__aarch64_" #B #N "_acq_rel", \
24479 "__aarch64_" #B #N "_sync" }
24481 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24482 { NULL, NULL, NULL, NULL }
24483 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24485 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
24486 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
24487 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
24488 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
24489 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
24490 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
24492 #undef DEF0
24493 #undef DEF4
24494 #undef DEF5
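/* Illustrative note (added for exposition, not part of the original
   sources): DEF0 (cas, 4) expands to the five names
   "__aarch64_cas4_relax", "__aarch64_cas4_acq", "__aarch64_cas4_rel",
   "__aarch64_cas4_acq_rel" and "__aarch64_cas4_sync", so a 4-byte
   (SImode) compare-and-swap with an ACQUIRE memory model resolves to
   str[2][1], i.e. "__aarch64_cas4_acq", in aarch64_atomic_ool_func
   above.  */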
24496 /* Expand a compare and swap pattern. */
24498 void
24499 aarch64_expand_compare_and_swap (rtx operands[])
24501 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
24502 machine_mode mode, r_mode;
24504 bval = operands[0];
24505 rval = operands[1];
24506 mem = operands[2];
24507 oldval = operands[3];
24508 newval = operands[4];
24509 is_weak = operands[5];
24510 mod_s = operands[6];
24511 mod_f = operands[7];
24512 mode = GET_MODE (mem);
24514 /* Normally the succ memory model must be stronger than fail, but in the
24515 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24516 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24517 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
24518 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
24519 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
24521 r_mode = mode;
24522 if (mode == QImode || mode == HImode)
24524 r_mode = SImode;
24525 rval = gen_reg_rtx (r_mode);
24528 if (TARGET_LSE)
24530 /* The CAS insn requires oldval and rval to overlap, but we need to
24531 have a copy of oldval saved across the operation to tell if
24532 the operation is successful. */
24533 if (reg_overlap_mentioned_p (rval, oldval))
24534 rval = copy_to_mode_reg (r_mode, oldval);
24535 else
24536 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
24538 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
24539 newval, mod_s));
24540 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24542 else if (TARGET_OUTLINE_ATOMICS)
24544 /* Oldval must satisfy compare afterward. */
24545 if (!aarch64_plus_operand (oldval, mode))
24546 oldval = force_reg (mode, oldval);
24547 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
24548 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
24549 oldval, mode, newval, mode,
24550 XEXP (mem, 0), Pmode);
24551 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24553 else
24555 /* The oldval predicate varies by mode. Test it and force to reg. */
24556 insn_code code = code_for_aarch64_compare_and_swap (mode);
24557 if (!insn_data[code].operand[2].predicate (oldval, mode))
24558 oldval = force_reg (mode, oldval);
24560 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
24561 is_weak, mod_s, mod_f));
24562 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
24565 if (r_mode != mode)
24566 rval = gen_lowpart (mode, rval);
24567 emit_move_insn (operands[1], rval);
24569 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
24570 emit_insn (gen_rtx_SET (bval, x));
24573 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
24574 sequence implementing an atomic operation. */
24576 static void
24577 aarch64_emit_post_barrier (enum memmodel model)
24579 const enum memmodel base_model = memmodel_base (model);
24581 if (is_mm_sync (model)
24582 && (base_model == MEMMODEL_ACQUIRE
24583 || base_model == MEMMODEL_ACQ_REL
24584 || base_model == MEMMODEL_SEQ_CST))
24586 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
24590 /* Split a compare and swap pattern. */
24592 void
24593 aarch64_split_compare_and_swap (rtx operands[])
24595 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24596 gcc_assert (epilogue_completed);
24598 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
24599 machine_mode mode;
24600 bool is_weak;
24601 rtx_code_label *label1, *label2;
24602 enum memmodel model;
24604 rval = operands[0];
24605 mem = operands[1];
24606 oldval = operands[2];
24607 newval = operands[3];
24608 model_rtx = operands[5];
24609 scratch = operands[7];
24610 mode = GET_MODE (mem);
24611 model = memmodel_from_int (INTVAL (model_rtx));
24612 is_weak = operands[4] != const0_rtx && mode != TImode;
24614 /* When OLDVAL is zero and we want the strong version we can emit a tighter
24615 loop:
24616 .label1:
24617 LD[A]XR rval, [mem]
24618 CBNZ rval, .label2
24619 ST[L]XR scratch, newval, [mem]
24620 CBNZ scratch, .label1
24621 .label2:
24622 CMP rval, 0. */
24623 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
24624 oldval == const0_rtx && mode != TImode);
24626 label1 = NULL;
24627 if (!is_weak)
24629 label1 = gen_label_rtx ();
24630 emit_label (label1);
24632 label2 = gen_label_rtx ();
24634 /* The initial load can be relaxed for a __sync operation since a final
24635 barrier will be emitted to stop code hoisting. */
24636 if (is_mm_sync (model))
24637 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
24638 else
24639 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
24641 if (strong_zero_p)
24642 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
24643 else
24645 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24646 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
24648 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24649 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
24650 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24652 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
24654 if (!is_weak)
24656 if (aarch64_track_speculation)
24658 /* Emit an explicit compare instruction, so that we can correctly
24659 track the condition codes. */
24660 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24661 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
24663 else
24664 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
24666 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24667 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
24668 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24670 else
24671 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24673 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
24674 store the returned value and loop if the STLXP fails. */
24675 if (mode == TImode)
24677 rtx_code_label *label3 = gen_label_rtx ();
24678 emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
24679 emit_barrier ();
24681 emit_label (label2);
24682 aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
24684 if (aarch64_track_speculation)
24686 /* Emit an explicit compare instruction, so that we can correctly
24687 track the condition codes. */
24688 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24689 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
24691 else
24692 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
24693 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24694 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
24695 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24697 label2 = label3;
24700 emit_label (label2);
24702 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
24703 to set the condition flags. If this is not used it will be removed by
24704 later passes. */
24705 if (strong_zero_p)
24706 aarch64_gen_compare_reg (NE, rval, const0_rtx);
24708 /* Emit any final barrier needed for a __sync operation. */
24709 if (is_mm_sync (model))
24710 aarch64_emit_post_barrier (model);
24713 /* Split an atomic operation. */
24715 void
24716 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
24717 rtx value, rtx model_rtx, rtx cond)
24719 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24720 gcc_assert (epilogue_completed);
24722 machine_mode mode = GET_MODE (mem);
24723 machine_mode wmode = (mode == DImode ? DImode : SImode);
24724 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
24725 const bool is_sync = is_mm_sync (model);
24726 rtx_code_label *label;
24727 rtx x;
24729 /* Split the atomic operation into a sequence. */
24730 label = gen_label_rtx ();
24731 emit_label (label);
24733 if (new_out)
24734 new_out = gen_lowpart (wmode, new_out);
24735 if (old_out)
24736 old_out = gen_lowpart (wmode, old_out);
24737 else
24738 old_out = new_out;
24739 value = simplify_gen_subreg (wmode, value, mode, 0);
24741 /* The initial load can be relaxed for a __sync operation since a final
24742 barrier will be emitted to stop code hoisting. */
24743 if (is_sync)
24744 aarch64_emit_load_exclusive (mode, old_out, mem,
24745 GEN_INT (MEMMODEL_RELAXED));
24746 else
24747 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
24749 switch (code)
24751 case SET:
24752 new_out = value;
24753 break;
24755 case NOT:
24756 x = gen_rtx_AND (wmode, old_out, value);
24757 emit_insn (gen_rtx_SET (new_out, x));
24758 x = gen_rtx_NOT (wmode, new_out);
24759 emit_insn (gen_rtx_SET (new_out, x));
24760 break;
24762 case MINUS:
24763 if (CONST_INT_P (value))
24765 value = GEN_INT (-UINTVAL (value));
24766 code = PLUS;
24768 /* Fall through. */
24770 default:
24771 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
24772 emit_insn (gen_rtx_SET (new_out, x));
24773 break;
24776 aarch64_emit_store_exclusive (mode, cond, mem,
24777 gen_lowpart (mode, new_out), model_rtx);
24779 if (aarch64_track_speculation)
24781 /* Emit an explicit compare instruction, so that we can correctly
24782 track the condition codes. */
24783 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
24784 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
24786 else
24787 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
24789 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24790 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
24791 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24793 /* Emit any final barrier needed for a __sync operation. */
24794 if (is_sync)
24795 aarch64_emit_post_barrier (model);
24798 static void
24799 aarch64_init_libfuncs (void)
24801 /* Half-precision float operations. The compiler handles all operations
24802 with NULL libfuncs by converting to SFmode. */
24804 /* Conversions. */
24805 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
24806 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
24808 /* Arithmetic. */
24809 set_optab_libfunc (add_optab, HFmode, NULL);
24810 set_optab_libfunc (sdiv_optab, HFmode, NULL);
24811 set_optab_libfunc (smul_optab, HFmode, NULL);
24812 set_optab_libfunc (neg_optab, HFmode, NULL);
24813 set_optab_libfunc (sub_optab, HFmode, NULL);
24815 /* Comparisons. */
24816 set_optab_libfunc (eq_optab, HFmode, NULL);
24817 set_optab_libfunc (ne_optab, HFmode, NULL);
24818 set_optab_libfunc (lt_optab, HFmode, NULL);
24819 set_optab_libfunc (le_optab, HFmode, NULL);
24820 set_optab_libfunc (ge_optab, HFmode, NULL);
24821 set_optab_libfunc (gt_optab, HFmode, NULL);
24822 set_optab_libfunc (unord_optab, HFmode, NULL);
24825 /* Target hook for c_mode_for_suffix. */
24826 static machine_mode
24827 aarch64_c_mode_for_suffix (char suffix)
24829 if (suffix == 'q')
24830 return TFmode;
24832 return VOIDmode;
24835 /* We can only represent floating point constants which will fit in
24836 "quarter-precision" values. These values are characterised by
24837 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by
24840 (-1)^s * (n/16) * 2^r
24842 Where:
24843 's' is the sign bit.
24844 'n' is an integer in the range 16 <= n <= 31.
24845 'r' is an integer in the range -3 <= r <= 4. */
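/* Illustrative note (added for exposition, not part of the original
   sources): 1.5 = (24/16) * 2^0 and 0.25 = (16/16) * 2^-2 both fit the
   form above and so are valid FMOV immediates, whereas 100.0 would need
   n = 100 with r = 4, outside the 16..31 range for n, and is therefore
   rejected by aarch64_float_const_representable_p below.  */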
24847 /* Return true iff X can be represented by a quarter-precision
24848 floating point immediate operand. Note, we cannot represent 0.0. */
24849 bool
24850 aarch64_float_const_representable_p (rtx x)
24852 /* This represents our current view of how many bits
24853 make up the mantissa. */
24854 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
24855 int exponent;
24856 unsigned HOST_WIDE_INT mantissa, mask;
24857 REAL_VALUE_TYPE r, m;
24858 bool fail;
24860 x = unwrap_const_vec_duplicate (x);
24861 if (!CONST_DOUBLE_P (x))
24862 return false;
24864 if (GET_MODE (x) == VOIDmode
24865 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
24866 return false;
24868 r = *CONST_DOUBLE_REAL_VALUE (x);
24870 /* We cannot represent infinities, NaNs or +/-zero. We won't
24871 know if we have +zero until we analyse the mantissa, but we
24872 can reject the other invalid values. */
24873 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
24874 || REAL_VALUE_MINUS_ZERO (r))
24875 return false;
24877 /* For BFmode, only handle 0.0. */
24878 if (GET_MODE (x) == BFmode)
24879 return real_iszero (&r, false);
24881 /* Extract exponent. */
24882 r = real_value_abs (&r);
24883 exponent = REAL_EXP (&r);
24885 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
24886 highest (sign) bit, with a fixed binary point at bit point_pos.
24887 The low and high halves of the result hold the low and high parts of the mantissa.
24888 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
24889 bits for the mantissa, this can fail (low bits will be lost). */
24890 real_ldexp (&m, &r, point_pos - exponent);
24891 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
24893 /* If the low part of the mantissa has bits set we cannot represent
24894 the value. */
24895 if (w.ulow () != 0)
24896 return false;
24897 /* We have rejected the lower HOST_WIDE_INT, so update our
24898 understanding of how many bits lie in the mantissa and
24899 look only at the high HOST_WIDE_INT. */
24900 mantissa = w.elt (1);
24901 point_pos -= HOST_BITS_PER_WIDE_INT;
24903 /* We can only represent values with a mantissa of the form 1.xxxx. */
24904 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
24905 if ((mantissa & mask) != 0)
24906 return false;
24908 /* Having filtered unrepresentable values, we may now remove all
24909 but the highest 5 bits. */
24910 mantissa >>= point_pos - 5;
24912 /* We cannot represent the value 0.0, so reject it. This is handled
24913 elsewhere. */
24914 if (mantissa == 0)
24915 return false;
24917 /* Then, as bit 4 is always set, we can mask it off, leaving
24918 the mantissa in the range [0, 15]. */
24919 mantissa &= ~(1 << 4);
24920 gcc_assert (mantissa <= 15);
24922 /* GCC internally does not use IEEE754-like encoding (where normalized
24923 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
24924 Our mantissa values are shifted 4 places to the left relative to
24925 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
24926 by 5 places to correct for GCC's representation. */
24927 exponent = 5 - exponent;
24929 return (exponent >= 0 && exponent <= 7);
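/* As a worked illustration of the check above (an illustrative sketch, not
   part of the compiler): the representable immediates are exactly the
   2 * 16 * 8 = 256 values (-1)^s * (n/16) * 2^r, for example 1.0 (n=16, r=0),
   1.5 (n=24, r=0), 0.125 (n=16, r=-3) and 31.0 (n=31, r=4).  A standalone
   program that enumerates them, assuming nothing beyond the formula quoted
   above:

     #include <stdio.h>

     int
     main (void)
     {
       for (int s = 0; s <= 1; s++)
         for (int r = -3; r <= 4; r++)
           for (int n = 16; n <= 31; n++)
             {
               double val = (double) n / 16.0;
               val = r >= 0 ? val * (1 << r) : val / (1 << -r);
               printf ("%g\n", s ? -val : val);
             }
       return 0;
     }  */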
24932 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
24933 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
24934 output MOVI/MVNI, ORR or BIC immediate. */
24935 char*
24936 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
24937 enum simd_immediate_check which)
24939 bool is_valid;
24940 static char templ[40];
24941 const char *mnemonic;
24942 const char *shift_op;
24943 unsigned int lane_count = 0;
24944 char element_char;
24946 struct simd_immediate_info info;
24948 /* This will return true to show const_vector is legal for use as either
24949 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
24950 It will also update INFO to show how the immediate should be generated.
24951 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
24952 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
24953 gcc_assert (is_valid);
24955 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
24956 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
24958 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
24960 gcc_assert (info.insn == simd_immediate_info::MOV
24961 && info.u.mov.shift == 0);
24962 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
24963 move immediate path. */
24964 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
24965 info.u.mov.value = GEN_INT (0);
24966 else
24968 const unsigned int buf_size = 20;
24969 char float_buf[buf_size] = {'\0'};
24970 real_to_decimal_for_mode (float_buf,
24971 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
24972 buf_size, buf_size, 1, info.elt_mode);
24974 if (lane_count == 1)
24975 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
24976 else
24977 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
24978 lane_count, element_char, float_buf);
24979 return templ;
24983 gcc_assert (CONST_INT_P (info.u.mov.value));
24985 if (which == AARCH64_CHECK_MOV)
24987 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
24988 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
24989 ? "msl" : "lsl");
24990 if (lane_count == 1)
24991 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
24992 mnemonic, UINTVAL (info.u.mov.value));
24993 else if (info.u.mov.shift)
24994 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
24995 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
24996 element_char, UINTVAL (info.u.mov.value), shift_op,
24997 info.u.mov.shift);
24998 else
24999 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25000 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
25001 element_char, UINTVAL (info.u.mov.value));
25003 else
25005 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25006 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
25007 if (info.u.mov.shift)
25008 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25009 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
25010 element_char, UINTVAL (info.u.mov.value), "lsl",
25011 info.u.mov.shift);
25012 else
25013 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25014 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
25015 element_char, UINTVAL (info.u.mov.value));
25017 return templ;
25020 char*
25021 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
25024 /* If a floating point number was passed and we desire to use it in an
25025 integer mode, do the conversion to integer. */
25026 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
25028 unsigned HOST_WIDE_INT ival;
25029 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
25030 gcc_unreachable ();
25031 immediate = gen_int_mode (ival, mode);
25034 machine_mode vmode;
25035 /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
25036 a 128-bit vector mode. */
25037 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
25039 vmode = aarch64_simd_container_mode (mode, width);
25040 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
25041 return aarch64_output_simd_mov_immediate (v_op, width);
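/* aarch64_reinterpret_float_as_int above recovers the raw bit pattern of the
   floating-point constant so it can be handled as an integer immediate.
   Outside the compiler the same idea can be sketched (illustrative only,
   not GCC API) as:

     #include <stdint.h>
     #include <string.h>

     static uint64_t
     double_bits (double d)
     {
       uint64_t u;
       memcpy (&u, &d, sizeof u);  // reinterpret the bits, don't convert
       return u;
     }  */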
25044 /* Return the output string to use for moving immediate CONST_VECTOR
25045 into an SVE register. */
25047 char *
25048 aarch64_output_sve_mov_immediate (rtx const_vector)
25050 static char templ[40];
25051 struct simd_immediate_info info;
25052 char element_char;
25054 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
25055 gcc_assert (is_valid);
25057 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25059 machine_mode vec_mode = GET_MODE (const_vector);
25060 if (aarch64_sve_pred_mode_p (vec_mode))
25062 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
25063 if (info.insn == simd_immediate_info::MOV)
25065 gcc_assert (info.u.mov.value == const0_rtx);
25066 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
25068 else
25070 gcc_assert (info.insn == simd_immediate_info::PTRUE);
25071 unsigned int total_bytes;
25072 if (info.u.pattern == AARCH64_SV_ALL
25073 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
25074 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
25075 total_bytes / GET_MODE_SIZE (info.elt_mode));
25076 else
25077 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
25078 svpattern_token (info.u.pattern));
25080 return buf;
25083 if (info.insn == simd_immediate_info::INDEX)
25085 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
25086 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25087 element_char, INTVAL (info.u.index.base),
25088 INTVAL (info.u.index.step));
25089 return templ;
25092 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25094 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25095 info.u.mov.value = GEN_INT (0);
25096 else
25098 const int buf_size = 20;
25099 char float_buf[buf_size] = {};
25100 real_to_decimal_for_mode (float_buf,
25101 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25102 buf_size, buf_size, 1, info.elt_mode);
25104 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
25105 element_char, float_buf);
25106 return templ;
25110 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25111 element_char, INTVAL (info.u.mov.value));
25112 return templ;
25115 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25116 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25117 pattern. */
25119 char *
25120 aarch64_output_sve_ptrues (rtx const_unspec)
25122 static char templ[40];
25124 struct simd_immediate_info info;
25125 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
25126 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
25128 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25129 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
25130 svpattern_token (info.u.pattern));
25131 return templ;
25134 /* Split operands into moves from op[1] + op[2] into op[0]. */
25136 void
25137 aarch64_split_combinev16qi (rtx operands[3])
25139 unsigned int dest = REGNO (operands[0]);
25140 unsigned int src1 = REGNO (operands[1]);
25141 unsigned int src2 = REGNO (operands[2]);
25142 machine_mode halfmode = GET_MODE (operands[1]);
25143 unsigned int halfregs = REG_NREGS (operands[1]);
25144 rtx destlo, desthi;
25146 gcc_assert (halfmode == V16QImode);
25148 if (src1 == dest && src2 == dest + halfregs)
25150 /* No-op move. Can't split to nothing; emit something. */
25151 emit_note (NOTE_INSN_DELETED);
25152 return;
25155 /* Preserve register attributes for variable tracking. */
25156 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
25157 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
25158 GET_MODE_SIZE (halfmode));
25160 /* Special case of reversed high/low parts. */
25161 if (reg_overlap_mentioned_p (operands[2], destlo)
25162 && reg_overlap_mentioned_p (operands[1], desthi))
25164 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25165 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
25166 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25168 else if (!reg_overlap_mentioned_p (operands[2], destlo))
25170 /* Try to avoid unnecessary moves if part of the result
25171 is in the right place already. */
25172 if (src1 != dest)
25173 emit_move_insn (destlo, operands[1]);
25174 if (src2 != dest + halfregs)
25175 emit_move_insn (desthi, operands[2]);
25177 else
25179 if (src2 != dest + halfregs)
25180 emit_move_insn (desthi, operands[2]);
25181 if (src1 != dest)
25182 emit_move_insn (destlo, operands[1]);
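/* The three XORs emitted for the reversed case above swap the two halves
   without needing a scratch register.  A scalar sketch of the identity,
   assuming the two operands are distinct (as they are here):

     static void
     xor_swap (unsigned *a, unsigned *b)
     {
       *a ^= *b;  // a = a0 ^ b0
       *b ^= *a;  // b = b0 ^ (a0 ^ b0) = a0
       *a ^= *b;  // a = (a0 ^ b0) ^ a0 = b0
     }  */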
25186 /* vec_perm support. */
25188 struct expand_vec_perm_d
25190 rtx target, op0, op1;
25191 vec_perm_indices perm;
25192 machine_mode vmode;
25193 machine_mode op_mode;
25194 unsigned int vec_flags;
25195 unsigned int op_vec_flags;
25196 bool one_vector_p;
25197 bool testing_p;
25200 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25202 /* Generate a variable permutation. */
25204 static void
25205 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
25207 machine_mode vmode = GET_MODE (target);
25208 bool one_vector_p = rtx_equal_p (op0, op1);
25210 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
25211 gcc_checking_assert (GET_MODE (op0) == vmode);
25212 gcc_checking_assert (GET_MODE (op1) == vmode);
25213 gcc_checking_assert (GET_MODE (sel) == vmode);
25214 gcc_checking_assert (TARGET_SIMD);
25216 if (one_vector_p)
25218 if (vmode == V8QImode)
25220 /* Expand the argument to a V16QI mode by duplicating it. */
25221 rtx pair = gen_reg_rtx (V16QImode);
25222 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
25223 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25225 else
25227 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
25230 else
25232 rtx pair;
25234 if (vmode == V8QImode)
25236 pair = gen_reg_rtx (V16QImode);
25237 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
25238 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25240 else
25242 pair = gen_reg_rtx (V2x16QImode);
25243 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
25244 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
25249 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25250 NELT is the number of elements in the vector. */
25252 void
25253 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
25254 unsigned int nelt)
25256 machine_mode vmode = GET_MODE (target);
25257 bool one_vector_p = rtx_equal_p (op0, op1);
25258 rtx mask;
25260 /* The TBL instruction does not use a modulo index, so we must take care
25261 of that ourselves. */
25262 mask = aarch64_simd_gen_const_vector_dup (vmode,
25263 one_vector_p ? nelt - 1 : 2 * nelt - 1);
25264 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
25266 /* For big-endian, we also need to reverse the index within the vector
25267 (but not which vector). */
25268 if (BYTES_BIG_ENDIAN)
25270 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25271 if (!one_vector_p)
25272 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
25273 sel = expand_simple_binop (vmode, XOR, sel, mask,
25274 NULL, 0, OPTAB_LIB_WIDEN);
25276 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
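/* A scalar model of the selector fixups above (illustrative only).  For two
   V16QI inputs (NELT = 16) a vec_perm index of 35 wraps to 35 & 31 = 3, and
   on big-endian the lane index within each input vector is additionally
   flipped, e.g. 3 ^ 15 = 12, while the bit selecting which vector is kept:

     static unsigned
     fixup_index (unsigned idx, unsigned nelt, bool one_vector_p,
                  bool big_endian)
     {
       idx &= one_vector_p ? nelt - 1 : 2 * nelt - 1;  // modulo wrap
       if (big_endian)
         idx ^= nelt - 1;  // reverse the index within each vector
       return idx;
     }  */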
25279 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
25281 static void
25282 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
25284 emit_insn (gen_rtx_SET (target,
25285 gen_rtx_UNSPEC (GET_MODE (target),
25286 gen_rtvec (2, op0, op1), code)));
25289 /* Expand an SVE vec_perm with the given operands. */
25291 void
25292 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
25294 machine_mode data_mode = GET_MODE (target);
25295 machine_mode sel_mode = GET_MODE (sel);
25296 /* Enforced by the pattern condition. */
25297 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
25299 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25300 size of the two value vectors, i.e. the upper bits of the indices
25301 are effectively ignored. SVE TBL instead produces 0 for any
25302 out-of-range indices, so we need to modulo all the vec_perm indices
25303 to ensure they are all in range. */
25304 rtx sel_reg = force_reg (sel_mode, sel);
25306 /* Check if the sel only references the first values vector. */
25307 if (CONST_VECTOR_P (sel)
25308 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
25310 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
25311 return;
25314 /* Check if the two values vectors are the same. */
25315 if (rtx_equal_p (op0, op1))
25317 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
25318 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25319 NULL, 0, OPTAB_DIRECT);
25320 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
25321 return;
25324 /* Run TBL for each value vector and combine the results. */
25326 rtx res0 = gen_reg_rtx (data_mode);
25327 rtx res1 = gen_reg_rtx (data_mode);
25328 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
25329 if (!CONST_VECTOR_P (sel)
25330 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
25332 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
25333 2 * nunits - 1);
25334 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25335 NULL, 0, OPTAB_DIRECT);
25337 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
25338 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
25339 NULL, 0, OPTAB_DIRECT);
25340 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
25341 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
25342 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
25343 else
25344 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
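/* A scalar model of the two-TBL combination above (illustrative only).  With
   SEL already reduced modulo 2 * NELT, SVE TBL returns 0 for out-of-range
   selector elements, so the lookup in OP0 and the lookup in OP1 (with the
   selector rebased by -NELT) give the selected element from one table and
   zero from the other, and an OR (or UNSPEC_IORF for FP modes) merges them:

     static unsigned char
     tbl2 (const unsigned char *op0, const unsigned char *op1,
           unsigned sel, unsigned nelt)
     {
       unsigned char a = sel < nelt ? op0[sel] : 0;  // TBL on op0
       unsigned char b = (sel >= nelt && sel < 2 * nelt
                          ? op1[sel - nelt] : 0);    // TBL on op1
       return a | b;
     }  */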
25347 /* Recognize patterns suitable for the TRN instructions. */
25348 static bool
25349 aarch64_evpc_trn (struct expand_vec_perm_d *d)
25351 HOST_WIDE_INT odd;
25352 poly_uint64 nelt = d->perm.length ();
25353 rtx out, in0, in1;
25354 machine_mode vmode = d->vmode;
25356 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25357 return false;
25359 /* Note that these are little-endian tests.
25360 We correct for big-endian later. */
25361 if (!d->perm[0].is_constant (&odd)
25362 || (odd != 0 && odd != 1)
25363 || !d->perm.series_p (0, 2, odd, 2)
25364 || !d->perm.series_p (1, 2, nelt + odd, 2))
25365 return false;
25367 /* Success! */
25368 if (d->testing_p)
25369 return true;
25371 in0 = d->op0;
25372 in1 = d->op1;
25373 /* We don't need a big-endian lane correction for SVE; see the comment
25374 at the head of aarch64-sve.md for details. */
25375 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25377 std::swap (in0, in1);
25378 odd = !odd;
25380 out = d->target;
25382 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25383 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
25384 return true;
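/* Concretely, for V4SI (NELT = 4) the permutations accepted above are
   { 0, 4, 2, 6 } (TRN1, odd == 0) and { 1, 5, 3, 7 } (TRN2, odd == 1).
   An equivalent check over a fully constant index array (illustrative
   sketch only):

     static bool
     is_trn (const int *perm, int nelt, int odd)
     {
       for (int i = 0; i < nelt; i += 2)
         if (perm[i] != i + odd || perm[i + 1] != nelt + i + odd)
           return false;
       return true;
     }  */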
25387 /* Try to re-encode the PERM constant so it combines odd and even elements.
25388 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25389 We retry with this new constant with the full suite of patterns. */
25390 static bool
25391 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
25393 expand_vec_perm_d newd;
25394 unsigned HOST_WIDE_INT nelt;
25396 if (d->vec_flags != VEC_ADVSIMD)
25397 return false;
25399 /* Get the new mode. Always twice the size of the inner
25400 and half the elements. */
25401 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
25402 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
25403 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
25404 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
25406 if (new_mode == word_mode)
25407 return false;
25409 /* to_constant is safe since this routine is specific to Advanced SIMD
25410 vectors. */
25411 nelt = d->perm.length ().to_constant ();
25413 vec_perm_builder newpermconst;
25414 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
25416 /* Convert the perm constant if we can. Require even, odd as the pairs. */
25417 for (unsigned int i = 0; i < nelt; i += 2)
25419 poly_int64 elt0 = d->perm[i];
25420 poly_int64 elt1 = d->perm[i + 1];
25421 poly_int64 newelt;
25422 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
25423 return false;
25424 newpermconst.quick_push (newelt.to_constant ());
25426 newpermconst.finalize ();
25428 newd.vmode = new_mode;
25429 newd.vec_flags = VEC_ADVSIMD;
25430 newd.op_mode = newd.vmode;
25431 newd.op_vec_flags = newd.vec_flags;
25432 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
25433 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
25434 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
25435 newd.testing_p = d->testing_p;
25436 newd.one_vector_p = d->one_vector_p;
25438 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
25439 return aarch64_expand_vec_perm_const_1 (&newd);
25442 /* Recognize patterns suitable for the UZP instructions. */
25443 static bool
25444 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
25446 HOST_WIDE_INT odd;
25447 rtx out, in0, in1;
25448 machine_mode vmode = d->vmode;
25450 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25451 return false;
25453 /* Note that these are little-endian tests.
25454 We correct for big-endian later. */
25455 if (!d->perm[0].is_constant (&odd)
25456 || (odd != 0 && odd != 1)
25457 || !d->perm.series_p (0, 1, odd, 2))
25458 return false;
25460 /* Success! */
25461 if (d->testing_p)
25462 return true;
25464 in0 = d->op0;
25465 in1 = d->op1;
25466 /* We don't need a big-endian lane correction for SVE; see the comment
25467 at the head of aarch64-sve.md for details. */
25468 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25470 std::swap (in0, in1);
25471 odd = !odd;
25473 out = d->target;
25475 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25476 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
25477 return true;
25480 /* Recognize patterns suitable for the ZIP instructions. */
25481 static bool
25482 aarch64_evpc_zip (struct expand_vec_perm_d *d)
25484 unsigned int high;
25485 poly_uint64 nelt = d->perm.length ();
25486 rtx out, in0, in1;
25487 machine_mode vmode = d->vmode;
25489 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25490 return false;
25492 /* Note that these are little-endian tests.
25493 We correct for big-endian later. */
25494 poly_uint64 first = d->perm[0];
25495 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
25496 || !d->perm.series_p (0, 2, first, 1)
25497 || !d->perm.series_p (1, 2, first + nelt, 1))
25498 return false;
25499 high = maybe_ne (first, 0U);
25501 /* Success! */
25502 if (d->testing_p)
25503 return true;
25505 in0 = d->op0;
25506 in1 = d->op1;
25507 /* We don't need a big-endian lane correction for SVE; see the comment
25508 at the head of aarch64-sve.md for details. */
25509 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25511 std::swap (in0, in1);
25512 high = !high;
25514 out = d->target;
25516 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25517 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
25518 return true;
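/* Concretely, for V4SI (NELT = 4) the UZP recognizer above accepts
   { 0, 2, 4, 6 } (UZP1) and { 1, 3, 5, 7 } (UZP2), and the ZIP recognizer
   accepts { 0, 4, 1, 5 } (ZIP1) and { 2, 6, 3, 7 } (ZIP2).  An equivalent
   check for the ZIP case over a fully constant index array (illustrative
   sketch only):

     static bool
     is_zip (const int *perm, int nelt, bool high)
     {
       int base = high ? nelt / 2 : 0;
       for (int i = 0; i < nelt / 2; i++)
         if (perm[2 * i] != base + i || perm[2 * i + 1] != nelt + base + i)
           return false;
       return true;
     }  */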
25521 /* Recognize patterns for the EXT insn. */
25523 static bool
25524 aarch64_evpc_ext (struct expand_vec_perm_d *d)
25526 HOST_WIDE_INT location;
25527 rtx offset;
25529 /* The first element always refers to the first vector.
25530 Check if the extracted indices are increasing by one. */
25531 if ((d->vec_flags & VEC_SVE_PRED)
25532 || !d->perm[0].is_constant (&location)
25533 || !d->perm.series_p (0, 1, location, 1))
25534 return false;
25536 /* Success! */
25537 if (d->testing_p)
25538 return true;
25540 /* The case where (location == 0) is a no-op for both big- and little-endian,
25541 and is removed by the mid-end at optimization levels -O1 and higher.
25543 We don't need a big-endian lane correction for SVE; see the comment
25544 at the head of aarch64-sve.md for details. */
25545 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
25547 /* After setup, we want the high elements of the first vector (stored
25548 at the LSB end of the register), and the low elements of the second
25549 vector (stored at the MSB end of the register). So swap. */
25550 std::swap (d->op0, d->op1);
25551 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25552 to_constant () is safe since this is restricted to Advanced SIMD
25553 vectors. */
25554 location = d->perm.length ().to_constant () - location;
25557 offset = GEN_INT (location);
25558 emit_set_insn (d->target,
25559 gen_rtx_UNSPEC (d->vmode,
25560 gen_rtvec (3, d->op0, d->op1, offset),
25561 UNSPEC_EXT));
25562 return true;
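/* As a worked example of the EXT match above: for V4SI the permutation
   { 1, 2, 3, 4 } selects consecutive elements starting at LOCATION == 1
   (the last three elements of the first vector followed by the first
   element of the second), so it maps to a single EXT starting at element
   offset 1.  A sketch of the index test (illustrative only):

     static bool
     is_ext (const int *perm, int nelt, int *location)
     {
       for (int i = 1; i < nelt; i++)
         if (perm[i] != perm[0] + i)
           return false;
       *location = perm[0];
       return true;
     }  */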
25565 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25566 within each 64-bit, 32-bit or 16-bit granule. */
25568 static bool
25569 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
25571 HOST_WIDE_INT diff;
25572 unsigned int i, size, unspec;
25573 machine_mode pred_mode;
25575 if ((d->vec_flags & VEC_SVE_PRED)
25576 || !d->one_vector_p
25577 || !d->perm[0].is_constant (&diff)
25578 || !diff)
25579 return false;
25581 if (d->vec_flags & VEC_SVE_DATA)
25582 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
25583 else
25584 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
25585 if (size == 64)
25587 unspec = UNSPEC_REV64;
25588 pred_mode = VNx2BImode;
25590 else if (size == 32)
25592 unspec = UNSPEC_REV32;
25593 pred_mode = VNx4BImode;
25595 else if (size == 16)
25597 unspec = UNSPEC_REV16;
25598 pred_mode = VNx8BImode;
25600 else
25601 return false;
25603 unsigned int step = diff + 1;
25604 for (i = 0; i < step; ++i)
25605 if (!d->perm.series_p (i, step, diff - i, step))
25606 return false;
25608 /* Success! */
25609 if (d->testing_p)
25610 return true;
25612 if (d->vec_flags & VEC_SVE_DATA)
25614 rtx pred = aarch64_ptrue_reg (pred_mode);
25615 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
25616 d->target, pred, d->op0));
25617 return true;
25619 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
25620 emit_set_insn (d->target, src);
25621 return true;
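/* As a worked example of the match above: for V16QI the permutation
   { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 } has DIFF == 7,
   giving SIZE == 64, so it becomes REV64 reversing the bytes within each
   64-bit granule.  A sketch of the index pattern, with STEP = DIFF + 1
   elements per granule (illustrative only):

     static bool
     is_rev_local (const int *perm, int nelt, int diff)
     {
       int step = diff + 1;
       for (int i = 0; i < nelt; i++)
         if (perm[i] != (i - i % step) + (diff - i % step))
           return false;
       return true;
     }  */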
25624 /* Recognize patterns for the REV insn, which reverses elements within
25625 a full vector. */
25627 static bool
25628 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
25630 poly_uint64 nelt = d->perm.length ();
25632 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
25633 return false;
25635 if (!d->perm.series_p (0, 1, nelt - 1, -1))
25636 return false;
25638 /* Success! */
25639 if (d->testing_p)
25640 return true;
25642 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
25643 emit_set_insn (d->target, src);
25644 return true;
25647 static bool
25648 aarch64_evpc_dup (struct expand_vec_perm_d *d)
25650 rtx out = d->target;
25651 rtx in0;
25652 HOST_WIDE_INT elt;
25653 machine_mode vmode = d->vmode;
25654 rtx lane;
25656 if ((d->vec_flags & VEC_SVE_PRED)
25657 || d->perm.encoding ().encoded_nelts () != 1
25658 || !d->perm[0].is_constant (&elt))
25659 return false;
25661 if ((d->vec_flags & VEC_SVE_DATA)
25662 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
25663 return false;
25665 /* Success! */
25666 if (d->testing_p)
25667 return true;
25669 /* The generic preparation in aarch64_expand_vec_perm_const_1
25670 swaps the operand order and the permute indices if it finds
25671 d->perm[0] to be in the second operand. Thus, we can always
25672 use d->op0 and need not do any extra arithmetic to get the
25673 correct lane number. */
25674 in0 = d->op0;
25675 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
25677 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
25678 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
25679 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
25680 return true;
25683 static bool
25684 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
25686 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
25687 machine_mode vmode = d->vmode;
25689 /* Make sure that the indices are constant. */
25690 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
25691 for (unsigned int i = 0; i < encoded_nelts; ++i)
25692 if (!d->perm[i].is_constant ())
25693 return false;
25695 if (d->testing_p)
25696 return true;
25698 /* Generic code will try constant permutation twice: once with the
25699 original mode and again with the elements lowered to QImode.
25700 So wait and don't do the selector expansion ourselves. */
25701 if (vmode != V8QImode && vmode != V16QImode)
25702 return false;
25704 /* to_constant is safe since this routine is specific to Advanced SIMD
25705 vectors. */
25706 unsigned int nelt = d->perm.length ().to_constant ();
25707 for (unsigned int i = 0; i < nelt; ++i)
25708 /* If big-endian and using two vectors, we end up with a weird mixed-endian
25709 mode on NEON. Reverse the index within each word but not the word
25710 itself. to_constant is safe because we checked is_constant above. */
25711 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
25712 ? d->perm[i].to_constant () ^ (nelt - 1)
25713 : d->perm[i].to_constant ());
25715 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
25716 sel = force_reg (vmode, sel);
25718 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
25719 return true;
25722 /* Try to implement D using an SVE TBL instruction. */
25724 static bool
25725 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
25727 unsigned HOST_WIDE_INT nelt;
25729 /* Permuting two variable-length vectors could overflow the
25730 index range. */
25731 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
25732 return false;
25734 if (d->testing_p)
25735 return true;
25737 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
25738 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
25739 if (d->one_vector_p)
25740 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
25741 else
25742 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
25743 return true;
25746 /* Try to implement D using SVE dup instruction. */
25748 static bool
25749 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
25751 if (BYTES_BIG_ENDIAN
25752 || !d->one_vector_p
25753 || d->vec_flags != VEC_SVE_DATA
25754 || d->op_vec_flags != VEC_ADVSIMD
25755 || d->perm.encoding ().nelts_per_pattern () != 1
25756 || !known_eq (d->perm.encoding ().npatterns (),
25757 GET_MODE_NUNITS (d->op_mode))
25758 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
25759 return false;
25761 int npatterns = d->perm.encoding ().npatterns ();
25762 for (int i = 0; i < npatterns; i++)
25763 if (!known_eq (d->perm[i], i))
25764 return false;
25766 if (d->testing_p)
25767 return true;
25769 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
25770 return true;
25773 /* Try to implement D using SVE SEL instruction. */
25775 static bool
25776 aarch64_evpc_sel (struct expand_vec_perm_d *d)
25778 machine_mode vmode = d->vmode;
25779 int unit_size = GET_MODE_UNIT_SIZE (vmode);
25781 if (d->vec_flags != VEC_SVE_DATA
25782 || unit_size > 8)
25783 return false;
25785 int n_patterns = d->perm.encoding ().npatterns ();
25786 poly_int64 vec_len = d->perm.length ();
25788 for (int i = 0; i < n_patterns; ++i)
25789 if (!known_eq (d->perm[i], i)
25790 && !known_eq (d->perm[i], vec_len + i))
25791 return false;
25793 for (int i = n_patterns; i < n_patterns * 2; i++)
25794 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
25795 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
25796 return false;
25798 if (d->testing_p)
25799 return true;
25801 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
25803 /* Build a predicate that is true when op0 elements should be used. */
25804 rtx_vector_builder builder (pred_mode, n_patterns, 2);
25805 for (int i = 0; i < n_patterns * 2; i++)
25807 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
25808 : CONST0_RTX (BImode);
25809 builder.quick_push (elem);
25812 rtx const_vec = builder.build ();
25813 rtx pred = force_reg (pred_mode, const_vec);
25814 /* TARGET = PRED ? OP0 : OP1. */
25815 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
25816 return true;
25819 /* Recognize patterns suitable for the INS instructions. */
25820 static bool
25821 aarch64_evpc_ins (struct expand_vec_perm_d *d)
25823 machine_mode mode = d->vmode;
25824 unsigned HOST_WIDE_INT nelt;
25826 if (d->vec_flags != VEC_ADVSIMD)
25827 return false;
25829 /* to_constant is safe since this routine is specific to Advanced SIMD
25830 vectors. */
25831 nelt = d->perm.length ().to_constant ();
25832 rtx insv = d->op0;
25834 HOST_WIDE_INT idx = -1;
25836 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
25838 HOST_WIDE_INT elt;
25839 if (!d->perm[i].is_constant (&elt))
25840 return false;
25841 if (elt == (HOST_WIDE_INT) i)
25842 continue;
25843 if (idx != -1)
25845 idx = -1;
25846 break;
25848 idx = i;
25851 if (idx == -1)
25853 insv = d->op1;
25854 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
25856 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
25857 continue;
25858 if (idx != -1)
25859 return false;
25860 idx = i;
25863 if (idx == -1)
25864 return false;
25867 if (d->testing_p)
25868 return true;
25870 gcc_assert (idx != -1);
25872 unsigned extractindex = d->perm[idx].to_constant ();
25873 rtx extractv = d->op0;
25874 if (extractindex >= nelt)
25876 extractv = d->op1;
25877 extractindex -= nelt;
25879 gcc_assert (extractindex < nelt);
25881 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
25882 expand_operand ops[5];
25883 create_output_operand (&ops[0], d->target, mode);
25884 create_input_operand (&ops[1], insv, mode);
25885 create_integer_operand (&ops[2], 1 << idx);
25886 create_input_operand (&ops[3], extractv, mode);
25887 create_integer_operand (&ops[4], extractindex);
25888 expand_insn (icode, 5, ops);
25890 return true;
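/* As a worked example of the INS match above: for V4SI the permutation
   { 0, 1, 6, 3 } differs from the identity only in lane 2, where it selects
   element 6, i.e. lane 2 of the second operand, so it becomes a single INS
   of that lane into lane 2 of a copy of the first operand.  (The code above
   also handles the symmetric case where all but one lane come from the
   second operand.)  A sketch of the "exactly one mismatch" test
   (illustrative only):

     static int
     single_non_identity_lane (const int *perm, int nelt)
     {
       int idx = -1;
       for (int i = 0; i < nelt; i++)
         if (perm[i] != i)
           {
             if (idx != -1)
               return -1;  // more than one lane differs
             idx = i;
           }
       return idx;
     }  */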
25893 static bool
25894 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
25896 gcc_assert (d->op_mode != E_VOIDmode);
25898 /* The pattern matching functions above are written to look for a small
25899 number to begin the sequence (0, 1, N/2). If we begin with an index
25900 from the second operand, we can swap the operands. */
25901 poly_int64 nelt = d->perm.length ();
25902 if (known_ge (d->perm[0], nelt))
25904 d->perm.rotate_inputs (1);
25905 std::swap (d->op0, d->op1);
25908 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
25909 || d->vec_flags == VEC_SVE_DATA
25910 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
25911 || d->vec_flags == VEC_SVE_PRED)
25912 && known_gt (nelt, 1))
25914 if (d->vmode == d->op_mode)
25916 if (aarch64_evpc_rev_local (d))
25917 return true;
25918 else if (aarch64_evpc_rev_global (d))
25919 return true;
25920 else if (aarch64_evpc_ext (d))
25921 return true;
25922 else if (aarch64_evpc_dup (d))
25923 return true;
25924 else if (aarch64_evpc_zip (d))
25925 return true;
25926 else if (aarch64_evpc_uzp (d))
25927 return true;
25928 else if (aarch64_evpc_trn (d))
25929 return true;
25930 else if (aarch64_evpc_sel (d))
25931 return true;
25932 else if (aarch64_evpc_ins (d))
25933 return true;
25934 else if (aarch64_evpc_reencode (d))
25935 return true;
25937 if (d->vec_flags == VEC_SVE_DATA)
25938 return aarch64_evpc_sve_tbl (d);
25939 else if (d->vec_flags == VEC_ADVSIMD)
25940 return aarch64_evpc_tbl (d);
25942 else
25944 if (aarch64_evpc_sve_dup (d))
25945 return true;
25948 return false;
25951 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
25953 static bool
25954 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
25955 rtx target, rtx op0, rtx op1,
25956 const vec_perm_indices &sel)
25958 struct expand_vec_perm_d d;
25960 /* Check whether the mask can be applied to a single vector. */
25961 if (sel.ninputs () == 1
25962 || (op0 && rtx_equal_p (op0, op1)))
25963 d.one_vector_p = true;
25964 else if (sel.all_from_input_p (0))
25966 d.one_vector_p = true;
25967 op1 = op0;
25969 else if (sel.all_from_input_p (1))
25971 d.one_vector_p = true;
25972 op0 = op1;
25974 else
25975 d.one_vector_p = false;
25977 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
25978 sel.nelts_per_input ());
25979 d.vmode = vmode;
25980 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
25981 d.op_mode = op_mode;
25982 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
25983 d.target = target;
25984 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
25985 if (op0 == op1)
25986 d.op1 = d.op0;
25987 else
25988 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
25989 d.testing_p = !target;
25991 if (!d.testing_p)
25992 return aarch64_expand_vec_perm_const_1 (&d);
25994 rtx_insn *last = get_last_insn ();
25995 bool ret = aarch64_expand_vec_perm_const_1 (&d);
25996 gcc_assert (last == get_last_insn ());
25998 return ret;
26000 /* Generate a byte permute mask for a register of mode MODE,
26001 which has NUNITS units. */
26004 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
26006 /* We have to reverse each vector because we don't have
26007 a permuted load that can reverse-load according to ABI rules. */
26008 rtx mask;
26009 rtvec v = rtvec_alloc (16);
26010 unsigned int i, j;
26011 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
26013 gcc_assert (BYTES_BIG_ENDIAN);
26014 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
26016 for (i = 0; i < nunits; i++)
26017 for (j = 0; j < usize; j++)
26018 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
26019 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
26020 return force_reg (V16QImode, mask);
26023 /* Expand an SVE integer comparison using the SVE equivalent of:
26025 (set TARGET (CODE OP0 OP1)). */
26027 void
26028 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
26030 machine_mode pred_mode = GET_MODE (target);
26031 machine_mode data_mode = GET_MODE (op0);
26032 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
26033 op0, op1);
26034 if (!rtx_equal_p (target, res))
26035 emit_move_insn (target, res);
26038 /* Return the UNSPEC_COND_* code for comparison CODE. */
26040 static unsigned int
26041 aarch64_unspec_cond_code (rtx_code code)
26043 switch (code)
26045 case NE:
26046 return UNSPEC_COND_FCMNE;
26047 case EQ:
26048 return UNSPEC_COND_FCMEQ;
26049 case LT:
26050 return UNSPEC_COND_FCMLT;
26051 case GT:
26052 return UNSPEC_COND_FCMGT;
26053 case LE:
26054 return UNSPEC_COND_FCMLE;
26055 case GE:
26056 return UNSPEC_COND_FCMGE;
26057 case UNORDERED:
26058 return UNSPEC_COND_FCMUO;
26059 default:
26060 gcc_unreachable ();
26064 /* Emit:
26066 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26068 where <X> is the operation associated with comparison CODE.
26069 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26071 static void
26072 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
26073 bool known_ptrue_p, rtx op0, rtx op1)
26075 rtx flag = gen_int_mode (known_ptrue_p, SImode);
26076 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
26077 gen_rtvec (4, pred, flag, op0, op1),
26078 aarch64_unspec_cond_code (code));
26079 emit_set_insn (target, unspec);
26082 /* Emit the SVE equivalent of:
26084 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26085 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26086 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26088 where <Xi> is the operation associated with comparison CODEi.
26089 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26091 static void
26092 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
26093 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
26095 machine_mode pred_mode = GET_MODE (pred);
26096 rtx tmp1 = gen_reg_rtx (pred_mode);
26097 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
26098 rtx tmp2 = gen_reg_rtx (pred_mode);
26099 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
26100 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
26103 /* Emit the SVE equivalent of:
26105 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26106 (set TARGET (not TMP))
26108 where <X> is the operation associated with comparison CODE.
26109 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26111 static void
26112 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
26113 bool known_ptrue_p, rtx op0, rtx op1)
26115 machine_mode pred_mode = GET_MODE (pred);
26116 rtx tmp = gen_reg_rtx (pred_mode);
26117 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
26118 aarch64_emit_unop (target, one_cmpl_optab, tmp);
26121 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26123 (set TARGET (CODE OP0 OP1))
26125 If CAN_INVERT_P is true, the caller can also handle inverted results;
26126 return true if the result is in fact inverted. */
26128 bool
26129 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
26130 rtx op0, rtx op1, bool can_invert_p)
26132 machine_mode pred_mode = GET_MODE (target);
26133 machine_mode data_mode = GET_MODE (op0);
26135 rtx ptrue = aarch64_ptrue_reg (pred_mode);
26136 switch (code)
26138 case UNORDERED:
26139 /* UNORDERED has no immediate form. */
26140 op1 = force_reg (data_mode, op1);
26141 /* fall through */
26142 case LT:
26143 case LE:
26144 case GT:
26145 case GE:
26146 case EQ:
26147 case NE:
26149 /* There is native support for the comparison. */
26150 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26151 return false;
26154 case LTGT:
26155 /* This is a trapping operation (LT or GT). */
26156 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
26157 return false;
26159 case UNEQ:
26160 if (!flag_trapping_math)
26162 /* This would trap for signaling NaNs. */
26163 op1 = force_reg (data_mode, op1);
26164 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
26165 ptrue, true, op0, op1);
26166 return false;
26168 /* fall through */
26169 case UNLT:
26170 case UNLE:
26171 case UNGT:
26172 case UNGE:
26173 if (flag_trapping_math)
26175 /* Work out which elements are ordered. */
26176 rtx ordered = gen_reg_rtx (pred_mode);
26177 op1 = force_reg (data_mode, op1);
26178 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
26179 ptrue, true, op0, op1);
26181 /* Test the opposite condition for the ordered elements,
26182 then invert the result. */
26183 if (code == UNEQ)
26184 code = NE;
26185 else
26186 code = reverse_condition_maybe_unordered (code);
26187 if (can_invert_p)
26189 aarch64_emit_sve_fp_cond (target, code,
26190 ordered, false, op0, op1);
26191 return true;
26193 aarch64_emit_sve_invert_fp_cond (target, code,
26194 ordered, false, op0, op1);
26195 return false;
26197 break;
26199 case ORDERED:
26200 /* ORDERED has no immediate form. */
26201 op1 = force_reg (data_mode, op1);
26202 break;
26204 default:
26205 gcc_unreachable ();
26208 /* There is native support for the inverse comparison. */
26209 code = reverse_condition_maybe_unordered (code);
26210 if (can_invert_p)
26212 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26213 return true;
26215 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
26216 return false;
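/* As a worked example of the trapping-math path above: UNGE (x, y) means
   "unordered or x >= y".  The expansion first computes an ORDERED predicate,
   then tests the reversed condition (LT for UNGE) on the ordered elements
   only and takes the inverse, using UNGE (x, y) == !(ordered && x < y).
   A scalar model (illustrative only):

     #include <cmath>

     static bool
     unge (double x, double y)
     {
       bool ordered = !(std::isnan (x) || std::isnan (y));
       return !(ordered && x < y);
     }  */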
26219 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26220 of the data being selected and CMP_MODE is the mode of the values being
26221 compared. */
26223 void
26224 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
26225 rtx *ops)
26227 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
26228 rtx pred = gen_reg_rtx (pred_mode);
26229 if (FLOAT_MODE_P (cmp_mode))
26231 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
26232 ops[4], ops[5], true))
26233 std::swap (ops[1], ops[2]);
26235 else
26236 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
26238 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
26239 ops[1] = force_reg (data_mode, ops[1]);
26240 /* The "false" value can only be zero if the "true" value is a constant. */
26241 if (register_operand (ops[1], data_mode)
26242 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
26243 ops[2] = force_reg (data_mode, ops[2]);
26245 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
26246 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
26249 /* Return true if:
26251 (a) MODE1 and MODE2 use the same layout for bytes that are common
26252 to both modes;
26254 (b) subregs involving the two modes behave as the target-independent
26255 subreg rules require; and
26257 (c) there is at least one register that can hold both modes.
26259 Return false otherwise. */
26261 static bool
26262 aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
26264 unsigned int flags1 = aarch64_classify_vector_mode (mode1);
26265 unsigned int flags2 = aarch64_classify_vector_mode (mode2);
26267 bool sve1_p = (flags1 & VEC_ANY_SVE);
26268 bool sve2_p = (flags2 & VEC_ANY_SVE);
26270 bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
26271 bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
26273 bool pred1_p = (flags1 & VEC_SVE_PRED);
26274 bool pred2_p = (flags2 & VEC_SVE_PRED);
26276 bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
26277 | VEC_PARTIAL));
26278 bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
26279 | VEC_PARTIAL));
26281 /* Don't allow changes between predicate modes and other modes.
26282 Only predicate registers can hold predicate modes and only
26283 non-predicate registers can hold non-predicate modes, so any
26284 attempt to mix them would require a round trip through memory. */
26285 if (pred1_p != pred2_p)
26286 return false;
26288 /* The contents of partial SVE modes are distributed evenly across
26289 the register, whereas GCC expects them to be clustered together.
26290 We therefore need to be careful about mode changes involving them. */
26291 if (partial_sve1_p && partial_sve2_p)
26293 /* Reject changes between partial SVE modes that have different
26294 patterns of significant and insignificant bits. */
26295 if ((aarch64_sve_container_bits (mode1)
26296 != aarch64_sve_container_bits (mode2))
26297 || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26298 return false;
26300 else if (partial_sve1_p)
26302 /* The first lane of MODE1 is where GCC expects it, but anything
26303 bigger than that is not. */
26304 if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
26305 return false;
26307 else if (partial_sve2_p)
26309 /* Similarly in reverse. */
26310 if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
26311 return false;
26314 /* Don't allow changes between partial Advanced SIMD structure modes
26315 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26316 are the same size, but the former occupies one Q register while the
26317 latter occupies two D registers. */
26318 if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
26319 && maybe_gt (GET_MODE_SIZE (mode1), 8)
26320 && maybe_gt (GET_MODE_SIZE (mode2), 8))
26321 return false;
26323 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26325 /* Don't allow changes between SVE modes and other modes that might
26326 be bigger than 128 bits. In particular, OImode, CImode and XImode
26327 divide into 128-bit quantities while SVE modes divide into
26328 BITS_PER_SVE_VECTOR quantities. */
26329 if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
26330 return false;
26331 if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
26332 return false;
26335 if (BYTES_BIG_ENDIAN)
26337 /* Don't allow changes between SVE data modes and non-SVE modes.
26338 See the comment at the head of aarch64-sve.md for details. */
26339 if (sve1_p != sve2_p)
26340 return false;
26342 /* Don't allow changes in element size: lane 0 of the new vector
26343 would not then be lane 0 of the old vector. See the comment
26344 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26345 description.
26347 In the worst case, this forces a register to be spilled in
26348 one mode and reloaded in the other, which handles the
26349 endianness correctly. */
26350 if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26351 return false;
26353 return true;
26356 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
26357 to aarch64_modes_compatible_p. However due to issues with register
26358 allocation it is preferable to avoid tying integer scalar and FP
26359 scalar modes. Executing integer operations in general registers is
26360 better than treating them as scalar vector operations. This reduces
26361 latency and avoids redundant int<->FP moves. So tie modes if they
26362 are in the same class, or if one of them is a vector mode. */
26364 static bool
26365 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
26367 if (aarch64_modes_compatible_p (mode1, mode2))
26369 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
26370 return true;
26371 if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
26372 return true;
26374 return false;
26377 /* Return a new RTX holding the result of moving POINTER forward by
26378 AMOUNT bytes. */
26380 static rtx
26381 aarch64_move_pointer (rtx pointer, poly_int64 amount)
26383 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
26385 return adjust_automodify_address (pointer, GET_MODE (pointer),
26386 next, amount);
26389 /* Return a new RTX holding the result of moving POINTER forward by the
26390 size of the mode it points to. */
26392 static rtx
26393 aarch64_progress_pointer (rtx pointer)
26395 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
26398 typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
26400 /* Copy one block of size MODE from SRC to DST at offset OFFSET. */
26401 static void
26402 aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
26403 int offset, machine_mode mode)
26405 /* Emit explicit load/store pair instructions for 32-byte copies. */
26406 if (known_eq (GET_MODE_SIZE (mode), 32))
26408 mode = V4SImode;
26409 rtx src1 = adjust_address (src, mode, offset);
26410 rtx dst1 = adjust_address (dst, mode, offset);
26411 rtx reg1 = gen_reg_rtx (mode);
26412 rtx reg2 = gen_reg_rtx (mode);
26413 rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
26414 rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
26415 ops.safe_push ({ load, store });
26416 return;
26419 rtx reg = gen_reg_rtx (mode);
26420 rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
26421 rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
26422 ops.safe_push ({ load, store });
26425 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
26426 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
26427 rather than memcpy. Return true iff we succeeded. */
26428 bool
26429 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
26431 if (!TARGET_MOPS)
26432 return false;
26434 /* All three registers are changed by the instruction, so each one
26435 must be a fresh pseudo. */
26436 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26437 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
26438 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26439 rtx src_mem = replace_equiv_address (operands[1], src_addr);
26440 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
26441 if (is_memmove)
26442 emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
26443 else
26444 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
26445 return true;
26448 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26449 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26450 if this is a memmove rather than memcpy. Return true if we succeed,
26451 otherwise return false, indicating that a libcall should be emitted. */
26452 bool
26453 aarch64_expand_cpymem (rtx *operands, bool is_memmove)
26455 int mode_bytes;
26456 rtx dst = operands[0];
26457 rtx src = operands[1];
26458 unsigned align = UINTVAL (operands[3]);
26459 rtx base;
26460 machine_mode cur_mode = BLKmode, next_mode;
26462 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26463 if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
26464 return aarch64_expand_cpymem_mops (operands, is_memmove);
26466 unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
26467 bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
26468 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
26470 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26471 unsigned max_copy_size = use_ldpq ? 256 : 128;
26472 unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
26473 : aarch64_mops_memcpy_size_threshold;
26475 /* Reduce the maximum size with -Os. */
26476 if (optimize_function_for_size_p (cfun))
26477 max_copy_size /= 4;
26479 /* Large copies use MOPS when available, otherwise a library call. */
26480 if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
26481 return aarch64_expand_cpymem_mops (operands, is_memmove);
26483 unsigned copy_max = 32;
26485 /* Default to 32-byte LDP/STP on large copies; however, small copies, lack of
26486 SIMD support, or slow LDP/STP fall back to 16-byte chunks.
26488 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26489 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26490 whether that would improve performance. */
26491 if (size <= 24 || !use_ldpq)
26492 copy_max = 16;
26494 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26495 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26497 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
26498 src = adjust_automodify_address (src, VOIDmode, base, 0);
26500 copy_ops ops;
26501 int offset = 0;
26503 while (size > 0)
26505 /* Find the largest mode in which to do the copy without over-reading
26506 or over-writing. */
26507 opt_scalar_int_mode mode_iter;
26508 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26509 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
26510 cur_mode = mode_iter.require ();
26512 gcc_assert (cur_mode != BLKmode);
26514 mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
26516 /* Prefer Q-register accesses for the last bytes. */
26517 if (mode_bytes == 16 && copy_max == 32)
26518 cur_mode = V4SImode;
26519 aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
26520 size -= mode_bytes;
26521 offset += mode_bytes;
26523 /* Emit trailing copies using overlapping unaligned accesses
26524 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26525 if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
26527 next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
26528 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26529 gcc_assert (n_bytes <= mode_bytes);
26530 offset -= n_bytes - size;
26531 size = n_bytes;
26535 /* Memcpy interleaves loads with stores; memmove emits all loads first. */
26536 int nops = ops.length();
26537 int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
26539 for (int i = 0; i < nops; i += inc)
26541 int m = MIN (nops, i + inc);
26542 /* Emit loads. */
26543 for (int j = i; j < m; j++)
26544 emit_insn (ops[j].first);
26545 /* Emit stores. */
26546 for (int j = i; j < m; j++)
26547 emit_insn (ops[j].second);
26549 return true;
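/* As a worked example of the trailing-copy handling above: a 21-byte copy
   with COPY_MAX == 16 first copies one 16-byte (TImode) block, leaving 5
   bytes.  Since 5 < COPY_MAX / 2 and !STRICT_ALIGNMENT, the remainder is
   widened to the next power-of-two size (8 bytes, DImode) and the offset is
   pulled back by 3, so the copy finishes with an 8-byte access at offset 13
   that overlaps 3 already-copied bytes instead of issuing several narrow
   accesses.  A sketch of the adjustment (illustrative only):

     static void
     tail_adjust (int *offset, int *size)
     {
       int mode_bytes = 1;
       while (mode_bytes < *size)  // smallest power-of-two size covering it
         mode_bytes *= 2;
       *offset -= mode_bytes - *size;
       *size = mode_bytes;
     }  */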
26552 /* Set one block of mode MODE at *DST and advance *DST, where SRC is a
26553 register we have created with the duplicated value to be set. */
26554 static void
26555 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
26556 machine_mode mode)
26558 /* If we are copying 128 bits or 256 bits, we can do that straight from
26559 the SIMD register we prepared. */
26560 if (known_eq (GET_MODE_BITSIZE (mode), 256))
26562 mode = GET_MODE (src);
26563 /* "Cast" the *dst to the correct mode. */
26564 *dst = adjust_address (*dst, mode, 0);
26565 /* Emit the memset. */
26566 emit_insn (aarch64_gen_store_pair (*dst, src, src));
26568 /* Move the pointers forward. */
26569 *dst = aarch64_move_pointer (*dst, 32);
26570 return;
26572 if (known_eq (GET_MODE_BITSIZE (mode), 128))
26574 /* "Cast" the *dst to the correct mode. */
26575 *dst = adjust_address (*dst, GET_MODE (src), 0);
26576 /* Emit the memset. */
26577 emit_move_insn (*dst, src);
26578 /* Move the pointers forward. */
26579 *dst = aarch64_move_pointer (*dst, 16);
26580 return;
26582 /* For copying less, we have to extract the right amount from src. */
26583 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
26585 /* "Cast" the *dst to the correct mode. */
26586 *dst = adjust_address (*dst, mode, 0);
26587 /* Emit the memset. */
26588 emit_move_insn (*dst, reg);
26589 /* Move the pointer forward. */
26590 *dst = aarch64_progress_pointer (*dst);
26593 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
26594 as for the setmem pattern. Return true iff we succeed. */
26595 static bool
26596 aarch64_expand_setmem_mops (rtx *operands)
26598 if (!TARGET_MOPS)
26599 return false;
26601 /* The first two registers are changed by the instruction, so both
26602 of them must be a fresh pseudo. */
26603 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26604 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26605 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
26606 rtx val = operands[2];
26607 if (val != CONST0_RTX (QImode))
26608 val = force_reg (QImode, val);
26609 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
26610 return true;
26613 /* Expand setmem, as if from a __builtin_memset. Return true if
26614 we succeed, otherwise return false. */
26616 bool
26617 aarch64_expand_setmem (rtx *operands)
26619 int n, mode_bits;
26620 unsigned HOST_WIDE_INT len;
26621 rtx dst = operands[0];
26622 rtx val = operands[2], src;
26623 unsigned align = UINTVAL (operands[3]);
26624 rtx base;
26625 machine_mode cur_mode = BLKmode, next_mode;
26627 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26628 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
26629 || (STRICT_ALIGNMENT && align < 16))
26630 return aarch64_expand_setmem_mops (operands);
26632 bool size_p = optimize_function_for_size_p (cfun);
26634 /* Default the maximum to 256 bytes when considering only a libcall vs
26635 the SIMD broadcast sequence. */
26636 unsigned max_set_size = 256;
26637 unsigned mops_threshold = aarch64_mops_memset_size_threshold;
26639 len = UINTVAL (operands[1]);
26641 /* Large memset uses MOPS when available, otherwise a library call. */
26642 if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
26643 return aarch64_expand_setmem_mops (operands);
26645 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
26646 /* The MOPS sequence takes:
26647 3 instructions for the memory storing
26648 + 1 to move the constant size into a reg
26649 + 1 if VAL is a non-zero constant to move into a reg
26650 (zero constants can use XZR directly). */
26651 unsigned mops_cost = 3 + 1 + cst_val;
26652 /* A libcall to memset in the worst case takes 3 instructions to prepare
26653 the arguments + 1 for the call. */
26654 unsigned libcall_cost = 4;
26656 /* Attempt a sequence with a vector broadcast followed by stores.
26657 Count the number of operations involved to see if it's worth it
26658 against the alternatives. A simple counter simd_ops on the
26659 algorithmically-relevant operations is used rather than an rtx_insn count
26660 as all the pointer adjustments and mode reinterprets will be optimized
26661 away later. */
26662 start_sequence ();
26663 unsigned simd_ops = 0;
26665 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26666 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26668 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26669 src = expand_vector_broadcast (V16QImode, val);
26670 src = force_reg (V16QImode, src);
26671 simd_ops++;
26672 /* Convert len to bits to make the rest of the code simpler. */
26673 n = len * BITS_PER_UNIT;
26675 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
26676 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
26677 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
26678 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
26679 ? GET_MODE_BITSIZE (TImode) : 256;
26681 while (n > 0)
26683 /* Find the largest mode in which to do the copy without
26684 overwriting. */
26685 opt_scalar_int_mode mode_iter;
26686 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26687 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
26688 cur_mode = mode_iter.require ();
26690 gcc_assert (cur_mode != BLKmode);
26692 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
26693 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
26694 simd_ops++;
26695 n -= mode_bits;
26697 /* Emit trailing writes using overlapping unaligned accesses
26698 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26699 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
26701 next_mode = smallest_mode_for_size (n, MODE_INT);
26702 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
26703 gcc_assert (n_bits <= mode_bits);
26704 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
26705 n = n_bits;
26708 rtx_insn *seq = get_insns ();
26709 end_sequence ();
26711 if (size_p)
26713 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
26714 call to memset or the MOPS expansion. */
26715 if (TARGET_MOPS
26716 && mops_cost <= libcall_cost
26717 && mops_cost <= simd_ops)
26718 return aarch64_expand_setmem_mops (operands);
26719       /* If MOPS is not available or not shorter, pick a libcall if the SIMD
26720 	 sequence is too long.  */
26721 else if (libcall_cost < simd_ops)
26722 return false;
26723 emit_insn (seq);
26724 return true;
26727 /* At this point the SIMD broadcast sequence is the best choice when
26728 optimizing for speed. */
26729 emit_insn (seq);
26730 return true;
26734 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
26735 SImode stores. Handle the case when the constant has identical
26736 bottom and top halves. This is beneficial when the two stores can be
26737 merged into an STP and we avoid synthesising potentially expensive
26738 immediates twice. Return true if such a split is possible. */
26740 bool
26741 aarch64_split_dimode_const_store (rtx dst, rtx src)
26743 rtx lo = gen_lowpart (SImode, src);
26744 rtx hi = gen_highpart_mode (SImode, DImode, src);
26746 if (!rtx_equal_p (lo, hi))
26747 return false;
26749 unsigned int orig_cost
26750 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
26751 unsigned int lo_cost
26752 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
26754 /* We want to transform:
26755 MOV x1, 49370
26756 MOVK x1, 0x140, lsl 16
26757 MOVK x1, 0xc0da, lsl 32
26758 MOVK x1, 0x140, lsl 48
26759 STR x1, [x0]
26760 into:
26761 MOV w1, 49370
26762 MOVK w1, 0x140, lsl 16
26763 STP w1, w1, [x0]
26764 So we want to perform this when we save at least one instruction. */
26765 if (orig_cost <= lo_cost)
26766 return false;
26768 rtx mem_lo = adjust_address (dst, SImode, 0);
26769 if (!aarch64_mem_pair_operand (mem_lo, SImode))
26770 return false;
26772 rtx tmp_reg = gen_reg_rtx (SImode);
26773 aarch64_expand_mov_immediate (tmp_reg, lo);
26774 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
26775   /* Don't emit an explicit store pair as this may not always be profitable.
26776 Let the sched-fusion logic decide whether to merge them. */
26777 emit_move_insn (mem_lo, tmp_reg);
26778 emit_move_insn (mem_hi, tmp_reg);
26780 return true;
26783 /* Generate RTL for a conditional branch with rtx comparison CODE in
26784 mode CC_MODE. The destination of the unlikely conditional branch
26785 is LABEL_REF. */
26787 void
26788 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
26789 rtx label_ref)
26791 rtx x;
26792 x = gen_rtx_fmt_ee (code, VOIDmode,
26793 gen_rtx_REG (cc_mode, CC_REGNUM),
26794 const0_rtx);
26796 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
26797 gen_rtx_LABEL_REF (VOIDmode, label_ref),
26798 pc_rtx);
26799 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
26802 /* Generate DImode scratch registers for 128-bit (TImode) addition.
26804 OP1 represents the TImode destination operand 1
26805 OP2 represents the TImode destination operand 2
26806 LOW_DEST represents the low half (DImode) of TImode operand 0
26807 LOW_IN1 represents the low half (DImode) of TImode operand 1
26808 LOW_IN2 represents the low half (DImode) of TImode operand 2
26809 HIGH_DEST represents the high half (DImode) of TImode operand 0
26810 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26811 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26813 void
26814 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26815 rtx *low_in1, rtx *low_in2,
26816 rtx *high_dest, rtx *high_in1,
26817 rtx *high_in2)
26819 *low_dest = gen_reg_rtx (DImode);
26820 *low_in1 = gen_lowpart (DImode, op1);
26821 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
26822 subreg_lowpart_offset (DImode, TImode));
26823 *high_dest = gen_reg_rtx (DImode);
26824 *high_in1 = gen_highpart (DImode, op1);
26825 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
26826 subreg_highpart_offset (DImode, TImode));
26829 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
26831    This function differs from 'aarch64_addti_scratch_regs' in that
26832    OP1 can be an immediate constant (zero).  We must call
26833    subreg_highpart_offset with DImode and TImode arguments, otherwise
26834    VOIDmode will be used for the const_int, which generates an internal
26835    error from subreg_size_highpart_offset, which does not expect a size of zero.
26837 OP1 represents the TImode destination operand 1
26838 OP2 represents the TImode destination operand 2
26839 LOW_DEST represents the low half (DImode) of TImode operand 0
26840 LOW_IN1 represents the low half (DImode) of TImode operand 1
26841 LOW_IN2 represents the low half (DImode) of TImode operand 2
26842 HIGH_DEST represents the high half (DImode) of TImode operand 0
26843 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26844 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26847 void
26848 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26849 rtx *low_in1, rtx *low_in2,
26850 rtx *high_dest, rtx *high_in1,
26851 rtx *high_in2)
26853 *low_dest = gen_reg_rtx (DImode);
26854 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
26855 subreg_lowpart_offset (DImode, TImode));
26857 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
26858 subreg_lowpart_offset (DImode, TImode));
26859 *high_dest = gen_reg_rtx (DImode);
26861 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
26862 subreg_highpart_offset (DImode, TImode));
26863 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
26864 subreg_highpart_offset (DImode, TImode));
26867 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
26869 OP0 represents the TImode destination operand 0
26870 LOW_DEST represents the low half (DImode) of TImode operand 0
26871 LOW_IN1 represents the low half (DImode) of TImode operand 1
26872 LOW_IN2 represents the low half (DImode) of TImode operand 2
26873 HIGH_DEST represents the high half (DImode) of TImode operand 0
26874 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26875 HIGH_IN2 represents the high half (DImode) of TImode operand 2
26876 UNSIGNED_P is true if the operation is being performed on unsigned
26877 values. */
26878 void
26879 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
26880 rtx low_in2, rtx high_dest, rtx high_in1,
26881 rtx high_in2, bool unsigned_p)
26883 if (low_in2 == const0_rtx)
26885 low_dest = low_in1;
26886 high_in2 = force_reg (DImode, high_in2);
26887 if (unsigned_p)
26888 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
26889 else
26890 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
26892 else
26894 if (aarch64_plus_immediate (low_in2, DImode))
26895 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
26896 GEN_INT (-UINTVAL (low_in2))));
26897 else
26899 low_in2 = force_reg (DImode, low_in2);
26900 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
26902 high_in2 = force_reg (DImode, high_in2);
26904 if (unsigned_p)
26905 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
26906 else
26907 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
26910 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
26911 emit_move_insn (gen_highpart (DImode, op0), high_dest);
26915 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
26917 static unsigned HOST_WIDE_INT
26918 aarch64_asan_shadow_offset (void)
26920 if (TARGET_ILP32)
26921 return (HOST_WIDE_INT_1 << 29);
26922 else
26923 return (HOST_WIDE_INT_1 << 36);
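/* For reference, ASan computes a shadow address as
     shadow = (addr >> ASAN_SHADOW_SHIFT) + offset
   with ASAN_SHADOW_SHIFT == 3, so with the LP64 offset above the shadow of
   address A lives at (A >> 3) + (1 << 36).  */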
26926 static rtx
26927 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
26928 rtx_code code, tree treeop0, tree treeop1)
26930 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
26931 rtx op0, op1;
26932 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
26933 insn_code icode;
26934 struct expand_operand ops[4];
26936 start_sequence ();
26937 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
26939 op_mode = GET_MODE (op0);
26940 if (op_mode == VOIDmode)
26941 op_mode = GET_MODE (op1);
26943 switch (op_mode)
26945 case E_QImode:
26946 case E_HImode:
26947 case E_SImode:
26948 cmp_mode = SImode;
26949 icode = CODE_FOR_cmpsi;
26950 break;
26952 case E_DImode:
26953 cmp_mode = DImode;
26954 icode = CODE_FOR_cmpdi;
26955 break;
26957 case E_SFmode:
26958 cmp_mode = SFmode;
26959 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
26960 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
26961 break;
26963 case E_DFmode:
26964 cmp_mode = DFmode;
26965 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
26966 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
26967 break;
26969 default:
26970 end_sequence ();
26971 return NULL_RTX;
26974 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
26975 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
26976 if (!op0 || !op1)
26978 end_sequence ();
26979 return NULL_RTX;
26981 *prep_seq = get_insns ();
26982 end_sequence ();
26984 create_fixed_operand (&ops[0], op0);
26985 create_fixed_operand (&ops[1], op1);
26987 start_sequence ();
26988 if (!maybe_expand_insn (icode, 2, ops))
26990 end_sequence ();
26991 return NULL_RTX;
26993 *gen_seq = get_insns ();
26994 end_sequence ();
26996 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
26997 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
27000 static rtx
27001 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27002 rtx_code cmp_code, tree treeop0, tree treeop1,
27003 rtx_code bit_code)
27005 rtx op0, op1, target;
27006 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27007 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27008 insn_code icode;
27009 struct expand_operand ops[6];
27010 int aarch64_cond;
27012 push_to_sequence (*prep_seq);
27013 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27015 op_mode = GET_MODE (op0);
27016 if (op_mode == VOIDmode)
27017 op_mode = GET_MODE (op1);
27019 switch (op_mode)
27021 case E_QImode:
27022 case E_HImode:
27023 case E_SImode:
27024 cmp_mode = SImode;
27025 break;
27027 case E_DImode:
27028 cmp_mode = DImode;
27029 break;
27031 case E_SFmode:
27032 cmp_mode = SFmode;
27033 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27034 break;
27036 case E_DFmode:
27037 cmp_mode = DFmode;
27038 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27039 break;
27041 default:
27042 end_sequence ();
27043 return NULL_RTX;
27046 icode = code_for_ccmp (cc_mode, cmp_mode);
27048 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27049 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27050 if (!op0 || !op1)
27052 end_sequence ();
27053 return NULL_RTX;
27055 *prep_seq = get_insns ();
27056 end_sequence ();
27058 target = gen_rtx_REG (cc_mode, CC_REGNUM);
27059 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
27061 if (bit_code != AND)
27063 /* Treat the ccmp patterns as canonical and use them where possible,
27064 but fall back to ccmp_rev patterns if there's no other option. */
27065 rtx_code prev_code = GET_CODE (prev);
27066 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
27067 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
27068 && !(prev_code == EQ
27069 || prev_code == NE
27070 || prev_code == ORDERED
27071 || prev_code == UNORDERED))
27072 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
27073 else
27075 rtx_code code = reverse_condition (prev_code);
27076 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
27078 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
27081 create_fixed_operand (&ops[0], XEXP (prev, 0));
27082 create_fixed_operand (&ops[1], target);
27083 create_fixed_operand (&ops[2], op0);
27084 create_fixed_operand (&ops[3], op1);
27085 create_fixed_operand (&ops[4], prev);
27086 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
27088 push_to_sequence (*gen_seq);
27089 if (!maybe_expand_insn (icode, 6, ops))
27091 end_sequence ();
27092 return NULL_RTX;
27095 *gen_seq = get_insns ();
27096 end_sequence ();
27098 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
27101 #undef TARGET_GEN_CCMP_FIRST
27102 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27104 #undef TARGET_GEN_CCMP_NEXT
27105 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27107 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27108 instruction fusion of some sort. */
27110 static bool
27111 aarch64_macro_fusion_p (void)
27113 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
27117 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27118 should be kept together during scheduling. */
27120 static bool
27121 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
27123 rtx set_dest;
27124 rtx prev_set = single_set (prev);
27125 rtx curr_set = single_set (curr);
27126 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27127 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
27129 if (!aarch64_macro_fusion_p ())
27130 return false;
27132 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
27134 /* We are trying to match:
27135 prev (mov) == (set (reg r0) (const_int imm16))
27136 curr (movk) == (set (zero_extract (reg r0)
27137 (const_int 16)
27138 (const_int 16))
27139 (const_int imm16_1)) */
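      /* In assembly terms this keeps sequences such as
	   mov  w1, #0x1234
	   movk w1, #0x5678, lsl 16
	 adjacent so that cores which fuse MOV/MOVK can treat them as one
	 unit (an illustrative example; the immediates are arbitrary).  */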
27141 set_dest = SET_DEST (curr_set);
27143 if (GET_CODE (set_dest) == ZERO_EXTRACT
27144 && CONST_INT_P (SET_SRC (curr_set))
27145 && CONST_INT_P (SET_SRC (prev_set))
27146 && CONST_INT_P (XEXP (set_dest, 2))
27147 && INTVAL (XEXP (set_dest, 2)) == 16
27148 && REG_P (XEXP (set_dest, 0))
27149 && REG_P (SET_DEST (prev_set))
27150 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
27152 return true;
27156 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
27159 /* We're trying to match:
27160 prev (adrp) == (set (reg r1)
27161 (high (symbol_ref ("SYM"))))
27162 curr (add) == (set (reg r0)
27163 (lo_sum (reg r1)
27164 (symbol_ref ("SYM"))))
27165 Note that r0 need not necessarily be the same as r1, especially
27166 during pre-regalloc scheduling. */
27168 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27169 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27171 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
27172 && REG_P (XEXP (SET_SRC (curr_set), 0))
27173 && REGNO (XEXP (SET_SRC (curr_set), 0))
27174 == REGNO (SET_DEST (prev_set))
27175 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
27176 XEXP (SET_SRC (curr_set), 1)))
27177 return true;
27181 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
27184 /* We're trying to match:
27185 prev (movk) == (set (zero_extract (reg r0)
27186 (const_int 16)
27187 (const_int 32))
27188 (const_int imm16_1))
27189 curr (movk) == (set (zero_extract (reg r0)
27190 (const_int 16)
27191 (const_int 48))
27192 (const_int imm16_2)) */
27194 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
27195 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
27196 && REG_P (XEXP (SET_DEST (prev_set), 0))
27197 && REG_P (XEXP (SET_DEST (curr_set), 0))
27198 && REGNO (XEXP (SET_DEST (prev_set), 0))
27199 == REGNO (XEXP (SET_DEST (curr_set), 0))
27200 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
27201 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
27202 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
27203 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
27204 && CONST_INT_P (SET_SRC (prev_set))
27205 && CONST_INT_P (SET_SRC (curr_set)))
27206 return true;
27209 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
27211 /* We're trying to match:
27212 prev (adrp) == (set (reg r0)
27213 (high (symbol_ref ("SYM"))))
27214 curr (ldr) == (set (reg r1)
27215 (mem (lo_sum (reg r0)
27216 (symbol_ref ("SYM")))))
27218 curr (ldr) == (set (reg r1)
27219 (zero_extend (mem
27220 (lo_sum (reg r0)
27221 (symbol_ref ("SYM")))))) */
27222 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27223 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27225 rtx curr_src = SET_SRC (curr_set);
27227 if (GET_CODE (curr_src) == ZERO_EXTEND)
27228 curr_src = XEXP (curr_src, 0);
27230 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
27231 && REG_P (XEXP (XEXP (curr_src, 0), 0))
27232 && REGNO (XEXP (XEXP (curr_src, 0), 0))
27233 == REGNO (SET_DEST (prev_set))
27234 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
27235 XEXP (SET_SRC (prev_set), 0)))
27236 return true;
27240 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27241 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
27242 && prev_set && curr_set && any_condjump_p (curr)
27243 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27244 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27245 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27246 return true;
27248 /* Fuse flag-setting ALU instructions and conditional branch. */
27249 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
27250 && any_condjump_p (curr))
27252 unsigned int condreg1, condreg2;
27253 rtx cc_reg_1;
27254 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
27255 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
27257 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
27258 && prev
27259 && modified_in_p (cc_reg_1, prev))
27261 enum attr_type prev_type = get_attr_type (prev);
27263 	  /* FIXME: this misses some instructions that are considered simple
27264 	     arithmetic for ThunderX.  Simple shifts are missed here.  */
27265 if (prev_type == TYPE_ALUS_SREG
27266 || prev_type == TYPE_ALUS_IMM
27267 || prev_type == TYPE_LOGICS_REG
27268 || prev_type == TYPE_LOGICS_IMM)
27269 return true;
27273 /* Fuse ALU instructions and CBZ/CBNZ. */
27274 if (prev_set
27275 && curr_set
27276 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
27277 && any_condjump_p (curr))
27279 /* We're trying to match:
27280 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27281 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27282 (const_int 0))
27283 (label_ref ("SYM"))
27284 (pc)) */
27285 if (SET_DEST (curr_set) == (pc_rtx)
27286 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27287 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
27288 && REG_P (SET_DEST (prev_set))
27289 && REGNO (SET_DEST (prev_set))
27290 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
27292 /* Fuse ALU operations followed by conditional branch instruction. */
27293 switch (get_attr_type (prev))
27295 case TYPE_ALU_IMM:
27296 case TYPE_ALU_SREG:
27297 case TYPE_ADC_REG:
27298 case TYPE_ADC_IMM:
27299 case TYPE_ADCS_REG:
27300 case TYPE_ADCS_IMM:
27301 case TYPE_LOGIC_REG:
27302 case TYPE_LOGIC_IMM:
27303 case TYPE_CSEL:
27304 case TYPE_ADR:
27305 case TYPE_MOV_IMM:
27306 case TYPE_SHIFT_REG:
27307 case TYPE_SHIFT_IMM:
27308 case TYPE_BFM:
27309 case TYPE_RBIT:
27310 case TYPE_REV:
27311 case TYPE_EXTEND:
27312 return true;
27314 default:;
27319 /* Fuse A+B+1 and A-B-1 */
27320 if (simple_sets_p
27321 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
27323 /* We're trying to match:
27324 prev == (set (r0) (plus (r0) (r1)))
27325 curr == (set (r0) (plus (r0) (const_int 1)))
27327 prev == (set (r0) (minus (r0) (r1)))
27328 curr == (set (r0) (plus (r0) (const_int -1))) */
27330 rtx prev_src = SET_SRC (prev_set);
27331 rtx curr_src = SET_SRC (curr_set);
27333 int polarity = 1;
27334 if (GET_CODE (prev_src) == MINUS)
27335 polarity = -1;
27337 if (GET_CODE (curr_src) == PLUS
27338 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
27339 && CONST_INT_P (XEXP (curr_src, 1))
27340 && INTVAL (XEXP (curr_src, 1)) == polarity
27341 && REG_P (XEXP (curr_src, 0))
27342 && REG_P (SET_DEST (prev_set))
27343 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
27344 return true;
27347 return false;
27350 /* Return true iff the instruction fusion described by OP is enabled. */
27352 bool
27353 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
27355 return (aarch64_tune_params.fusible_ops & op) != 0;
27358 /* If MEM is in the form of [base+offset], extract the two parts
27359    of the address and store them in BASE and OFFSET; otherwise return false
27360    after clearing BASE and OFFSET.  */
27362 bool
27363 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
27365 rtx addr;
27367 gcc_assert (MEM_P (mem));
27369 addr = XEXP (mem, 0);
27371 if (REG_P (addr))
27373 *base = addr;
27374 *offset = const0_rtx;
27375 return true;
27378 if (GET_CODE (addr) == PLUS
27379 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
27381 *base = XEXP (addr, 0);
27382 *offset = XEXP (addr, 1);
27383 return true;
27386 *base = NULL_RTX;
27387 *offset = NULL_RTX;
27389 return false;
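/* For example, a MEM whose address is (plus (reg x1) (const_int 16)) yields
   BASE == (reg x1) and OFFSET == (const_int 16), and a bare (reg x1) yields
   OFFSET == (const_int 0); other forms such as reg+reg addressing make the
   function return false.  (Illustrative register names only.)  */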
27392 /* Types for scheduling fusion. */
27393 enum sched_fusion_type
27395 SCHED_FUSION_NONE = 0,
27396 SCHED_FUSION_LD_SIGN_EXTEND,
27397 SCHED_FUSION_LD_ZERO_EXTEND,
27398 SCHED_FUSION_LD,
27399 SCHED_FUSION_ST,
27400 SCHED_FUSION_NUM
27403 /* If INSN is a load or store whose address is in the form [base+offset],
27404    extract the two parts and store them in BASE and OFFSET.  Return the
27405    scheduling fusion type of INSN.  */
27407 static enum sched_fusion_type
27408 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
27410 rtx x, dest, src;
27411 enum sched_fusion_type fusion = SCHED_FUSION_LD;
27413 gcc_assert (INSN_P (insn));
27414 x = PATTERN (insn);
27415 if (GET_CODE (x) != SET)
27416 return SCHED_FUSION_NONE;
27418 src = SET_SRC (x);
27419 dest = SET_DEST (x);
27421 machine_mode dest_mode = GET_MODE (dest);
27423 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
27424 return SCHED_FUSION_NONE;
27426 if (GET_CODE (src) == SIGN_EXTEND)
27428 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
27429 src = XEXP (src, 0);
27430 if (!MEM_P (src) || GET_MODE (src) != SImode)
27431 return SCHED_FUSION_NONE;
27433 else if (GET_CODE (src) == ZERO_EXTEND)
27435 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
27436 src = XEXP (src, 0);
27437 if (!MEM_P (src) || GET_MODE (src) != SImode)
27438 return SCHED_FUSION_NONE;
27441 if (MEM_P (src) && REG_P (dest))
27442 extract_base_offset_in_addr (src, base, offset);
27443 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
27445 fusion = SCHED_FUSION_ST;
27446 extract_base_offset_in_addr (dest, base, offset);
27448 else
27449 return SCHED_FUSION_NONE;
27451 if (*base == NULL_RTX || *offset == NULL_RTX)
27452 fusion = SCHED_FUSION_NONE;
27454 return fusion;
27457 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
27459    Currently we only support fusing ldr and str instructions, so FUSION_PRI
27460    and PRI are only calculated for these instructions.  For other instructions,
27461    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
27462    types of instruction fusion can be added by returning different priorities.
27464 It's important that irrelevant instructions get the largest FUSION_PRI. */
27466 static void
27467 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
27468 int *fusion_pri, int *pri)
27470 int tmp, off_val;
27471 rtx base, offset;
27472 enum sched_fusion_type fusion;
27474 gcc_assert (INSN_P (insn));
27476 tmp = max_pri - 1;
27477 fusion = fusion_load_store (insn, &base, &offset);
27478 if (fusion == SCHED_FUSION_NONE)
27480 *pri = tmp;
27481 *fusion_pri = tmp;
27482 return;
27485 /* Set FUSION_PRI according to fusion type and base register. */
27486 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
27488 /* Calculate PRI. */
27489 tmp /= 2;
27491 /* INSN with smaller offset goes first. */
27492 off_val = (int)(INTVAL (offset));
27493 if (off_val >= 0)
27494 tmp -= (off_val & 0xfffff);
27495 else
27496 tmp += ((- off_val) & 0xfffff);
27498 *pri = tmp;
27499 return;
27502 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27503 Adjust priority of sha1h instructions so they are scheduled before
27504 other SHA1 instructions. */
27506 static int
27507 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
27509 rtx x = PATTERN (insn);
27511 if (GET_CODE (x) == SET)
27513 x = SET_SRC (x);
27515 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
27516 return priority + 10;
27519 return priority;
27522 /* If REVERSED is null, return true if memory reference *MEM2 comes
27523 immediately after memory reference *MEM1. Do not change the references
27524 in this case.
27526 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27527 if they are, try to make them use constant offsets from the same base
27528 register. Return true on success. When returning true, set *REVERSED
27529 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27530 static bool
27531 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
27533 if (reversed)
27534 *reversed = false;
27536 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
27537 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
27538 return false;
27540 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
27541 return false;
27543 auto size1 = MEM_SIZE (*mem1);
27544 auto size2 = MEM_SIZE (*mem2);
27546 rtx base1, base2, offset1, offset2;
27547 extract_base_offset_in_addr (*mem1, &base1, &offset1);
27548 extract_base_offset_in_addr (*mem2, &base2, &offset2);
27550 /* Make sure at least one memory is in base+offset form. */
27551 if (!(base1 && offset1) && !(base2 && offset2))
27552 return false;
27554 /* If both mems already use the same base register, just check the
27555 offsets. */
27556 if (base1 && base2 && rtx_equal_p (base1, base2))
27558 if (!offset1 || !offset2)
27559 return false;
27561 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
27562 return true;
27564 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
27566 *reversed = true;
27567 return true;
27570 return false;
27573 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27574 guarantee that the values are consecutive. */
27575 if (MEM_EXPR (*mem1)
27576 && MEM_EXPR (*mem2)
27577 && MEM_OFFSET_KNOWN_P (*mem1)
27578 && MEM_OFFSET_KNOWN_P (*mem2))
27580 poly_int64 expr_offset1;
27581 poly_int64 expr_offset2;
27582 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
27583 &expr_offset1);
27584 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
27585 &expr_offset2);
27586 if (!expr_base1
27587 || !expr_base2
27588 || !DECL_P (expr_base1)
27589 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
27590 return false;
27592 expr_offset1 += MEM_OFFSET (*mem1);
27593 expr_offset2 += MEM_OFFSET (*mem2);
27595 if (known_eq (expr_offset1 + size1, expr_offset2))
27597 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
27598 *reversed = true;
27599 else
27600 return false;
27602 if (reversed)
27604 if (base2)
27606 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
27607 expr_offset1 - expr_offset2);
27608 *mem1 = replace_equiv_address_nv (*mem1, addr1);
27610 else
27612 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
27613 expr_offset2 - expr_offset1);
27614 *mem2 = replace_equiv_address_nv (*mem2, addr2);
27617 return true;
27620 return false;
27623 /* Test if MODE is suitable for a single transfer register in an ldp or stp
27624 instruction. */
27626 bool
27627 aarch64_ldpstp_operand_mode_p (machine_mode mode)
27629 if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
27630 || hard_regno_nregs (V0_REGNUM, mode) > 1)
27631 return false;
27633 const auto size = GET_MODE_SIZE (mode);
27634 return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
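/* For example SImode, DImode, DFmode and V4SImode satisfy the check above
   (4, 8 or 16 bytes held in a single register), whereas OImode needs two
   registers and variable-length SVE modes have no fixed 4/8/16-byte size, so
   both are rejected.  (A sketch of the intent, not an exhaustive list.)  */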
27637 /* Return true if MEM1 and MEM2 can be combined into a single access
27638 of mode MODE, with the combined access having the same address as MEM1. */
27640 bool
27641 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
27643 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
27644 return false;
27645 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
27648 /* Return true if MEM agrees with the ldp-stp policy model.
27649 Otherwise, false. */
27651 bool
27652 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
27654 auto policy = (load
27655 ? aarch64_tune_params.ldp_policy_model
27656 : aarch64_tune_params.stp_policy_model);
27658 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
27659 if (policy == AARCH64_LDP_STP_POLICY_NEVER)
27660 return false;
27662 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
27663 do not emit the load pair unless the alignment is checked to be
27664 at least double the alignment of the type. */
27665 if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
27666 && !optimize_function_for_size_p (cfun)
27667 && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
27668 return false;
27670 return true;
27673 /* Given OPERANDS of consecutive load/store, check if we can merge
27674 them into ldp/stp. LOAD is true if they are load instructions.
27675 MODE is the mode of memory operands. */
27677 bool
27678 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
27679 machine_mode mode)
27681 enum reg_class rclass_1, rclass_2;
27682 rtx mem_1, mem_2, reg_1, reg_2;
27684 if (load)
27686 mem_1 = operands[1];
27687 mem_2 = operands[3];
27688 reg_1 = operands[0];
27689 reg_2 = operands[2];
27690 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
27691 if (REGNO (reg_1) == REGNO (reg_2))
27692 return false;
27693 if (reg_overlap_mentioned_p (reg_1, mem_2))
27694 return false;
27696 else
27698 mem_1 = operands[0];
27699 mem_2 = operands[2];
27700 reg_1 = operands[1];
27701 reg_2 = operands[3];
27704 /* The mems cannot be volatile. */
27705 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
27706 return false;
27708 /* Check if mem_1 is ok with the ldp-stp policy model. */
27709 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem_1, load, mode))
27710 return false;
27712 /* Check if the addresses are in the form of [base+offset]. */
27713 bool reversed = false;
27714 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
27715 return false;
27717 /* The operands must be of the same size. */
27718 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
27719 GET_MODE_SIZE (GET_MODE (mem_2))));
27721 /* The lower memory access must be a mem-pair operand. */
27722 rtx lower_mem = reversed ? mem_2 : mem_1;
27723 if (!aarch64_mem_pair_operand (lower_mem, GET_MODE (lower_mem)))
27724 return false;
27726 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
27727 rclass_1 = FP_REGS;
27728 else
27729 rclass_1 = GENERAL_REGS;
27731 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
27732 rclass_2 = FP_REGS;
27733 else
27734 rclass_2 = GENERAL_REGS;
27736 /* Check if the registers are of same class. */
27737 if (rclass_1 != rclass_2)
27738 return false;
27740 return true;
27743 /* Given OPERANDS of consecutive load/store that can be merged,
27744 swap them if they are not in ascending order. */
27745 void
27746 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
27748 int mem_op = load ? 1 : 0;
27749 bool reversed = false;
27750 if (!aarch64_check_consecutive_mems (operands + mem_op,
27751 operands + mem_op + 2, &reversed))
27752 gcc_unreachable ();
27754 if (reversed)
27756 /* Irrespective of whether this is a load or a store,
27757 we do the same swap. */
27758 std::swap (operands[0], operands[2]);
27759 std::swap (operands[1], operands[3]);
27763 /* Helper function used for generation of load/store pair instructions, called
27764 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
27765 operands as matched by the peepholes in that file. LOAD_P is true if we're
27766 generating a load pair, otherwise we're generating a store pair. CODE is
27767 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
27768 standard load/store pair. */
27770 void
27771 aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
27773 aarch64_swap_ldrstr_operands (operands, load_p);
27775 if (load_p)
27776 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
27777 operands[1], code));
27778 else
27780 gcc_assert (code == UNKNOWN);
27781 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
27782 operands[3]));
27786 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
27787 comparison between the two. */
27789 aarch64_host_wide_int_compare (const void *x, const void *y)
27791 return wi::cmps (* ((const HOST_WIDE_INT *) x),
27792 * ((const HOST_WIDE_INT *) y));
27795 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
27796 other pointing to a REG rtx containing an offset, compare the offsets
27797 of the two pairs.
27799 Return:
27801 1 iff offset (X) > offset (Y)
27802 0 iff offset (X) == offset (Y)
27803 -1 iff offset (X) < offset (Y) */
27805 aarch64_ldrstr_offset_compare (const void *x, const void *y)
27807 const rtx * operands_1 = (const rtx *) x;
27808 const rtx * operands_2 = (const rtx *) y;
27809 rtx mem_1, mem_2, base, offset_1, offset_2;
27811 if (MEM_P (operands_1[0]))
27812 mem_1 = operands_1[0];
27813 else
27814 mem_1 = operands_1[1];
27816 if (MEM_P (operands_2[0]))
27817 mem_2 = operands_2[0];
27818 else
27819 mem_2 = operands_2[1];
27821 /* Extract the offsets. */
27822 extract_base_offset_in_addr (mem_1, &base, &offset_1);
27823 extract_base_offset_in_addr (mem_2, &base, &offset_2);
27825 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
27827 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
27830 /* Given OPERANDS of consecutive load/store, check if we can merge
27831 them into ldp/stp by adjusting the offset. LOAD is true if they
27832 are load instructions. MODE is the mode of memory operands.
27834 Given below consecutive stores:
27836 str w1, [xb, 0x100]
27837 str w1, [xb, 0x104]
27838 str w1, [xb, 0x108]
27839 str w1, [xb, 0x10c]
27841 Though the offsets are out of the range supported by stp, we can
27842 still pair them after adjusting the offset, like:
27844 add scratch, xb, 0x100
27845 stp w1, w1, [scratch]
27846 stp w1, w1, [scratch, 0x8]
27848 The peephole patterns detecting this opportunity should guarantee
27849    the scratch register is available.  */
27851 bool
27852 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
27853 machine_mode mode)
27855 const int num_insns = 4;
27856 enum reg_class rclass;
27857 HOST_WIDE_INT offvals[num_insns], msize;
27858 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
27860 if (load)
27862 for (int i = 0; i < num_insns; i++)
27864 reg[i] = operands[2 * i];
27865 mem[i] = operands[2 * i + 1];
27867 gcc_assert (REG_P (reg[i]));
27870 /* Do not attempt to merge the loads if the loads clobber each other. */
27871 for (int i = 0; i < 8; i += 2)
27872 for (int j = i + 2; j < 8; j += 2)
27873 if (reg_overlap_mentioned_p (operands[i], operands[j]))
27874 return false;
27876 else
27877 for (int i = 0; i < num_insns; i++)
27879 mem[i] = operands[2 * i];
27880 reg[i] = operands[2 * i + 1];
27883 /* Skip if memory operand is by itself valid for ldp/stp. */
27884 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
27885 return false;
27887 for (int i = 0; i < num_insns; i++)
27889 /* The mems cannot be volatile. */
27890 if (MEM_VOLATILE_P (mem[i]))
27891 return false;
27893 /* Check if the addresses are in the form of [base+offset]. */
27894 extract_base_offset_in_addr (mem[i], base + i, offset + i);
27895 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
27896 return false;
27899 /* Check if the registers are of same class. */
27900 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
27901 ? FP_REGS : GENERAL_REGS;
27903 for (int i = 1; i < num_insns; i++)
27904 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
27906 if (rclass != FP_REGS)
27907 return false;
27909 else
27911 if (rclass != GENERAL_REGS)
27912 return false;
27915 /* Only the last register in the order in which they occur
27916 may be clobbered by the load. */
27917 if (rclass == GENERAL_REGS && load)
27918 for (int i = 0; i < num_insns - 1; i++)
27919 if (reg_mentioned_p (reg[i], mem[i]))
27920 return false;
27922 /* Check if the bases are same. */
27923 for (int i = 0; i < num_insns - 1; i++)
27924 if (!rtx_equal_p (base[i], base[i + 1]))
27925 return false;
27927 for (int i = 0; i < num_insns; i++)
27928 offvals[i] = INTVAL (offset[i]);
27930 msize = GET_MODE_SIZE (mode).to_constant ();
27932 /* Check if the offsets can be put in the right order to do a ldp/stp. */
27933 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
27934 aarch64_host_wide_int_compare);
27936 if (!(offvals[1] == offvals[0] + msize
27937 && offvals[3] == offvals[2] + msize))
27938 return false;
27940 /* Check that offsets are within range of each other. The ldp/stp
27941 instructions have 7 bit immediate offsets, so use 0x80. */
27942 if (offvals[2] - offvals[0] >= msize * 0x80)
27943 return false;
27945 /* The offsets must be aligned with respect to each other. */
27946 if (offvals[0] % msize != offvals[2] % msize)
27947 return false;
27949 /* Check if mem[0] is ok with the ldp-stp policy model. */
27950 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem[0], load, mode))
27951 return false;
27953 return true;
27956 /* Given OPERANDS of consecutive load/store, this function pairs them
27957 into LDP/STP after adjusting the offset. It depends on the fact
27958 that the operands can be sorted so the offsets are correct for STP.
27959 MODE is the mode of memory operands. CODE is the rtl operator
27960 which should be applied to all memory operands, it's SIGN_EXTEND,
27961 ZERO_EXTEND or UNKNOWN. */
27963 bool
27964 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
27965 machine_mode mode, RTX_CODE code)
27967 rtx base, offset_1, offset_2;
27968 rtx mem_1, mem_2;
27969 rtx temp_operands[8];
27970 HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
27971 stp_off_upper_limit, stp_off_lower_limit, msize;
27973 /* We make changes on a copy as we may still bail out. */
27974 for (int i = 0; i < 8; i ++)
27975 temp_operands[i] = operands[i];
27977 /* Sort the operands. Note for cases as below:
27978 [base + 0x310] = A
27979 [base + 0x320] = B
27980 [base + 0x330] = C
27981 [base + 0x320] = D
27982      We need stable sorting, otherwise wrong data may be stored to offset 0x320.
27983      Also note the dead store in the above case should be optimized away, but no
27984 guarantees here. */
27985 gcc_stablesort(temp_operands, 4, 2 * sizeof (rtx *),
27986 aarch64_ldrstr_offset_compare);
27988 /* Copy the memory operands so that if we have to bail for some
27989 reason the original addresses are unchanged. */
27990 if (load)
27992 mem_1 = copy_rtx (temp_operands[1]);
27993 mem_2 = copy_rtx (temp_operands[5]);
27995 else
27997 mem_1 = copy_rtx (temp_operands[0]);
27998 mem_2 = copy_rtx (temp_operands[4]);
27999 gcc_assert (code == UNKNOWN);
28002 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28003 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28004 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
28005 && offset_2 != NULL_RTX);
28007 /* Adjust offset so it can fit in LDP/STP instruction. */
28008 msize = GET_MODE_SIZE (mode).to_constant();
28009 stp_off_upper_limit = msize * (0x40 - 1);
28010 stp_off_lower_limit = - msize * 0x40;
28012 off_val_1 = INTVAL (offset_1);
28013 off_val_2 = INTVAL (offset_2);
28015 /* The base offset is optimally half way between the two STP/LDP offsets. */
28016 if (msize <= 4)
28017 base_off = (off_val_1 + off_val_2) / 2;
28018 else
28019     /* However, due to issues with negative LDP/STP offset generation for
28020        larger modes (DF, DD, DI and vector modes), we must not use negative
28021        addresses beyond what 9 signed unadjusted bits can store.  This
28022        provides the most range in this case.  */
28023 base_off = off_val_1;
28025 /* Adjust the base so that it is aligned with the addresses but still
28026 optimal. */
28027 if (base_off % msize != off_val_1 % msize)
28028 /* Fix the offset, bearing in mind we want to make it bigger not
28029 smaller. */
28030 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28031 else if (msize <= 4)
28032 /* The negative range of LDP/STP is one larger than the positive range. */
28033 base_off += msize;
28035 /* Check if base offset is too big or too small. We can attempt to resolve
28036 this issue by setting it to the maximum value and seeing if the offsets
28037 still fit. */
28038 if (base_off >= 0x1000)
28040 base_off = 0x1000 - 1;
28041 /* We must still make sure that the base offset is aligned with respect
28042 to the address. But it may not be made any bigger. */
28043 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28046 /* Likewise for the case where the base is too small. */
28047 if (base_off <= -0x1000)
28049 base_off = -0x1000 + 1;
28050 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28053 /* Offset of the first STP/LDP. */
28054 new_off_1 = off_val_1 - base_off;
28056 /* Offset of the second STP/LDP. */
28057 new_off_2 = off_val_2 - base_off;
28059 /* The offsets must be within the range of the LDP/STP instructions. */
28060 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
28061 || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
28062 return false;
28064 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
28065 new_off_1), true);
28066 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
28067 new_off_2), true);
28069 if (!aarch64_mem_pair_operand (mem_1, mode)
28070 || !aarch64_mem_pair_operand (mem_2, mode))
28071 return false;
28073 if (load)
28075 operands[0] = temp_operands[0];
28076 operands[1] = mem_1;
28077 operands[2] = temp_operands[2];
28078 operands[4] = temp_operands[4];
28079 operands[5] = mem_2;
28080 operands[6] = temp_operands[6];
28082 else
28084 operands[0] = mem_1;
28085 operands[1] = temp_operands[1];
28086 operands[3] = temp_operands[3];
28087 operands[4] = mem_2;
28088 operands[5] = temp_operands[5];
28089 operands[7] = temp_operands[7];
28092 /* Emit adjusting instruction. */
28093 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
28094 /* Emit ldp/stp instructions. */
28095 if (load)
28097 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28098 operands[1], code));
28099 emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
28100 operands[5], code));
28102 else
28104 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28105 operands[3]));
28106 emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
28107 operands[7]));
28109 return true;
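/* Rough trace of the offset arithmetic above for the four-str example in the
   comment before aarch64_operands_adjust_ok_for_ldpstp (SImode, msize == 4,
   offsets 0x100..0x10c): base_off starts at (0x100 + 0x108) / 2 == 0x104, is
   already aligned and so is bumped by msize to 0x108, giving new offsets -8
   and 0 and a sequence along the lines of
     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]
   This illustrates the calculation only; it is not a guaranteed output.  */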
28112 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28113 it isn't worth branching around empty masked ops (including masked
28114 stores). */
28116 static bool
28117 aarch64_empty_mask_is_expensive (unsigned)
28119 return false;
28122 /* Return 1 if pseudo register should be created and used to hold
28123 GOT address for PIC code. */
28125 bool
28126 aarch64_use_pseudo_pic_reg (void)
28128 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
28131 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28133 static int
28134 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
28136 switch (XINT (x, 1))
28138 case UNSPEC_GOTSMALLPIC:
28139 case UNSPEC_GOTSMALLPIC28K:
28140 case UNSPEC_GOTTINYPIC:
28141 return 0;
28142 default:
28143 break;
28146 return default_unspec_may_trap_p (x, flags);
28150 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28151 return the log2 of that value. Otherwise return -1. */
28154 aarch64_fpconst_pow_of_2 (rtx x)
28156 const REAL_VALUE_TYPE *r;
28158 if (!CONST_DOUBLE_P (x))
28159 return -1;
28161 r = CONST_DOUBLE_REAL_VALUE (x);
28163 if (REAL_VALUE_NEGATIVE (*r)
28164 || REAL_VALUE_ISNAN (*r)
28165 || REAL_VALUE_ISINF (*r)
28166 || !real_isinteger (r, DFmode))
28167 return -1;
28169 return exact_log2 (real_to_integer (r));
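/* For example, 8.0 gives 3 and 1.0 gives 0, while 0.5, 3.0 and -4.0 all
   give -1.  */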
28172 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28173    power of 2 (i.e. 1/2^n), return the exponent n; e.g. for x == (1/2^n)
28174    return n.  Otherwise return -1.  */
28177 aarch64_fpconst_pow2_recip (rtx x)
28179 REAL_VALUE_TYPE r0;
28181 if (!CONST_DOUBLE_P (x))
28182 return -1;
28184 r0 = *CONST_DOUBLE_REAL_VALUE (x);
28185 if (exact_real_inverse (DFmode, &r0)
28186 && !REAL_VALUE_NEGATIVE (r0))
28188 int ret = exact_log2 (real_to_integer (&r0));
28189 if (ret >= 1 && ret <= 32)
28190 return ret;
28192 return -1;
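/* For example, 0.25 gives 2 and 0.125 gives 3, while 1.0 (whose reciprocal
   has log2 equal to 0, outside the accepted 1..32 range) and 3.0 give -1.  */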
28195 /* If X is a vector of equal CONST_DOUBLE values and that value is
28196 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28199 aarch64_vec_fpconst_pow_of_2 (rtx x)
28201 int nelts;
28202 if (!CONST_VECTOR_P (x)
28203 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
28204 return -1;
28206 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
28207 return -1;
28209 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
28210 if (firstval <= 0)
28211 return -1;
28213 for (int i = 1; i < nelts; i++)
28214 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
28215 return -1;
28217 return firstval;
28220 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28221 to float.
28223 __fp16 always promotes through this hook.
28224 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28225 through the generic excess precision logic rather than here. */
28227 static tree
28228 aarch64_promoted_type (const_tree t)
28230 if (SCALAR_FLOAT_TYPE_P (t)
28231 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
28232 return float_type_node;
28234 return NULL_TREE;
28237 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28239 static bool
28240 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
28241 optimization_type opt_type)
28243 switch (op)
28245 case rsqrt_optab:
28246 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
28248 default:
28249 return true;
28253 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28255 static unsigned int
28256 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
28257 int *offset)
28259 /* Polynomial invariant 1 == (VG / 2) - 1. */
28260 gcc_assert (i == 1);
28261 *factor = 2;
28262 *offset = 1;
28263 return AARCH64_DWARF_VG;
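/* For example, with 256-bit SVE the VG pseudo register holds 4 (the number
   of 64-bit granules per vector), so the indeterminate evaluates to
   4 / 2 - 1 == 1; with 128-bit SVE it evaluates to 0.  */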
28266 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28267 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28269 static bool
28270 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
28272 return ((mode == HFmode || mode == BFmode)
28273 ? true
28274 : default_libgcc_floating_mode_supported_p (mode));
28277 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28278 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28280 static bool
28281 aarch64_scalar_mode_supported_p (scalar_mode mode)
28283 if (DECIMAL_FLOAT_MODE_P (mode))
28284 return default_decimal_float_supported_p ();
28286 return ((mode == HFmode || mode == BFmode)
28287 ? true
28288 : default_scalar_mode_supported_p (mode));
28291 /* Set the value of FLT_EVAL_METHOD.
28292 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28294 0: evaluate all operations and constants, whose semantic type has at
28295 most the range and precision of type float, to the range and
28296 precision of float; evaluate all other operations and constants to
28297 the range and precision of the semantic type;
28299 N, where _FloatN is a supported interchange floating type
28300 evaluate all operations and constants, whose semantic type has at
28301 most the range and precision of _FloatN type, to the range and
28302 precision of the _FloatN type; evaluate all other operations and
28303 constants to the range and precision of the semantic type;
28305 If we have the ARMv8.2-A extensions then we support _Float16 in native
28306 precision, so we should set this to 16. Otherwise, we support the type,
28307 but want to evaluate expressions in float precision, so set this to
28308 0. */
28310 static enum flt_eval_method
28311 aarch64_excess_precision (enum excess_precision_type type)
28313 switch (type)
28315 case EXCESS_PRECISION_TYPE_FAST:
28316 case EXCESS_PRECISION_TYPE_STANDARD:
28317 /* We can calculate either in 16-bit range and precision or
28318 32-bit range and precision. Make that decision based on whether
28319 we have native support for the ARMv8.2-A 16-bit floating-point
28320 instructions or not. */
28321 return (TARGET_FP_F16INST
28322 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28323 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
28324 case EXCESS_PRECISION_TYPE_IMPLICIT:
28325 case EXCESS_PRECISION_TYPE_FLOAT16:
28326 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
28327 default:
28328 gcc_unreachable ();
28330 return FLT_EVAL_METHOD_UNPREDICTABLE;
28333 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28334 scheduled for speculative execution. Reject the long-running division
28335 and square-root instructions. */
28337 static bool
28338 aarch64_sched_can_speculate_insn (rtx_insn *insn)
28340 switch (get_attr_type (insn))
28342 case TYPE_SDIV:
28343 case TYPE_UDIV:
28344 case TYPE_FDIVS:
28345 case TYPE_FDIVD:
28346 case TYPE_FSQRTS:
28347 case TYPE_FSQRTD:
28348 case TYPE_NEON_FP_SQRT_S:
28349 case TYPE_NEON_FP_SQRT_D:
28350 case TYPE_NEON_FP_SQRT_S_Q:
28351 case TYPE_NEON_FP_SQRT_D_Q:
28352 case TYPE_NEON_FP_DIV_S:
28353 case TYPE_NEON_FP_DIV_D:
28354 case TYPE_NEON_FP_DIV_S_Q:
28355 case TYPE_NEON_FP_DIV_D_Q:
28356 return false;
28357 default:
28358 return true;
28362 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28364 static int
28365 aarch64_compute_pressure_classes (reg_class *classes)
28367 int i = 0;
28368 classes[i++] = GENERAL_REGS;
28369 classes[i++] = FP_REGS;
28370 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28371 registers need to go in PR_LO_REGS at some point during their
28372 lifetime. Splitting it into two halves has the effect of making
28373 all predicates count against PR_LO_REGS, so that we try whenever
28374 possible to restrict the number of live predicates to 8. This
28375 greatly reduces the amount of spilling in certain loops. */
28376 classes[i++] = PR_LO_REGS;
28377 classes[i++] = PR_HI_REGS;
28378 return i;
28381 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28383 static bool
28384 aarch64_can_change_mode_class (machine_mode from,
28385 machine_mode to, reg_class_t)
28387 return aarch64_modes_compatible_p (from, to);
28390 /* Implement TARGET_EARLY_REMAT_MODES. */
28392 static void
28393 aarch64_select_early_remat_modes (sbitmap modes)
28395 /* SVE values are not normally live across a call, so it should be
28396 worth doing early rematerialization even in VL-specific mode. */
28397 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
28398 if (aarch64_sve_mode_p ((machine_mode) i))
28399 bitmap_set_bit (modes, i);
28402 /* Override the default target speculation_safe_value. */
28403 static rtx
28404 aarch64_speculation_safe_value (machine_mode mode,
28405 rtx result, rtx val, rtx failval)
28407 /* Maybe we should warn if falling back to hard barriers. They are
28408      likely to be noticeably more expensive than the alternative below.  */
28409 if (!aarch64_track_speculation)
28410 return default_speculation_safe_value (mode, result, val, failval);
28412 if (!REG_P (val))
28413 val = copy_to_mode_reg (mode, val);
28415 if (!aarch64_reg_or_zero (failval, mode))
28416 failval = copy_to_mode_reg (mode, failval);
28418 emit_insn (gen_despeculate_copy (mode, result, val, failval));
28419 return result;
28422 /* Implement TARGET_ESTIMATED_POLY_VALUE.
28423 Look into the tuning structure for an estimate.
28424 KIND specifies the type of requested estimate: min, max or likely.
28425 For cores with a known SVE width all three estimates are the same.
28426 For generic SVE tuning we want to distinguish the maximum estimate from
28427 the minimum and likely ones.
28428 The likely estimate is the same as the minimum in that case to give a
28429 conservative behavior of auto-vectorizing with SVE when it is a win
28430 even for 128-bit SVE.
28431 When SVE width information is available VAL.coeffs[1] is multiplied by
28432 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
28434 static HOST_WIDE_INT
28435 aarch64_estimated_poly_value (poly_int64 val,
28436 poly_value_estimate_kind kind
28437 = POLY_VALUE_LIKELY)
28439 unsigned int width_source = aarch64_tune_params.sve_width;
28441 /* If there is no core-specific information then the minimum and likely
28442 values are based on 128-bit vectors and the maximum is based on
28443 the architectural maximum of 2048 bits. */
28444 if (width_source == SVE_SCALABLE)
28445 switch (kind)
28447 case POLY_VALUE_MIN:
28448 case POLY_VALUE_LIKELY:
28449 return val.coeffs[0];
28450 case POLY_VALUE_MAX:
28451 return val.coeffs[0] + val.coeffs[1] * 15;
28454 /* Allow sve_width to be a bitmask of different VL, treating the lowest
28455 as likely. This could be made more general if future -mtune options
28456 need it to be. */
28457 if (kind == POLY_VALUE_MAX)
28458 width_source = 1 << floor_log2 (width_source);
28459 else
28460 width_source = least_bit_hwi (width_source);
28462 /* If the core provides width information, use that. */
28463 HOST_WIDE_INT over_128 = width_source - 128;
28464 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
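/* Worked example: for a poly_int of 16 + 16x (e.g. the byte size of
   VNx16QImode), generic SVE tuning gives a minimum/likely estimate of 16 and
   a maximum of 16 + 16 * 15 == 256, while a core with sve_width == 256 gives
   16 + 16 * 128 / 128 == 32.  */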
28468 /* Return true for types that could be supported as SIMD return or
28469 argument types. */
28471 static bool
28472 supported_simd_type (tree t)
28474 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
28476 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
28477 return s == 1 || s == 2 || s == 4 || s == 8;
28479 return false;
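/* For example int, double and pointer types qualify, whereas __int128,
   long double (16 bytes on AArch64) and aggregate types do not.  */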
28482 /* Determine the lane size for the clone argument/return type. This follows
28483 the LS(P) rule in the VFABIA64. */
28485 static unsigned
28486 lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
28488 gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
28490   /* For non-map-to-vector types that are pointers, we use the type they
28491      point to.  */
28492 if (POINTER_TYPE_P (type))
28493 switch (clone_arg_type)
28495 default:
28496 break;
28497 case SIMD_CLONE_ARG_TYPE_UNIFORM:
28498 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
28499 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
28500 type = TREE_TYPE (type);
28501 break;
28504   /* For types that are integers or floating point (or, for non-map-to-vector
28505      pointer arguments, the types they point to), we use their size if it is 1, 2, 4 or 8 bytes.  */
28507 if (INTEGRAL_TYPE_P (type)
28508 || SCALAR_FLOAT_TYPE_P (type))
28509 switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
28511 default:
28512 break;
28513 case 1:
28514 case 2:
28515 case 4:
28516 case 8:
28517 return TYPE_PRECISION (type);
28519 /* For any other we use the size of uintptr_t. For map-to-vector types that
28520 are pointers, using the size of uintptr_t is the same as using the size of
28521 their type, seeing all pointers are the same size as uintptr_t. */
28522 return POINTER_SIZE;
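/* For example, a uniform or linear 'int *' argument uses the lane size of
   'int' (32 bits), a plain 'char' argument uses 8 bits, and a map-to-vector
   pointer argument falls through to POINTER_SIZE (64 bits for LP64).  */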
28526 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28528 static int
28529 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
28530 struct cgraph_simd_clone *clonei,
28531 tree base_type ATTRIBUTE_UNUSED,
28532 int num, bool explicit_p)
28534 tree t, ret_type;
28535 unsigned int nds_elt_bits;
28536 unsigned HOST_WIDE_INT const_simdlen;
28538 if (!TARGET_SIMD)
28539 return 0;
28541   /* For now, SVE simdclones won't produce illegal simdlen, so only check
28542 const simdlens here. */
28543 if (maybe_ne (clonei->simdlen, 0U)
28544 && clonei->simdlen.is_constant (&const_simdlen)
28545 && (const_simdlen < 2
28546 || const_simdlen > 1024
28547 || (const_simdlen & (const_simdlen - 1)) != 0))
28549 if (explicit_p)
28550 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28551 "unsupported simdlen %wd", const_simdlen);
28552 return 0;
28555 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
28556 /* According to AArch64's Vector ABI the type that determines the simdlen is
28557 the narrowest of types, so we ignore base_type for AArch64. */
28558 if (TREE_CODE (ret_type) != VOID_TYPE
28559 && !supported_simd_type (ret_type))
28561 if (!explicit_p)
28563 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
28564 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28565 "GCC does not currently support return type %qT "
28566 "for simd", ret_type);
28567 else
28568 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28569 "unsupported return type %qT for simd",
28570 ret_type);
28571 return 0;
28574 auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
28576 /* We are looking for the NDS type here according to the VFABIA64. */
28577 if (TREE_CODE (ret_type) != VOID_TYPE)
28579 nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
28580 vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
28582 else
28583 nds_elt_bits = POINTER_SIZE;
28585 int i;
28586 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
28587 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
28588 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
28589 t && t != void_list_node; t = TREE_CHAIN (t), i++)
28591 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
28592 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
28593 && !supported_simd_type (arg_type))
28595 if (!explicit_p)
28597 else if (COMPLEX_FLOAT_TYPE_P (arg_type))
28598 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28599 "GCC does not currently support argument type %qT "
28600 "for simd", arg_type);
28601 else
28602 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28603 "unsupported argument type %qT for simd",
28604 arg_type);
28605 return 0;
28607 unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
28608 if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
28609 vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
28610 if (nds_elt_bits > lane_bits)
28611 nds_elt_bits = lane_bits;
28614 clonei->vecsize_mangle = 'n';
28615 clonei->mask_mode = VOIDmode;
28616 poly_uint64 simdlen;
28617 auto_vec<poly_uint64> simdlens (2);
28618 /* Keep track of the possible simdlens the clones of this function can have,
28619 and check them later to see if we support them. */
28620 if (known_eq (clonei->simdlen, 0U))
28622 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28623 simdlens.safe_push (simdlen);
28624 simdlens.safe_push (simdlen * 2);
28626 else
28627 simdlens.safe_push (clonei->simdlen);
28629 clonei->vecsize_int = 0;
28630 clonei->vecsize_float = 0;
28632 /* We currently do not support generating simdclones where vector arguments
28633 do not fit into a single vector register, i.e. vector types that are more
28634 than 128 bits in size. This is because of how we currently represent such
28635 types in the ACLE, where we use a struct to allow us to pass them as
28636 arguments and return values.
28637 This is why we have to check whether the simdlens available for this
28638 simdclone would cause a vector type to be larger than 128 bits, and reject
28639 such a clone. */
28640 unsigned j = 0;
28641 while (j < simdlens.length ())
28643 bool remove_simdlen = false;
28644 for (auto elt : vec_elts)
28645 if (known_gt (simdlens[j] * elt.second, 128U))
28647 /* Don't issue a warning for every simdclone when there is no
28648 specific simdlen clause. */
28649 if (explicit_p && maybe_ne (clonei->simdlen, 0U))
28650 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28651 "GCC does not currently support simdlen %wd for "
28652 "type %qT",
28653 constant_lower_bound (simdlens[j]), elt.first);
28654 remove_simdlen = true;
28655 break;
28657 if (remove_simdlen)
28658 simdlens.ordered_remove (j);
28659 else
28660 j++;
28664 int count = simdlens.length ();
28665 if (count == 0)
28667 if (explicit_p && known_eq (clonei->simdlen, 0U))
28669 /* Warn the user if we can't generate any simdclone. */
28670 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28671 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28672 "GCC does not currently support a simdclone with simdlens"
28673 " %wd and %wd for these types",
28674 constant_lower_bound (simdlen),
28675 constant_lower_bound (simdlen*2));
28677 return 0;
28680 gcc_assert (num < count);
28681 clonei->simdlen = simdlens[num];
28682 return count;
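/* As an illustration (hypothetical declaration, not from any test case): for
   a simd function declared as float f (float, float) with no simdlen clause,
   the NDS is 32 bits, so the candidate simdlens pushed above are 64 / 32 = 2
   and 4, i.e. one clone using 64-bit and one using 128-bit Advanced SIMD
   vectors.  Both satisfy the 128-bit limit, so this returns a count of 2 and
   the caller can then request each of those clones in turn via NUM.  */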
28685 /* Implement TARGET_SIMD_CLONE_ADJUST. */
28687 static void
28688 aarch64_simd_clone_adjust (struct cgraph_node *node)
28690 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
28691 use the correct ABI. */
28693 tree t = TREE_TYPE (node->decl);
28694 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
28695 TYPE_ATTRIBUTES (t));
28698 /* Implement TARGET_SIMD_CLONE_USABLE. */
28700 static int
28701 aarch64_simd_clone_usable (struct cgraph_node *node)
28703 switch (node->simdclone->vecsize_mangle)
28705 case 'n':
28706 if (!TARGET_SIMD)
28707 return -1;
28708 return 0;
28709 default:
28710 gcc_unreachable ();
28714 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
28716 static int
28717 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
28719 auto check_attr = [&](const char *ns, const char *name) {
28720 tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
28721 tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
28722 if (!attr1 && !attr2)
28723 return true;
28725 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
28728 if (!check_attr ("gnu", "aarch64_vector_pcs"))
28729 return 0;
28730 if (!check_attr ("gnu", "Advanced SIMD type"))
28731 return 0;
28732 if (!check_attr ("gnu", "SVE type"))
28733 return 0;
28734 if (!check_attr ("gnu", "SVE sizeless type"))
28735 return 0;
28736 if (!check_attr ("arm", "streaming"))
28737 return 0;
28738 if (!check_attr ("arm", "streaming_compatible"))
28739 return 0;
28740 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
28741 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
28742 return 0;
28743 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
28744 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
28745 return 0;
28746 return 1;
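/* For example, two otherwise-identical function types where only one is
   marked with "aarch64_vector_pcs", or where only one is declared
   [[arm::streaming]], compare as incompatible here; types that agree on all
   of the attributes checked above compare as compatible.  */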
28749 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
28751 static tree
28752 aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
28754 tree old_attrs = DECL_ATTRIBUTES (olddecl);
28755 tree old_new = lookup_attribute ("arm", "new", old_attrs);
28757 tree new_attrs = DECL_ATTRIBUTES (newdecl);
28758 tree new_new = lookup_attribute ("arm", "new", new_attrs);
28760 if (DECL_INITIAL (olddecl) && new_new)
28762 error ("cannot apply attribute %qs to %q+D after the function"
28763 " has been defined", "new", newdecl);
28764 inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
28765 newdecl);
28767 else
28769 if (old_new && new_new)
28771 old_attrs = remove_attribute ("arm", "new", old_attrs);
28772 TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
28773 TREE_VALUE (old_new));
28775 if (new_new)
28776 aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
28779 return merge_attributes (old_attrs, new_attrs);
28782 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
28784 static const char *
28785 aarch64_get_multilib_abi_name (void)
28787 if (TARGET_BIG_END)
28788 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
28789 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
28792 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
28793 global-variable-based guard, use the default; otherwise
28794 return a null tree. */
28795 static tree
28796 aarch64_stack_protect_guard (void)
28798 if (aarch64_stack_protector_guard == SSP_GLOBAL)
28799 return default_stack_protect_guard ();
28801 return NULL_TREE;
28804 /* Return the diagnostic message string if the binary operation OP is
28805 not permitted on TYPE1 and TYPE2, NULL otherwise. */
28807 static const char *
28808 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
28809 const_tree type2)
28811 if (VECTOR_TYPE_P (type1)
28812 && VECTOR_TYPE_P (type2)
28813 && !TYPE_INDIVISIBLE_P (type1)
28814 && !TYPE_INDIVISIBLE_P (type2)
28815 && (aarch64_sve::builtin_type_p (type1)
28816 != aarch64_sve::builtin_type_p (type2)))
28817 return N_("cannot combine GNU and SVE vectors in a binary operation");
28819 /* Operation allowed. */
28820 return NULL;
28823 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
28824 compiler that we automatically ignore the top byte of our pointers, which
28825 allows using -fsanitize=hwaddress. */
28826 bool
28827 aarch64_can_tag_addresses ()
28829 return !TARGET_ILP32;
28832 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
28833 section at the end if needed. */
28834 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
28835 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
28836 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
28837 void
28838 aarch64_file_end_indicate_exec_stack ()
28840 file_end_indicate_exec_stack ();
28842 unsigned feature_1_and = 0;
28843 if (aarch_bti_enabled ())
28844 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
28846 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
28847 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
28849 if (feature_1_and)
28851 /* Generate .note.gnu.property section. */
28852 switch_to_section (get_section (".note.gnu.property",
28853 SECTION_NOTYPE, NULL));
28855 /* PT_NOTE header: namesz, descsz, type.
28856 namesz = 4 ("GNU\0")
28857 descsz = 16 (Size of the program property array)
28858 [(12 + padding) * Number of array elements]
28859 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
28860 assemble_align (POINTER_SIZE);
28861 assemble_integer (GEN_INT (4), 4, 32, 1);
28862 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
28863 assemble_integer (GEN_INT (5), 4, 32, 1);
28865 /* PT_NOTE name. */
28866 assemble_string ("GNU", 4);
28868 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
28869 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
28870 datasz = 4
28871 data = feature_1_and. */
28872 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
28873 assemble_integer (GEN_INT (4), 4, 32, 1);
28874 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
28876 /* Pad the size of the note to the required alignment. */
28877 assemble_align (POINTER_SIZE);
28880 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
28881 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
28882 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
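/* For reference, when both BTI and return-address signing are enabled on an
   LP64 target, the note emitted above amounts to something like the
   following (a sketch only; the exact output comes from assemble_align,
   assemble_integer and assemble_string):

   .section .note.gnu.property
   .align 3
   .word 4 // namesz ("GNU\0")
   .word 16 // descsz = ROUND_UP (12, 8)
   .word 5 // NT_GNU_PROPERTY_TYPE_0
   .string "GNU"
   .word 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
   .word 4 // datasz
   .word 3 // BTI | PAC
   .align 3  */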
28884 /* Helper function for straight line speculation.
28885 Return what barrier should be emitted for straight line speculation
28886 mitigation.
28887 When not mitigating against straight line speculation this function returns
28888 an empty string.
28889 When mitigating against straight line speculation, use:
28890 * SB when the v8.5-A SB extension is enabled.
28891 * DSB+ISB otherwise. */
28892 const char *
28893 aarch64_sls_barrier (int mitigation_required)
28895 return mitigation_required
28896 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
28897 : "";
28900 static GTY (()) tree aarch64_sls_shared_thunks[30];
28901 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
28902 const char *indirect_symbol_names[30] = {
28903 "__call_indirect_x0",
28904 "__call_indirect_x1",
28905 "__call_indirect_x2",
28906 "__call_indirect_x3",
28907 "__call_indirect_x4",
28908 "__call_indirect_x5",
28909 "__call_indirect_x6",
28910 "__call_indirect_x7",
28911 "__call_indirect_x8",
28912 "__call_indirect_x9",
28913 "__call_indirect_x10",
28914 "__call_indirect_x11",
28915 "__call_indirect_x12",
28916 "__call_indirect_x13",
28917 "__call_indirect_x14",
28918 "__call_indirect_x15",
28919 "", /* "__call_indirect_x16", */
28920 "", /* "__call_indirect_x17", */
28921 "__call_indirect_x18",
28922 "__call_indirect_x19",
28923 "__call_indirect_x20",
28924 "__call_indirect_x21",
28925 "__call_indirect_x22",
28926 "__call_indirect_x23",
28927 "__call_indirect_x24",
28928 "__call_indirect_x25",
28929 "__call_indirect_x26",
28930 "__call_indirect_x27",
28931 "__call_indirect_x28",
28932 "__call_indirect_x29",
28935 /* Function to create a BLR thunk. This thunk is used to mitigate straight
28936 line speculation. Instead of a simple BLR that can be speculated past,
28937 we emit a BL to this thunk, and this thunk contains a BR to the relevant
28938 register. These thunks have the relevant speculation barriers put after
28939 their indirect branch so that speculation is blocked.
28941 We use such a thunk so the speculation barriers are kept off the
28942 architecturally executed path in order to reduce the performance overhead.
28944 When optimizing for size we use stubs shared by the linked object.
28945 When optimizing for performance we emit stubs for each function in the hope
28946 that the branch predictor can better train on jumps specific for a given
28947 function. */
28948 static rtx
28949 aarch64_sls_create_blr_label (int regnum)
28951 gcc_assert (STUB_REGNUM_P (regnum));
28952 if (optimize_function_for_size_p (cfun))
28954 /* For the thunks shared between different functions in this compilation
28955 unit we use a named symbol -- this is just for users to more easily
28956 understand the generated assembly. */
28957 aarch64_sls_shared_thunks_needed = true;
28958 const char *thunk_name = indirect_symbol_names[regnum];
28959 if (aarch64_sls_shared_thunks[regnum] == NULL)
28961 /* Build a decl representing this function stub and record it for
28962 later. We build a decl here so we can use the GCC machinery for
28963 handling sections automatically (through `get_named_section` and
28964 `make_decl_one_only`). That saves us a lot of trouble handling
28965 the specifics of different output file formats. */
28966 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
28967 get_identifier (thunk_name),
28968 build_function_type_list (void_type_node,
28969 NULL_TREE));
28970 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
28971 NULL_TREE, void_type_node);
28972 TREE_PUBLIC (decl) = 1;
28973 TREE_STATIC (decl) = 1;
28974 DECL_IGNORED_P (decl) = 1;
28975 DECL_ARTIFICIAL (decl) = 1;
28976 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
28977 resolve_unique_section (decl, 0, false);
28978 aarch64_sls_shared_thunks[regnum] = decl;
28981 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
28984 if (cfun->machine->call_via[regnum] == NULL)
28985 cfun->machine->call_via[regnum]
28986 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
28987 return cfun->machine->call_via[regnum];
28990 /* Helper function for aarch64_sls_emit_blr_function_thunks and
28991 aarch64_sls_emit_shared_blr_thunks below. */
28992 static void
28993 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
28995 /* Save in x16 and branch to that function so this transformation does
28996 not prevent jumping to `BTI c` instructions. */
28997 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
28998 asm_fprintf (out_file, "\tbr\tx16\n");
29001 /* Emit all BLR stubs for this particular function.
29002 Here we emit all the BLR stubs needed for the current function. Since we
29003 emit these stubs in a consecutive block we know there will be no speculation
29004 gadgets between each stub, and hence we only emit a speculation barrier at
29005 the end of the stub sequences.
29007 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29008 void
29009 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
29011 if (! aarch64_harden_sls_blr_p ())
29012 return;
29014 bool any_functions_emitted = false;
29015 /* We must save and restore the current function section since this assembly
29016 is emitted at the end of the function. This means it can be emitted *just
29017 after* the cold section of a function. That cold part would be emitted in
29018 a different section. That switch would trigger a `.cfi_endproc` directive
29019 to be emitted in the original section and a `.cfi_startproc` directive to
29020 be emitted in the new section. Switching to the original section without
29021 restoring would mean that the `.cfi_endproc` emitted as a function ends
29022 would happen in a different section -- leaving an unmatched
29023 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29024 in the standard text section. */
29025 section *save_text_section = in_section;
29026 switch_to_section (function_section (current_function_decl));
29027 for (int regnum = 0; regnum < 30; ++regnum)
29029 rtx specu_label = cfun->machine->call_via[regnum];
29030 if (specu_label == NULL)
29031 continue;
29033 targetm.asm_out.print_operand (out_file, specu_label, 0);
29034 asm_fprintf (out_file, ":\n");
29035 aarch64_sls_emit_function_stub (out_file, regnum);
29036 any_functions_emitted = true;
29038 if (any_functions_emitted)
29039 /* Can use the SB if needs be here, since this stub will only be used
29040 by the current function, and hence for the current target. */
29041 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
29042 switch_to_section (save_text_section);
29045 /* Emit shared BLR stubs for the current compilation unit.
29046 Over the course of compiling this unit we may have converted some BLR
29047 instructions to a BL to a shared stub function. This is where we emit those
29048 stub functions.
29049 This function is for the stubs shared between different functions in this
29050 compilation unit. We share when optimizing for size instead of speed.
29052 This function is called through the TARGET_ASM_FILE_END hook. */
29053 void
29054 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
29056 if (! aarch64_sls_shared_thunks_needed)
29057 return;
29059 for (int regnum = 0; regnum < 30; ++regnum)
29061 tree decl = aarch64_sls_shared_thunks[regnum];
29062 if (!decl)
29063 continue;
29065 const char *name = indirect_symbol_names[regnum];
29066 switch_to_section (get_named_section (decl, NULL, 0));
29067 ASM_OUTPUT_ALIGN (out_file, 2);
29068 targetm.asm_out.globalize_label (out_file, name);
29069 /* Only emits if the compiler is configured for an assembler that can
29070 handle visibility directives. */
29071 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
29072 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
29073 ASM_OUTPUT_LABEL (out_file, name);
29074 aarch64_sls_emit_function_stub (out_file, regnum);
29075 /* Use the most conservative target to ensure it can always be used by any
29076 function in the translation unit. */
29077 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
29078 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
29082 /* Implement TARGET_ASM_FILE_END. */
29083 void
29084 aarch64_asm_file_end ()
29086 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
29087 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29088 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29089 for FreeBSD) still gets called. */
29090 #ifdef TARGET_ASM_FILE_END
29091 TARGET_ASM_FILE_END ();
29092 #endif
29095 const char *
29096 aarch64_indirect_call_asm (rtx addr)
29098 gcc_assert (REG_P (addr));
29099 if (aarch64_harden_sls_blr_p ())
29101 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
29102 output_asm_insn ("bl\t%0", &stub_label);
29104 else
29105 output_asm_insn ("blr\t%0", &addr);
29106 return "";
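/* For example, with SLS hardening of BLR enabled, an indirect call through
   x3 is not emitted as "blr x3"; instead we emit "bl __call_indirect_x3"
   (the shared stub, when optimizing for size) or a BL to a per-function
   label, and the stub itself performs "mov x16, x3; br x16" followed by the
   speculation barrier emitted with the stub above.  */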
29109 /* Emit the assembly instruction to load the thread pointer into DEST.
29110 Select between different tpidr_elN registers depending on -mtp= setting. */
29112 const char *
29113 aarch64_output_load_tp (rtx dest)
29115 const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29116 "tpidr_el3", "tpidrro_el0"};
29117 char buffer[64];
29118 snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
29119 tpidrs[aarch64_tpidr_register]);
29120 output_asm_insn (buffer, &dest);
29121 return "";
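/* For instance, when aarch64_tpidr_register selects index 0 in the table
   above, the instruction printed for DEST is "mrs <dest>, tpidr_el0".  */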
29124 /* Set up the value of REG_ALLOC_ORDER from scratch.
29126 It was previously good practice to put call-clobbered registers ahead
29127 of call-preserved registers, but that isn't necessary these days.
29128 IRA's model of register save/restore costs is much more sophisticated
29129 than the model that a simple ordering could provide. We leave
29130 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29131 of IRA's model.
29133 However, it is still useful to list registers that are members of
29134 multiple classes after registers that are members of fewer classes.
29135 For example, we have:
29137 - FP_LO8_REGS: v0-v7
29138 - FP_LO_REGS: v0-v15
29139 - FP_REGS: v0-v31
29141 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29142 we run the risk of starving other (lower-priority) pseudos that
29143 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29144 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29145 Allocating downwards rather than upwards avoids this problem, at least
29146 in code that has reasonable register pressure.
29148 The situation for predicate registers is similar. */
29150 void
29151 aarch64_adjust_reg_alloc_order ()
29153 for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
29154 if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
29155 reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
29156 else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
29157 reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
29158 else
29159 reg_alloc_order[i] = i;
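/* The resulting order therefore lists v31, v30, ..., v0 for the FP/SIMD
   registers and p15, p14, ..., p0 for the predicate registers, while every
   other register keeps its natural position.  */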
29162 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29163 of vector mode MODE to select half the elements of that vector.
29164 Allow any combination of indices except duplicates (or indices out of
29165 range of the mode's units). */
29167 bool
29168 aarch64_parallel_select_half_p (machine_mode mode, rtx par)
29170 int nunits = XVECLEN (par, 0);
29171 if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
29172 return false;
29173 int mode_nunits = nunits * 2;
29174 /* Put all the elements of PAR into a hash_set and use its
29175 uniqueness guarantees to check that we don't try to insert the same
29176 element twice. */
29177 hash_set<rtx> parset;
29178 for (int i = 0; i < nunits; ++i)
29180 rtx elt = XVECEXP (par, 0, i);
29181 if (!CONST_INT_P (elt)
29182 || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
29183 || parset.add (elt))
29184 return false;
29186 return true;
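/* For example, with MODE == V4SImode (four units) a PARALLEL of CONST_INTs
   [0, 2] or [3, 1] is accepted, whereas [0, 0] (a duplicate) and [0, 4]
   (out of range) are rejected.  This is purely illustrative of the checks
   above.  */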
29189 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29190 contain any common elements. */
29192 bool
29193 aarch64_pars_overlap_p (rtx par1, rtx par2)
29195 int len1 = XVECLEN (par1, 0);
29196 int len2 = XVECLEN (par2, 0);
29197 hash_set<rtx> parset;
29198 for (int i = 0; i < len1; ++i)
29199 parset.add (XVECEXP (par1, 0, i));
29200 for (int i = 0; i < len2; ++i)
29201 if (parset.contains (XVECEXP (par2, 0, i)))
29202 return true;
29203 return false;
29206 /* Implement OPTIMIZE_MODE_SWITCHING. */
29208 bool
29209 aarch64_optimize_mode_switching (aarch64_mode_entity entity)
29211 bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
29212 || (aarch64_cfun_has_new_state ("za")
29213 && df_regs_ever_live_p (ZA_REGNUM))
29214 || (aarch64_cfun_has_new_state ("zt0")
29215 && df_regs_ever_live_p (ZT0_REGNUM)));
29217 if (have_sme_state && nonlocal_goto_handler_labels)
29219 static bool reported;
29220 if (!reported)
29222 sorry ("non-local gotos in functions with SME state");
29223 reported = true;
29227 switch (entity)
29229 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29230 case aarch64_mode_entity::LOCAL_SME_STATE:
29231 return have_sme_state && !nonlocal_goto_handler_labels;
29233 gcc_unreachable ();
29236 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29238 static void
29239 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
29240 aarch64_tristate_mode prev_mode)
29242 if (mode == aarch64_tristate_mode::YES)
29244 gcc_assert (prev_mode == aarch64_tristate_mode::NO);
29245 aarch64_init_tpidr2_block ();
29247 else
29248 gcc_unreachable ();
29251 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29253 static void
29254 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
29255 aarch64_local_sme_state prev_mode)
29257 /* Back-propagation should ensure that we're always starting from
29258 a known mode. */
29259 gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
29261 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29263 /* Commit any uncommitted lazy save. This leaves ZA either active
29264 and zero (lazy save case) or off (normal case).
29266 The sequence is:
29268 mrs <temp>, tpidr2_el0
29269 cbz <temp>, no_save
29270 bl __arm_tpidr2_save
29271 msr tpidr2_el0, xzr
29272 zero { za } // Only if ZA is live
29273 no_save: */
29274 bool is_active = (mode == aarch64_local_sme_state::ACTIVE_LIVE
29275 || mode == aarch64_local_sme_state::ACTIVE_DEAD);
29276 auto tmp_reg = gen_reg_rtx (DImode);
29277 auto active_flag = gen_int_mode (is_active, DImode);
29278 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
29279 emit_insn (gen_aarch64_commit_lazy_save (tmp_reg, active_flag));
29282 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29283 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29285 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29287 /* Make ZA active after being inactive.
29289 First handle the case in which the lazy save we set up was
29290 committed by a callee. If the function's source-level ZA state
29291 is live then we must conditionally restore it from the lazy
29292 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29293 if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
29294 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29295 else
29296 emit_insn (gen_aarch64_smstart_za ());
29298 /* Now handle the case in which the lazy save was not committed.
29299 In that case, ZA still contains the current function's ZA state,
29300 and we just need to cancel the lazy save. */
29301 emit_insn (gen_aarch64_clear_tpidr2 ());
29303 /* Restore the ZT0 state, if we have some. */
29304 if (aarch64_cfun_has_state ("zt0"))
29305 aarch64_restore_zt0 (true);
29307 return;
29310 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
29312 /* Retrieve the current function's ZA state from the lazy save
29313 buffer. */
29314 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29316 /* Restore the ZT0 state, if we have some. */
29317 if (aarch64_cfun_has_state ("zt0"))
29318 aarch64_restore_zt0 (true);
29319 return;
29322 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
29323 || prev_mode == aarch64_local_sme_state::OFF)
29325 /* INACTIVE_CALLER means that we are enabling ZA for the first
29326 time in this function. The code above means that ZA is either
29327 active and zero (if we committed a lazy save) or off. Handle
29328 the latter case by forcing ZA on.
29330 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29331 to force it to 1.
29333 Both cases leave ZA zeroed. */
29334 emit_insn (gen_aarch64_smstart_za ());
29336 /* Restore the ZT0 state, if we have some. */
29337 if (prev_mode == aarch64_local_sme_state::OFF
29338 && aarch64_cfun_has_state ("zt0"))
29339 aarch64_restore_zt0 (true);
29340 return;
29343 if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29344 || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
29345 /* A simple change in liveness, such as in a CFG structure where
29346 ZA is only conditionally defined. No code is needed. */
29347 return;
29349 gcc_unreachable ();
29352 if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29354 if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29355 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29356 || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29358 /* Save the ZT0 state, if we have some. */
29359 if (aarch64_cfun_has_state ("zt0"))
29360 aarch64_save_zt0 ();
29362 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29363 case of setting up a lazy save buffer before a call.
29364 A transition from INACTIVE_CALLER is similar, except that
29365 the contents of ZA are known to be zero.
29367 A transition from ACTIVE_DEAD means that ZA is live at the
29368 point of the transition, but is dead on at least one incoming
29369 edge. (That is, ZA is only conditionally initialized.)
29370 For efficiency, we want to set up a lazy save even for
29371 dead contents, since forcing ZA off would make later code
29372 restore ZA from the lazy save buffer. */
29373 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29374 return;
29377 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
29378 || prev_mode == aarch64_local_sme_state::OFF)
29379 /* We're simply discarding the information about which inactive
29380 state applies. */
29381 return;
29383 gcc_unreachable ();
29386 if (mode == aarch64_local_sme_state::INACTIVE_CALLER
29387 || mode == aarch64_local_sme_state::OFF)
29389 /* Save the ZT0 state, if we have some. */
29390 if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29391 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
29392 && mode == aarch64_local_sme_state::OFF
29393 && aarch64_cfun_has_state ("zt0"))
29394 aarch64_save_zt0 ();
29396 /* The transition to INACTIVE_CALLER is used before returning from
29397 new("za") functions. Any state in ZA belongs to the current
29398 function rather than a caller, but that state is no longer
29399 needed. Clear any pending lazy save and turn ZA off.
29401 The transition to OFF is used before calling a private-ZA function.
29402 We committed any incoming lazy save above, so at this point any
29403 contents in ZA belong to the current function. */
29404 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29405 emit_insn (gen_aarch64_clear_tpidr2 ());
29407 if (prev_mode != aarch64_local_sme_state::OFF
29408 && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
29409 emit_insn (gen_aarch64_smstop_za ());
29411 return;
29414 if (mode == aarch64_local_sme_state::SAVED_LOCAL)
29416 /* This is a transition to an exception handler. */
29417 gcc_assert (prev_mode == aarch64_local_sme_state::OFF
29418 || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
29419 return;
29422 gcc_unreachable ();
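/* To sketch the most common case handled above (not an exhaustive list of
   transitions): around a call to a private-ZA function from a function whose
   ZA state is live, mode switching asks for ACTIVE_LIVE -> INACTIVE_LOCAL
   before the call (set up a lazy save via TPIDR2_EL0) and
   INACTIVE_LOCAL -> ACTIVE_LIVE afterwards (restore from the lazy save
   buffer if the callee committed the save, or simply clear TPIDR2_EL0 if it
   did not).  */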
29425 /* Implement TARGET_MODE_EMIT. */
29427 static void
29428 aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
29430 if (mode == prev_mode)
29431 return;
29433 start_sequence ();
29434 switch (aarch64_mode_entity (entity))
29436 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29437 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
29438 aarch64_tristate_mode (prev_mode));
29439 break;
29441 case aarch64_mode_entity::LOCAL_SME_STATE:
29442 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
29443 aarch64_local_sme_state (prev_mode));
29444 break;
29446 rtx_insn *seq = get_insns ();
29447 end_sequence ();
29449 /* Get the set of clobbered registers that are currently live. */
29450 HARD_REG_SET clobbers = {};
29451 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
29453 vec_rtx_properties properties;
29454 properties.add_insn (insn, false);
29455 for (rtx_obj_reference ref : properties.refs ())
29456 if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
29457 SET_HARD_REG_BIT (clobbers, ref.regno);
29459 clobbers &= live;
29461 /* Emit instructions to save clobbered registers to pseudos. Queue
29462 instructions to restore the registers afterwards.
29464 This should only be needed in rare situations. */
29465 auto_vec<rtx, 33> after;
29466 for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
29467 if (TEST_HARD_REG_BIT (clobbers, regno))
29469 rtx hard_reg = gen_rtx_REG (DImode, regno);
29470 rtx pseudo_reg = gen_reg_rtx (DImode);
29471 emit_move_insn (pseudo_reg, hard_reg);
29472 after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
29474 if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
29476 rtx pseudo_reg = gen_reg_rtx (DImode);
29477 emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
29478 after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
29481 /* Emit the transition instructions themselves. */
29482 emit_insn (seq);
29484 /* Restore the clobbered registers. */
29485 for (auto *insn : after)
29486 emit_insn (insn);
29489 /* Return true if INSN references the SME state represented by hard register
29490 REGNO. */
29492 static bool
29493 aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
29495 df_ref ref;
29496 FOR_EACH_INSN_DEF (ref, insn)
29497 if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
29498 && DF_REF_REGNO (ref) == regno)
29499 return true;
29500 FOR_EACH_INSN_USE (ref, insn)
29501 if (DF_REF_REGNO (ref) == regno)
29502 return true;
29503 return false;
29506 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29508 static aarch64_local_sme_state
29509 aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
29511 if (!CALL_P (insn)
29512 && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29514 static bool reported;
29515 if (!reported)
29517 sorry ("catching non-call exceptions in functions with SME state");
29518 reported = true;
29520 /* Aim for graceful error recovery by picking the value that is
29521 least likely to generate an ICE. */
29522 return aarch64_local_sme_state::INACTIVE_LOCAL;
29525 /* A non-local goto is equivalent to a return. We disallow non-local
29526 receivers in functions with SME state, so we know that the target
29527 expects ZA to be dormant or off. */
29528 if (JUMP_P (insn)
29529 && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
29530 return aarch64_local_sme_state::INACTIVE_CALLER;
29532 /* start_private_za_call and end_private_za_call bracket a sequence
29533 that calls a private-ZA function. Force ZA to be turned off if the
29534 function doesn't have any live ZA state, otherwise require ZA to be
29535 inactive. */
29536 auto icode = recog_memoized (insn);
29537 if (icode == CODE_FOR_aarch64_start_private_za_call
29538 || icode == CODE_FOR_aarch64_end_private_za_call)
29539 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29540 ? aarch64_local_sme_state::INACTIVE_LOCAL
29541 : aarch64_local_sme_state::OFF);
29543 /* Force ZA to contain the current function's ZA state if INSN wants
29544 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29545 are both controlled by PSTATE.ZA. */
29546 if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
29547 || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
29548 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29549 ? aarch64_local_sme_state::ACTIVE_LIVE
29550 : aarch64_local_sme_state::ACTIVE_DEAD);
29552 return aarch64_local_sme_state::ANY;
29555 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29557 static aarch64_tristate_mode
29558 aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
29560 /* We need to set up a lazy save buffer no later than the first
29561 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29562 if (aarch64_mode_needed_local_sme_state (insn, live)
29563 == aarch64_local_sme_state::INACTIVE_LOCAL)
29564 return aarch64_tristate_mode::YES;
29566 /* Also make sure that the lazy save buffer is set up before the first
29567 insn that throws internally. The exception handler will sometimes
29568 load from it. */
29569 if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29570 return aarch64_tristate_mode::YES;
29572 return aarch64_tristate_mode::MAYBE;
29575 /* Implement TARGET_MODE_NEEDED. */
29577 static int
29578 aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
29580 switch (aarch64_mode_entity (entity))
29582 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29583 return int (aarch64_mode_needed_za_save_buffer (insn, live));
29585 case aarch64_mode_entity::LOCAL_SME_STATE:
29586 return int (aarch64_mode_needed_local_sme_state (insn, live));
29588 gcc_unreachable ();
29591 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29593 static aarch64_local_sme_state
29594 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
29595 HARD_REG_SET live)
29597 /* Note places where ZA dies, so that we can try to avoid saving and
29598 restoring state that isn't needed. */
29599 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29600 && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
29601 return aarch64_local_sme_state::ACTIVE_DEAD;
29603 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29604 function. */
29605 if (mode == aarch64_local_sme_state::ACTIVE_DEAD
29606 && TEST_HARD_REG_BIT (live, ZA_REGNUM))
29607 return aarch64_local_sme_state::ACTIVE_LIVE;
29609 return mode;
29612 /* Implement TARGET_MODE_AFTER. */
29614 static int
29615 aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
29617 switch (aarch64_mode_entity (entity))
29619 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29620 return mode;
29622 case aarch64_mode_entity::LOCAL_SME_STATE:
29623 return int (aarch64_mode_after_local_sme_state
29624 (aarch64_local_sme_state (mode), live));
29626 gcc_unreachable ();
29629 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
29631 static aarch64_local_sme_state
29632 aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
29633 aarch64_local_sme_state mode2)
29635 /* Perform a symmetrical check for two values. */
29636 auto is_pair = [&](aarch64_local_sme_state val1,
29637 aarch64_local_sme_state val2)
29639 return ((mode1 == val1 && mode2 == val2)
29640 || (mode1 == val2 && mode2 == val1));
29643 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
29644 to a caller. OFF is one of the options. */
29645 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
29646 aarch64_local_sme_state::OFF))
29647 return aarch64_local_sme_state::INACTIVE_CALLER;
29649 /* Similarly for dormant contents belonging to the current function. */
29650 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
29651 aarch64_local_sme_state::OFF))
29652 return aarch64_local_sme_state::INACTIVE_LOCAL;
29654 /* Treat a conditionally-initialized value as a fully-initialized value. */
29655 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
29656 aarch64_local_sme_state::ACTIVE_DEAD))
29657 return aarch64_local_sme_state::ACTIVE_LIVE;
29659 return aarch64_local_sme_state::ANY;
29662 /* Implement TARGET_MODE_CONFLUENCE. */
29664 static int
29665 aarch64_mode_confluence (int entity, int mode1, int mode2)
29667 gcc_assert (mode1 != mode2);
29668 switch (aarch64_mode_entity (entity))
29670 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29671 return int (aarch64_tristate_mode::MAYBE);
29673 case aarch64_mode_entity::LOCAL_SME_STATE:
29674 return int (aarch64_local_sme_confluence
29675 (aarch64_local_sme_state (mode1),
29676 aarch64_local_sme_state (mode2)));
29678 gcc_unreachable ();
29681 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
29682 NO throughout, or makes one transition from NO to YES. */
29684 static aarch64_tristate_mode
29685 aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
29686 aarch64_tristate_mode mode2)
29688 /* Keep bringing the transition forward until it starts from NO. */
29689 if (mode1 == aarch64_tristate_mode::MAYBE
29690 && mode2 == aarch64_tristate_mode::YES)
29691 return mode2;
29693 return aarch64_tristate_mode::MAYBE;
29696 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
29698 static aarch64_local_sme_state
29699 aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
29700 aarch64_local_sme_state mode2)
29702 /* We always need to know what the current state is when transitioning
29703 to a new state. Force any location with indeterminate starting state
29704 to be active. */
29705 if (mode1 == aarch64_local_sme_state::ANY)
29706 switch (mode2)
29708 case aarch64_local_sme_state::INACTIVE_CALLER:
29709 case aarch64_local_sme_state::OFF:
29710 case aarch64_local_sme_state::ACTIVE_DEAD:
29711 /* The current function's ZA state is not live. */
29712 return aarch64_local_sme_state::ACTIVE_DEAD;
29714 case aarch64_local_sme_state::INACTIVE_LOCAL:
29715 case aarch64_local_sme_state::ACTIVE_LIVE:
29716 /* The current function's ZA state is live. */
29717 return aarch64_local_sme_state::ACTIVE_LIVE;
29719 case aarch64_local_sme_state::SAVED_LOCAL:
29720 /* This is a transition to an exception handler. Since we don't
29721 support non-call exceptions for SME functions, the source of
29722 the transition must be known. We'll assert later if that's
29723 not the case. */
29724 return aarch64_local_sme_state::ANY;
29726 case aarch64_local_sme_state::ANY:
29727 return aarch64_local_sme_state::ANY;
29730 return aarch64_local_sme_state::ANY;
29733 /* Implement TARGET_MODE_BACKPROP. */
29735 static int
29736 aarch64_mode_backprop (int entity, int mode1, int mode2)
29738 switch (aarch64_mode_entity (entity))
29740 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29741 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
29742 aarch64_tristate_mode (mode2)));
29744 case aarch64_mode_entity::LOCAL_SME_STATE:
29745 return int (aarch64_local_sme_backprop
29746 (aarch64_local_sme_state (mode1),
29747 aarch64_local_sme_state (mode2)));
29749 gcc_unreachable ();
29752 /* Implement TARGET_MODE_ENTRY. */
29754 static int
29755 aarch64_mode_entry (int entity)
29757 switch (aarch64_mode_entity (entity))
29759 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29760 return int (aarch64_tristate_mode::NO);
29762 case aarch64_mode_entity::LOCAL_SME_STATE:
29763 return int (aarch64_cfun_shared_flags ("za") != 0
29764 ? aarch64_local_sme_state::ACTIVE_LIVE
29765 : aarch64_cfun_incoming_pstate_za () != 0
29766 ? aarch64_local_sme_state::ACTIVE_DEAD
29767 : aarch64_local_sme_state::INACTIVE_CALLER);
29769 gcc_unreachable ();
29772 /* Implement TARGET_MODE_EXIT. */
29774 static int
29775 aarch64_mode_exit (int entity)
29777 switch (aarch64_mode_entity (entity))
29779 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29780 return int (aarch64_tristate_mode::MAYBE);
29782 case aarch64_mode_entity::LOCAL_SME_STATE:
29783 return int (aarch64_cfun_shared_flags ("za") != 0
29784 ? aarch64_local_sme_state::ACTIVE_LIVE
29785 : aarch64_cfun_incoming_pstate_za () != 0
29786 ? aarch64_local_sme_state::ACTIVE_DEAD
29787 : aarch64_local_sme_state::INACTIVE_CALLER);
29789 gcc_unreachable ();
29792 /* Implement TARGET_MODE_EH_HANDLER. */
29794 static int
29795 aarch64_mode_eh_handler (int entity)
29797 switch (aarch64_mode_entity (entity))
29799 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29800 /* Require a lazy save buffer to be allocated before the first
29801 insn that can throw. */
29802 return int (aarch64_tristate_mode::YES);
29804 case aarch64_mode_entity::LOCAL_SME_STATE:
29805 return int (aarch64_local_sme_state::SAVED_LOCAL);
29807 gcc_unreachable ();
29810 /* Implement TARGET_MODE_PRIORITY. */
29812 static int
29813 aarch64_mode_priority (int, int n)
29815 return n;
29818 /* Implement TARGET_MD_ASM_ADJUST. */
29820 static rtx_insn *
29821 aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
29822 vec<machine_mode> &input_modes,
29823 vec<const char *> &constraints,
29824 vec<rtx> &uses, vec<rtx> &clobbers,
29825 HARD_REG_SET &clobbered_regs, location_t loc)
29827 rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
29828 uses, clobbers, clobbered_regs, loc);
29830 /* "za" in the clobber list of a function with ZA state is defined to
29831 mean that the asm can read from and write to ZA. We can model the
29832 read using a USE, but unfortunately, it's not possible to model the
29833 write directly. Use a separate insn to model the effect.
29835 We must ensure that ZA is active on entry, which is enforced by using
29836 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
29838 The same thing applies to ZT0. */
29839 if (TARGET_ZA)
29840 for (unsigned int i = clobbers.length (); i-- > 0; )
29842 rtx x = clobbers[i];
29843 if (REG_P (x)
29844 && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
29846 auto id = cfun->machine->next_asm_update_za_id++;
29848 start_sequence ();
29849 if (seq)
29850 emit_insn (seq);
29851 rtx id_rtx = gen_int_mode (id, SImode);
29852 emit_insn (REGNO (x) == ZA_REGNUM
29853 ? gen_aarch64_asm_update_za (id_rtx)
29854 : gen_aarch64_asm_update_zt0 (id_rtx));
29855 seq = get_insns ();
29856 end_sequence ();
29858 auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
29859 uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
29860 uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
29862 clobbers.ordered_remove (i);
29863 CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
29866 return seq;
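/* As an example of the transformation above (illustrative only): in a
   function with ZA state, an asm such as asm volatile ("..." ::: "za") no
   longer clobbers ZA; instead the asm gains uses of ZA and SME_STATE_REGNUM
   and is followed by an aarch64_asm_update_za insn that models the possible
   write to ZA.  */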
29869 /* BB is the target of an exception or nonlocal goto edge, which means
29870 that PSTATE.SM is known to be 0 on entry. Put it into the state that
29871 the current function requires. */
29873 static bool
29874 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb)
29876 if (TARGET_NON_STREAMING)
29877 return false;
29879 start_sequence ();
29880 rtx_insn *guard_label = nullptr;
29881 if (TARGET_STREAMING_COMPATIBLE)
29882 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29883 AARCH64_FL_SM_OFF);
29884 aarch64_sme_mode_switch_regs args_switch;
29885 args_switch.add_call_preserved_regs (df_get_live_in (bb));
29886 args_switch.emit_prologue ();
29887 aarch64_switch_pstate_sm (AARCH64_FL_SM_OFF, AARCH64_FL_SM_ON);
29888 args_switch.emit_epilogue ();
29889 if (guard_label)
29890 emit_label (guard_label);
29891 auto seq = get_insns ();
29892 end_sequence ();
29894 emit_insn_after (seq, bb_note (bb));
29895 return true;
29898 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
29899 so arrange to make it so. */
29901 static bool
29902 aarch64_switch_pstate_sm_for_jump (rtx_insn *jump)
29904 if (TARGET_NON_STREAMING)
29905 return false;
29907 start_sequence ();
29908 rtx_insn *guard_label = nullptr;
29909 if (TARGET_STREAMING_COMPATIBLE)
29910 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29911 AARCH64_FL_SM_OFF);
29912 aarch64_switch_pstate_sm (AARCH64_FL_SM_ON, AARCH64_FL_SM_OFF);
29913 if (guard_label)
29914 emit_label (guard_label);
29915 auto seq = get_insns ();
29916 end_sequence ();
29918 emit_insn_before (seq, jump);
29919 return true;
29922 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
29923 to switch to the new mode and the instructions needed to restore the
29924 original mode. Return true if something changed. */
29925 static bool
29926 aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
29928 /* Mode switches for sibling calls are handled via the epilogue. */
29929 if (SIBLING_CALL_P (call))
29930 return false;
29932 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
29933 if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
29934 return false;
29936 /* Switch mode before the call, preserving any argument registers
29937 across the switch. */
29938 start_sequence ();
29939 rtx_insn *args_guard_label = nullptr;
29940 if (TARGET_STREAMING_COMPATIBLE)
29941 args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29942 callee_isa_mode);
29943 aarch64_sme_mode_switch_regs args_switch;
29944 args_switch.add_call_args (call);
29945 args_switch.emit_prologue ();
29946 aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
29947 args_switch.emit_epilogue ();
29948 if (args_guard_label)
29949 emit_label (args_guard_label);
29950 auto args_seq = get_insns ();
29951 end_sequence ();
29952 emit_insn_before (args_seq, call);
29954 if (find_reg_note (call, REG_NORETURN, NULL_RTX))
29955 return true;
29957 /* Switch mode after the call, preserving any return registers across
29958 the switch. */
29959 start_sequence ();
29960 rtx_insn *return_guard_label = nullptr;
29961 if (TARGET_STREAMING_COMPATIBLE)
29962 return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29963 callee_isa_mode);
29964 aarch64_sme_mode_switch_regs return_switch;
29965 return_switch.add_call_result (call);
29966 return_switch.emit_prologue ();
29967 aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
29968 return_switch.emit_epilogue ();
29969 if (return_guard_label)
29970 emit_label (return_guard_label);
29971 auto result_seq = get_insns ();
29972 end_sequence ();
29973 emit_insn_after (result_seq, call);
29974 return true;
29977 namespace {
29979 const pass_data pass_data_switch_pstate_sm =
29981 RTL_PASS, // type
29982 "smstarts", // name
29983 OPTGROUP_NONE, // optinfo_flags
29984 TV_NONE, // tv_id
29985 0, // properties_required
29986 0, // properties_provided
29987 0, // properties_destroyed
29988 0, // todo_flags_start
29989 TODO_df_finish, // todo_flags_finish
29992 class pass_switch_pstate_sm : public rtl_opt_pass
29994 public:
29995 pass_switch_pstate_sm (gcc::context *ctxt)
29996 : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
29999 // opt_pass methods:
30000 bool gate (function *) override final;
30001 unsigned int execute (function *) override final;
30004 bool
30005 pass_switch_pstate_sm::gate (function *fn)
30007 return (aarch64_fndecl_pstate_sm (fn->decl) != AARCH64_FL_SM_OFF
30008 || cfun->machine->call_switches_pstate_sm);
30011 /* Emit any instructions needed to switch PSTATE.SM. */
30012 unsigned int
30013 pass_switch_pstate_sm::execute (function *fn)
30015 basic_block bb;
30017 auto_sbitmap blocks (last_basic_block_for_fn (cfun));
30018 bitmap_clear (blocks);
30019 FOR_EACH_BB_FN (bb, fn)
30021 if (has_abnormal_call_or_eh_pred_edge_p (bb)
30022 && aarch64_switch_pstate_sm_for_landing_pad (bb))
30023 bitmap_set_bit (blocks, bb->index);
30025 if (cfun->machine->call_switches_pstate_sm)
30027 rtx_insn *insn;
30028 FOR_BB_INSNS (bb, insn)
30029 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
30030 if (aarch64_switch_pstate_sm_for_call (call))
30031 bitmap_set_bit (blocks, bb->index);
30034 auto end = BB_END (bb);
30035 if (JUMP_P (end)
30036 && find_reg_note (end, REG_NON_LOCAL_GOTO, NULL_RTX)
30037 && aarch64_switch_pstate_sm_for_jump (end))
30038 bitmap_set_bit (blocks, bb->index);
30040 find_many_sub_basic_blocks (blocks);
30041 clear_aux_for_blocks ();
30042 return 0;
30047 rtl_opt_pass *
30048 make_pass_switch_pstate_sm (gcc::context *ctxt)
30050 return new pass_switch_pstate_sm (ctxt);
30053 /* Parse an implementation-defined system register name of
30054 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30055 Return true if the name matches the above pattern, false
30056 otherwise. */
30057 bool
30058 aarch64_is_implem_def_reg (const char *regname)
30060 unsigned pos = 0;
30061 unsigned name_len = strlen (regname);
30062 if (name_len < 12 || name_len > 14)
30063 return false;
30065 auto cterm_valid_p = [&]()
30067 bool leading_zero_p = false;
30068 unsigned i = 0;
30069 char n[3] = {0};
30071 if (regname[pos] != 'c')
30072 return false;
30073 pos++;
30074 while (regname[pos] != '_')
30076 if (leading_zero_p)
30077 return false;
30078 if (i == 0 && regname[pos] == '0')
30079 leading_zero_p = true;
30080 if (i > 2)
30081 return false;
30082 if (!ISDIGIT (regname[pos]))
30083 return false;
30084 n[i++] = regname[pos++];
30086 if (atoi (n) > 15)
30087 return false;
30088 return true;
30091 if (regname[pos] != 's')
30092 return false;
30093 pos++;
30094 if (regname[pos] < '0' || regname[pos] > '3')
30095 return false;
30096 pos++;
30097 if (regname[pos++] != '_')
30098 return false;
30099 if (regname[pos] < '0' || regname[pos] > '7')
30100 return false;
30101 pos++;
30102 if (regname[pos++] != '_')
30103 return false;
30104 if (!cterm_valid_p ())
30105 return false;
30106 if (regname[pos++] != '_')
30107 return false;
30108 if (!cterm_valid_p ())
30109 return false;
30110 if (regname[pos++] != '_')
30111 return false;
30112 if (regname[pos] < '0' || regname[pos] > '7')
30113 return false;
30114 return true;
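/* For example, "s3_0_c15_c2_0" is accepted as an implementation-defined
   register name, whereas "s4_0_c15_c2_0" (first field out of range),
   "s3_0_c16_c2_0" (CRn value greater than 15) and "s3_0_c01_c2_0" (leading
   zero) are all rejected by the checks above.  */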
30117 /* Return true if REGNAME matches either a known permitted system
30118 register name, or a generic sysreg specification. For use in
30119 back-end predicate `aarch64_sysreg_string'. */
30120 bool
30121 aarch64_valid_sysreg_name_p (const char *regname)
30123 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30124 if (sysreg == NULL)
30125 return aarch64_is_implem_def_reg (regname);
30126 if (sysreg->arch_reqs)
30127 return (aarch64_isa_flags & sysreg->arch_reqs);
30128 return true;
30131 /* Return the generic sysreg specification for a valid system register
30132 name, otherwise NULL. WRITE_P is true iff the register is being
30133 written to. IS128OP indicates the requested system register should
30134 be checked for a 128-bit implementation. */
30135 const char *
30136 aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
30138 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30139 if (sysreg == NULL)
30141 if (aarch64_is_implem_def_reg (regname))
30142 return regname;
30143 else
30144 return NULL;
30146 if (is128op && !(sysreg->properties & F_REG_128))
30147 return NULL;
30148 if ((write_p && (sysreg->properties & F_REG_READ))
30149 || (!write_p && (sysreg->properties & F_REG_WRITE)))
30150 return NULL;
30151 if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
30152 return NULL;
30153 return sysreg->encoding;
30156 /* Target-specific selftests. */
30158 #if CHECKING_P
30160 namespace selftest {
30162 /* Selftest for the RTL loader.
30163 Verify that the RTL loader copes with a dump from
30164 print_rtx_function. This is essentially just a test that class
30165 function_reader can handle a real dump, but it also verifies
30166 that lookup_reg_by_dump_name correctly handles hard regs.
30167 The presence of hard reg names in the dump means that the test is
30168 target-specific, hence it is in this file. */
30170 static void
30171 aarch64_test_loading_full_dump ()
30173 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
30175 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
30177 rtx_insn *insn_1 = get_insn_by_uid (1);
30178 ASSERT_EQ (NOTE, GET_CODE (insn_1));
30180 rtx_insn *insn_15 = get_insn_by_uid (15);
30181 ASSERT_EQ (INSN, GET_CODE (insn_15));
30182 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
30184 /* Verify crtl->return_rtx. */
30185 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
30186 ASSERT_EQ (0, REGNO (crtl->return_rtx));
30187 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
30190 /* Test the fractional_cost class. */
30192 static void
30193 aarch64_test_fractional_cost ()
30195 using cf = fractional_cost;
30197 ASSERT_EQ (cf (0, 20), 0);
30199 ASSERT_EQ (cf (4, 2), 2);
30200 ASSERT_EQ (3, cf (9, 3));
30202 ASSERT_NE (cf (5, 2), 2);
30203 ASSERT_NE (3, cf (8, 3));
30205 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30206 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30207 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30209 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30210 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30211 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30212 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30213 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30214 ASSERT_EQ (3 - cf (10, 3), 0);
30216 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30217 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30219 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30220 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30221 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30222 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30223 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30224 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30225 ASSERT_TRUE (cf (239, 240) <= 1);
30226 ASSERT_TRUE (cf (240, 240) <= 1);
30227 ASSERT_FALSE (cf (241, 240) <= 1);
30228 ASSERT_FALSE (2 <= cf (207, 104));
30229 ASSERT_TRUE (2 <= cf (208, 104));
30230 ASSERT_TRUE (2 <= cf (209, 104));
30232 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30233 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30234 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30235 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30236 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30237 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30238 ASSERT_TRUE (cf (239, 240) < 1);
30239 ASSERT_FALSE (cf (240, 240) < 1);
30240 ASSERT_FALSE (cf (241, 240) < 1);
30241 ASSERT_FALSE (2 < cf (207, 104));
30242 ASSERT_FALSE (2 < cf (208, 104));
30243 ASSERT_TRUE (2 < cf (209, 104));
30245 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30246 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30247 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30248 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30249 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30250 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30251 ASSERT_FALSE (cf (239, 240) >= 1);
30252 ASSERT_TRUE (cf (240, 240) >= 1);
30253 ASSERT_TRUE (cf (241, 240) >= 1);
30254 ASSERT_TRUE (2 >= cf (207, 104));
30255 ASSERT_TRUE (2 >= cf (208, 104));
30256 ASSERT_FALSE (2 >= cf (209, 104));
30258 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30259 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30260 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30261 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30262 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30263 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30264 ASSERT_FALSE (cf (239, 240) > 1);
30265 ASSERT_FALSE (cf (240, 240) > 1);
30266 ASSERT_TRUE (cf (241, 240) > 1);
30267 ASSERT_TRUE (2 > cf (207, 104));
30268 ASSERT_FALSE (2 > cf (208, 104));
30269 ASSERT_FALSE (2 > cf (209, 104));
30271 ASSERT_EQ (cf (1, 2).ceil (), 1);
30272 ASSERT_EQ (cf (11, 7).ceil (), 2);
30273 ASSERT_EQ (cf (20, 1).ceil (), 20);
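/* The remaining ceil () checks use values close to UINT_MAX; the expected
   results indicate that fractional_cost saturates at 0xffffffff rather
   than wrapping around. */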
30274 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30275 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30276 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30277 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30278 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30280 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
30281 }
30283 /* Check whether our system register data, as imported from
30284 `aarch64-sys-regs.def', has any duplicate entries. */
30285 static void
30286 aarch64_test_sysreg_encoding_clashes (void)
30287 {
30288 using dup_instances_t = hash_map<nofree_string_hash,
30289 std::vector<const sysreg_t*>>;
30291 dup_instances_t duplicate_instances;
30293 /* Every encoding that turns out to occur more than once is added to a
30294 "clash-analysis queue", which is then used to extract the necessary
30295 information from our hash map when establishing whether the repeated
30296 encodings are valid. */
30298 /* 1) Collect recurrence information. */
30299 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
30300 {
30301 const sysreg_t *reg = aarch64_sysregs + i;
30303 std::vector<const sysreg_t*> *tmp
30304 = &duplicate_instances.get_or_insert (reg->encoding);
30306 tmp->push_back (reg);
30307 }
30309 /* 2) Carry out analysis on collected data. */
30310 for (auto instance : duplicate_instances)
30311 {
30312 unsigned nrep = instance.second.size ();
30313 if (nrep > 1)
30314 for (unsigned i = 0; i < nrep; i++)
30315 for (unsigned j = i + 1; j < nrep; j++)
30316 {
30317 const sysreg_t *a = instance.second[i];
30318 const sysreg_t *b = instance.second[j];
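/* Registers may share an encoding only if they remain distinguishable,
   i.e. they differ in their properties or in their architecture
   requirements. */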
30319 ASSERT_TRUE ((a->properties != b->properties)
30320 || (a->arch_reqs != b->arch_reqs));
30321 }
30322 }
30323 }
30325 /* Run all target-specific selftests. */
30327 static void
30328 aarch64_run_selftests (void)
30329 {
30330 aarch64_test_loading_full_dump ();
30331 aarch64_test_fractional_cost ();
30332 aarch64_test_sysreg_encoding_clashes ();
30333 }
30335 } // namespace selftest
30337 #endif /* #if CHECKING_P */
30339 #undef TARGET_STACK_PROTECT_GUARD
30340 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30342 #undef TARGET_ADDRESS_COST
30343 #define TARGET_ADDRESS_COST aarch64_address_cost
30345 /* This hook determines whether unnamed bitfields affect the alignment
30346 of the containing structure. The hook returns true if the structure
30347 should inherit the alignment requirements of an unnamed bitfield's
30348 type. */
30349 #undef TARGET_ALIGN_ANON_BITFIELD
30350 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
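/* For illustration (assuming the usual AAPCS64 layout rules): given

     struct s { char c; int : 0; char d; };

   returning true here means the unnamed 'int' bit-field makes 'struct s'
   inherit int's alignment requirement even though no named member
   needs it. */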
30352 #undef TARGET_ASM_ALIGNED_DI_OP
30353 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30355 #undef TARGET_ASM_ALIGNED_HI_OP
30356 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30358 #undef TARGET_ASM_ALIGNED_SI_OP
30359 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30361 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30362 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30363 hook_bool_const_tree_hwi_hwi_const_tree_true
30365 #undef TARGET_ASM_FILE_START
30366 #define TARGET_ASM_FILE_START aarch64_start_file
30368 #undef TARGET_ASM_OUTPUT_MI_THUNK
30369 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30371 #undef TARGET_ASM_SELECT_RTX_SECTION
30372 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30374 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30375 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30377 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30378 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30380 #undef TARGET_BUILD_BUILTIN_VA_LIST
30381 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30383 #undef TARGET_CALLEE_COPIES
30384 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30386 #undef TARGET_FRAME_POINTER_REQUIRED
30387 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30389 #undef TARGET_CAN_ELIMINATE
30390 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30392 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30393 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30394 aarch64_function_attribute_inlinable_p
30396 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30397 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30399 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30400 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30402 #undef TARGET_CAN_INLINE_P
30403 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30405 #undef TARGET_CANNOT_FORCE_CONST_MEM
30406 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30408 #undef TARGET_CASE_VALUES_THRESHOLD
30409 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30411 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30412 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30414 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30415 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30417 /* Only the least significant bit is used for initialization guard
30418 variables. */
30419 #undef TARGET_CXX_GUARD_MASK_BIT
30420 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
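/* Roughly (a sketch, not the exact code GCC emits), the inline fast path
   for a guarded static initialization then only needs to test bit 0:

     if (!(guard & 1))
       slow_path ();    // __cxa_guard_acquire + run the initializer

   rather than comparing the whole guard variable. */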
30422 #undef TARGET_C_MODE_FOR_SUFFIX
30423 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30425 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30426 #undef TARGET_DEFAULT_TARGET_FLAGS
30427 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30428 #endif
30430 #undef TARGET_CLASS_MAX_NREGS
30431 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30433 #undef TARGET_BUILTIN_DECL
30434 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30436 #undef TARGET_BUILTIN_RECIPROCAL
30437 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30439 #undef TARGET_C_EXCESS_PRECISION
30440 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30442 #undef TARGET_EXPAND_BUILTIN
30443 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30445 #undef TARGET_EXPAND_BUILTIN_VA_START
30446 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30448 #undef TARGET_FOLD_BUILTIN
30449 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30451 #undef TARGET_FUNCTION_ARG
30452 #define TARGET_FUNCTION_ARG aarch64_function_arg
30454 #undef TARGET_FUNCTION_ARG_ADVANCE
30455 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30457 #undef TARGET_FUNCTION_ARG_BOUNDARY
30458 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30460 #undef TARGET_FUNCTION_ARG_PADDING
30461 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30463 #undef TARGET_GET_RAW_RESULT_MODE
30464 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30465 #undef TARGET_GET_RAW_ARG_MODE
30466 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30468 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30469 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30471 #undef TARGET_FUNCTION_VALUE
30472 #define TARGET_FUNCTION_VALUE aarch64_function_value
30474 #undef TARGET_FUNCTION_VALUE_REGNO_P
30475 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30477 #undef TARGET_START_CALL_ARGS
30478 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30480 #undef TARGET_END_CALL_ARGS
30481 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30483 #undef TARGET_GIMPLE_FOLD_BUILTIN
30484 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30486 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30487 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30489 #undef TARGET_INIT_BUILTINS
30490 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30492 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30493 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30494 aarch64_ira_change_pseudo_allocno_class
30496 #undef TARGET_LEGITIMATE_ADDRESS_P
30497 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30499 #undef TARGET_LEGITIMATE_CONSTANT_P
30500 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30502 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30503 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30504 aarch64_legitimize_address_displacement
30506 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30507 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30509 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30510 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30511 aarch64_libgcc_floating_mode_supported_p
30513 #undef TARGET_MANGLE_TYPE
30514 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30516 #undef TARGET_INVALID_BINARY_OP
30517 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30519 #undef TARGET_VERIFY_TYPE_CONTEXT
30520 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30522 #undef TARGET_MEMORY_MOVE_COST
30523 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30525 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30526 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30528 #undef TARGET_MUST_PASS_IN_STACK
30529 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30531 /* This target hook should return true if accesses to volatile bitfields
30532 should use the narrowest mode possible. It should return false if these
30533 accesses should use the bitfield container type. */
30534 #undef TARGET_NARROW_VOLATILE_BITFIELD
30535 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
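/* For illustration: with this hook returning false, a volatile bit-field
   such as

     struct s { volatile unsigned int f : 8; };

   is accessed through its 32-bit container type rather than through the
   narrowest mode (a single byte) that would cover the field. */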
30537 #undef TARGET_OPTION_OVERRIDE
30538 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30540 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30541 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30542 aarch64_override_options_after_change
30544 #undef TARGET_OFFLOAD_OPTIONS
30545 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30547 #undef TARGET_OPTION_RESTORE
30548 #define TARGET_OPTION_RESTORE aarch64_option_restore
30550 #undef TARGET_OPTION_PRINT
30551 #define TARGET_OPTION_PRINT aarch64_option_print
30553 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30554 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30556 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30557 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30558 aarch64_option_valid_version_attribute_p
30560 #undef TARGET_SET_CURRENT_FUNCTION
30561 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30563 #undef TARGET_PASS_BY_REFERENCE
30564 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30566 #undef TARGET_PREFERRED_RELOAD_CLASS
30567 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30569 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30570 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30572 #undef TARGET_DWARF_FRAME_REG_MODE
30573 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30575 #undef TARGET_PROMOTED_TYPE
30576 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30578 #undef TARGET_SECONDARY_RELOAD
30579 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30581 #undef TARGET_SECONDARY_MEMORY_NEEDED
30582 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30584 #undef TARGET_SHIFT_TRUNCATION_MASK
30585 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30587 #undef TARGET_SETUP_INCOMING_VARARGS
30588 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30590 #undef TARGET_STRUCT_VALUE_RTX
30591 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30593 #undef TARGET_REGISTER_MOVE_COST
30594 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30596 #undef TARGET_RETURN_IN_MEMORY
30597 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30599 #undef TARGET_RETURN_IN_MSB
30600 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30602 #undef TARGET_RTX_COSTS
30603 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30605 #undef TARGET_INSN_COST
30606 #define TARGET_INSN_COST aarch64_insn_cost
30608 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30609 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30611 #undef TARGET_SCHED_ISSUE_RATE
30612 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
30614 #undef TARGET_SCHED_VARIABLE_ISSUE
30615 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
30617 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
30618 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
30619 aarch64_sched_first_cycle_multipass_dfa_lookahead
30621 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
30622 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
30623 aarch64_first_cycle_multipass_dfa_lookahead_guard
30625 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
30626 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
30627 aarch64_get_separate_components
30629 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
30630 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
30631 aarch64_components_for_bb
30633 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
30634 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
30635 aarch64_disqualify_components
30637 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
30638 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
30639 aarch64_emit_prologue_components
30641 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
30642 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
30643 aarch64_emit_epilogue_components
30645 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
30646 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
30647 aarch64_set_handled_components
30649 #undef TARGET_TRAMPOLINE_INIT
30650 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
30652 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
30653 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
30655 #undef TARGET_VECTOR_MODE_SUPPORTED_P
30656 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
30658 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
30659 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
30661 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
30662 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
30664 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
30665 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
30666 aarch64_builtin_support_vector_misalignment
30668 #undef TARGET_ARRAY_MODE
30669 #define TARGET_ARRAY_MODE aarch64_array_mode
30671 #undef TARGET_ARRAY_MODE_SUPPORTED_P
30672 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
30674 #undef TARGET_VECTORIZE_CREATE_COSTS
30675 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
30677 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
30678 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
30679 aarch64_builtin_vectorization_cost
30681 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
30682 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
30684 #undef TARGET_VECTORIZE_BUILTINS
30685 #define TARGET_VECTORIZE_BUILTINS
30687 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
30688 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
30689 aarch64_autovectorize_vector_modes
30691 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
30692 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
30693 aarch64_atomic_assign_expand_fenv
30695 /* Section anchor support. */
30697 #undef TARGET_MIN_ANCHOR_OFFSET
30698 #define TARGET_MIN_ANCHOR_OFFSET -256
30700 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
30701 byte offset; we can do much more for larger data types, but have no way
30702 to determine the size of the access. We assume accesses are aligned. */
30703 #undef TARGET_MAX_ANCHOR_OFFSET
30704 #define TARGET_MAX_ANCHOR_OFFSET 4095
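/* The [-256, 4095] anchor window matches what a single load/store can
   address relative to an anchor: 4095 is the largest unsigned immediate
   offset for a byte access (e.g. "ldrb w0, [x1, #4095]"), and -256 is
   the most negative offset accepted by the unscaled LDUR/STUR forms. */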
30706 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
30707 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
30708 aarch64_vectorize_preferred_div_as_shifts_over_mult
30710 #undef TARGET_VECTOR_ALIGNMENT
30711 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
30713 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
30714 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
30715 aarch64_vectorize_preferred_vector_alignment
30716 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
30717 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
30718 aarch64_simd_vector_alignment_reachable
30720 /* vec_perm support. */
30722 #undef TARGET_VECTORIZE_VEC_PERM_CONST
30723 #define TARGET_VECTORIZE_VEC_PERM_CONST \
30724 aarch64_vectorize_vec_perm_const
30726 #undef TARGET_VECTORIZE_RELATED_MODE
30727 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
30728 #undef TARGET_VECTORIZE_GET_MASK_MODE
30729 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
30730 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
30731 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
30732 aarch64_empty_mask_is_expensive
30733 #undef TARGET_PREFERRED_ELSE_VALUE
30734 #define TARGET_PREFERRED_ELSE_VALUE \
30735 aarch64_preferred_else_value
30737 #undef TARGET_INIT_LIBFUNCS
30738 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
30740 #undef TARGET_FIXED_CONDITION_CODE_REGS
30741 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
30743 #undef TARGET_FLAGS_REGNUM
30744 #define TARGET_FLAGS_REGNUM CC_REGNUM
30746 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
30747 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
30749 #undef TARGET_ASAN_SHADOW_OFFSET
30750 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
30752 #undef TARGET_LEGITIMIZE_ADDRESS
30753 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
30755 #undef TARGET_SCHED_CAN_SPECULATE_INSN
30756 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
30758 #undef TARGET_CAN_USE_DOLOOP_P
30759 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
30761 #undef TARGET_SCHED_ADJUST_PRIORITY
30762 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
30764 #undef TARGET_SCHED_MACRO_FUSION_P
30765 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
30767 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
30768 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
30770 #undef TARGET_SCHED_FUSION_PRIORITY
30771 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
30773 #undef TARGET_UNSPEC_MAY_TRAP_P
30774 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
30776 #undef TARGET_USE_PSEUDO_PIC_REG
30777 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
30779 #undef TARGET_PRINT_OPERAND
30780 #define TARGET_PRINT_OPERAND aarch64_print_operand
30782 #undef TARGET_PRINT_OPERAND_ADDRESS
30783 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
30785 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
30786 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
30788 #undef TARGET_OPTAB_SUPPORTED_P
30789 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
30791 #undef TARGET_OMIT_STRUCT_RETURN_REG
30792 #define TARGET_OMIT_STRUCT_RETURN_REG true
30794 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
30795 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
30796 aarch64_dwarf_poly_indeterminate_value
30798 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
30799 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
30800 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
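/* That is, 4 == (1 << 2): a function pointer with bit 2 set is taken to
   point at a descriptor rather than directly at code. */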
30802 #undef TARGET_HARD_REGNO_NREGS
30803 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
30804 #undef TARGET_HARD_REGNO_MODE_OK
30805 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
30807 #undef TARGET_MODES_TIEABLE_P
30808 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
30810 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
30811 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
30812 aarch64_hard_regno_call_part_clobbered
30814 #undef TARGET_INSN_CALLEE_ABI
30815 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
30817 #undef TARGET_CONSTANT_ALIGNMENT
30818 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
30820 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
30821 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
30822 aarch64_stack_clash_protection_alloca_probe_range
30824 #undef TARGET_COMPUTE_PRESSURE_CLASSES
30825 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
30827 #undef TARGET_CAN_CHANGE_MODE_CLASS
30828 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
30830 #undef TARGET_SELECT_EARLY_REMAT_MODES
30831 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
30833 #undef TARGET_SPECULATION_SAFE_VALUE
30834 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
30836 #undef TARGET_ESTIMATED_POLY_VALUE
30837 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
30839 #undef TARGET_ATTRIBUTE_TABLE
30840 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
30842 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
30843 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
30844 aarch64_simd_clone_compute_vecsize_and_simdlen
30846 #undef TARGET_SIMD_CLONE_ADJUST
30847 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
30849 #undef TARGET_SIMD_CLONE_USABLE
30850 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
30852 #undef TARGET_COMP_TYPE_ATTRIBUTES
30853 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
30855 #undef TARGET_MERGE_DECL_ATTRIBUTES
30856 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
30858 #undef TARGET_GET_MULTILIB_ABI_NAME
30859 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
30861 #undef TARGET_FNTYPE_ABI
30862 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
30864 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
30865 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
30867 #if CHECKING_P
30868 #undef TARGET_RUN_TARGET_SELFTESTS
30869 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
30870 #endif /* #if CHECKING_P */
30872 #undef TARGET_ASM_POST_CFI_STARTPROC
30873 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
30875 #undef TARGET_STRICT_ARGUMENT_NAMING
30876 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
30878 #undef TARGET_MODE_EMIT
30879 #define TARGET_MODE_EMIT aarch64_mode_emit
30881 #undef TARGET_MODE_NEEDED
30882 #define TARGET_MODE_NEEDED aarch64_mode_needed
30884 #undef TARGET_MODE_AFTER
30885 #define TARGET_MODE_AFTER aarch64_mode_after
30887 #undef TARGET_MODE_CONFLUENCE
30888 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
30890 #undef TARGET_MODE_BACKPROP
30891 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
30893 #undef TARGET_MODE_ENTRY
30894 #define TARGET_MODE_ENTRY aarch64_mode_entry
30896 #undef TARGET_MODE_EXIT
30897 #define TARGET_MODE_EXIT aarch64_mode_exit
30899 #undef TARGET_MODE_EH_HANDLER
30900 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
30902 #undef TARGET_MODE_PRIORITY
30903 #define TARGET_MODE_PRIORITY aarch64_mode_priority
30905 #undef TARGET_MD_ASM_ADJUST
30906 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
30908 #undef TARGET_ASM_FILE_END
30909 #define TARGET_ASM_FILE_END aarch64_asm_file_end
30911 #undef TARGET_ASM_FUNCTION_EPILOGUE
30912 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
30914 #undef TARGET_HAVE_SHADOW_CALL_STACK
30915 #define TARGET_HAVE_SHADOW_CALL_STACK true
30917 #undef TARGET_CONST_ANCHOR
30918 #define TARGET_CONST_ANCHOR 0x1000000
30920 #undef TARGET_EXTRA_LIVE_ON_ENTRY
30921 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
30923 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
30924 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
30926 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
30927 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
30929 #undef TARGET_OPTION_FUNCTION_VERSIONS
30930 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
30932 #undef TARGET_COMPARE_VERSION_PRIORITY
30933 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
30935 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
30936 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
30937 aarch64_generate_version_dispatcher_body
30939 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
30940 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
30941 aarch64_get_function_versions_dispatcher
30943 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
30944 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
30946 struct gcc_target targetm = TARGET_INITIALIZER;
30948 #include "gt-aarch64.h"