gcc/config/aarch64/aarch64.cc
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2024 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #define INCLUDE_VECTOR
26 #include "config.h"
27 #include "system.h"
28 #include "coretypes.h"
29 #include "backend.h"
30 #include "target.h"
31 #include "rtl.h"
32 #include "tree.h"
33 #include "memmodel.h"
34 #include "gimple.h"
35 #include "cfghooks.h"
36 #include "cfgloop.h"
37 #include "df.h"
38 #include "tm_p.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "optabs.h"
42 #include "regs.h"
43 #include "emit-rtl.h"
44 #include "recog.h"
45 #include "cgraph.h"
46 #include "diagnostic.h"
47 #include "insn-attr.h"
48 #include "alias.h"
49 #include "fold-const.h"
50 #include "stor-layout.h"
51 #include "calls.h"
52 #include "varasm.h"
53 #include "output.h"
54 #include "flags.h"
55 #include "explow.h"
56 #include "expr.h"
57 #include "reload.h"
58 #include "langhooks.h"
59 #include "opts.h"
60 #include "gimplify.h"
61 #include "dwarf2.h"
62 #include "gimple-iterator.h"
63 #include "tree-vectorizer.h"
64 #include "aarch64-cost-tables.h"
65 #include "dumpfile.h"
66 #include "builtins.h"
67 #include "rtl-iter.h"
68 #include "tm-constrs.h"
69 #include "sched-int.h"
70 #include "target-globals.h"
71 #include "common/common-target.h"
72 #include "cfgrtl.h"
73 #include "selftest.h"
74 #include "selftest-rtl.h"
75 #include "rtx-vector-builder.h"
76 #include "intl.h"
77 #include "expmed.h"
78 #include "function-abi.h"
79 #include "gimple-pretty-print.h"
80 #include "tree-ssa-loop-niter.h"
81 #include "fractional-cost.h"
82 #include "rtlanal.h"
83 #include "tree-dfa.h"
84 #include "asan.h"
85 #include "aarch64-feature-deps.h"
86 #include "config/arm/aarch-common.h"
87 #include "config/arm/aarch-common-protos.h"
88 #include "common/config/aarch64/cpuinfo.h"
89 #include "ssa.h"
90 #include "except.h"
91 #include "tree-pass.h"
92 #include "cfgbuild.h"
93 #include "symbol-summary.h"
94 #include "sreal.h"
95 #include "ipa-cp.h"
96 #include "ipa-prop.h"
97 #include "ipa-fnsummary.h"
98 #include "hash-map.h"
100 /* This file should be included last. */
101 #include "target-def.h"
103 /* Defined for convenience. */
104 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
106 /* Maximum bytes set for an inline memset expansion. With -Os use 3 STP
107 and 1 MOVI/DUP (same size as a call). */
108 #define MAX_SET_SIZE(speed) (speed ? 256 : 96)
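/* Worked example (illustrative only): MAX_SET_SIZE (true) evaluates to 256,
   so a speed-optimised memset of up to 256 bytes may be expanded inline,
   while MAX_SET_SIZE (false) evaluates to 96 for size-optimised (-Os) code.  */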
110 /* Flags that describe how a function shares certain architectural state
111 with its callers.
113 - AARCH64_STATE_SHARED indicates that the function does share the state
114 with callers.
116 - AARCH64_STATE_IN indicates that the function reads (or might read) the
117 incoming state. The converse is that the function ignores the incoming
118 state.
120 - AARCH64_STATE_OUT indicates that the function returns new state.
121 The converse is that the state on return is the same as it was on entry.
123 A function that partially modifies the state treats it as both IN
124 and OUT (because the value on return depends to some extent on the
125 value on input). */
126 constexpr auto AARCH64_STATE_SHARED = 1U << 0;
127 constexpr auto AARCH64_STATE_IN = 1U << 1;
128 constexpr auto AARCH64_STATE_OUT = 1U << 2;
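/* Illustrative mapping (see aarch64_attribute_shared_state_flags below):
     arm::in        -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
     arm::out       -> AARCH64_STATE_SHARED | AARCH64_STATE_OUT
     arm::inout     -> AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT
     arm::preserves -> AARCH64_STATE_SHARED  */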
130 /* Information about a legitimate vector immediate operand. */
131 struct simd_immediate_info
133 enum insn_type { MOV, MVN, INDEX, PTRUE };
134 enum modifier_type { LSL, MSL };
136 simd_immediate_info () {}
137 simd_immediate_info (scalar_float_mode, rtx);
138 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
139 insn_type = MOV, modifier_type = LSL,
140 unsigned int = 0);
141 simd_immediate_info (scalar_mode, rtx, rtx);
142 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
144 /* The mode of the elements. */
145 scalar_mode elt_mode;
147 /* The instruction to use to move the immediate into a vector. */
148 insn_type insn;
150 union
152 /* For MOV and MVN. */
153 struct
155 /* The value of each element. */
156 rtx value;
158 /* The kind of shift modifier to use, and the number of bits to shift.
159 This is (LSL, 0) if no shift is needed. */
160 modifier_type modifier;
161 unsigned int shift;
162 } mov;
164 /* For INDEX. */
165 struct
167 /* The value of the first element and the step to be added for each
168 subsequent element. */
169 rtx base, step;
170 } index;
172 /* For PTRUE. */
173 aarch64_svpattern pattern;
174 } u;
177 /* Construct a floating-point immediate in which each element has mode
178 ELT_MODE_IN and value VALUE_IN. */
179 inline simd_immediate_info
180 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
181 : elt_mode (elt_mode_in), insn (MOV)
183 u.mov.value = value_in;
184 u.mov.modifier = LSL;
185 u.mov.shift = 0;
188 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
189 and value VALUE_IN. The other parameters are as for the structure
190 fields. */
191 inline simd_immediate_info
192 ::simd_immediate_info (scalar_int_mode elt_mode_in,
193 unsigned HOST_WIDE_INT value_in,
194 insn_type insn_in, modifier_type modifier_in,
195 unsigned int shift_in)
196 : elt_mode (elt_mode_in), insn (insn_in)
198 u.mov.value = gen_int_mode (value_in, elt_mode_in);
199 u.mov.modifier = modifier_in;
200 u.mov.shift = shift_in;
203 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
204 and where element I is equal to BASE_IN + I * STEP_IN. */
205 inline simd_immediate_info
206 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
207 : elt_mode (elt_mode_in), insn (INDEX)
209 u.index.base = base_in;
210 u.index.step = step_in;
213 /* Construct a predicate that controls elements of mode ELT_MODE_IN
214 and has PTRUE pattern PATTERN_IN. */
215 inline simd_immediate_info
216 ::simd_immediate_info (scalar_int_mode elt_mode_in,
217 aarch64_svpattern pattern_in)
218 : elt_mode (elt_mode_in), insn (PTRUE)
220 u.pattern = pattern_in;
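/* Example uses of the constructors above (illustrative, not taken from the
   callers in this file): simd_immediate_info (SImode, 0x55, MOV, LSL, 8)
   records a MOV of value 0x55 with an LSL-by-8 modifier for 32-bit elements,
   and simd_immediate_info (QImode, AARCH64_SV_ALL) records a PTRUE over byte
   elements, assuming AARCH64_SV_ALL is the svpattern enumerator for the
   "all elements" pattern.  */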
223 namespace {
225 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
226 class pure_scalable_type_info
228 public:
229 /* Represents the result of analyzing a type. All values are nonzero,
230 in the possibly forlorn hope that accidental conversions to bool
231 trigger a warning. */
232 enum analysis_result
234 /* The type does not have an ABI identity; i.e. it doesn't contain
235 at least one object whose type is a Fundamental Data Type. */
236 NO_ABI_IDENTITY = 1,
238 /* The type is definitely a Pure Scalable Type. */
239 IS_PST,
241 /* The type is definitely not a Pure Scalable Type. */
242 ISNT_PST,
244 /* It doesn't matter for PCS purposes whether the type is a Pure
245 Scalable Type or not, since the type will be handled the same
246 way regardless.
248 Specifically, this means that if the type is a Pure Scalable Type,
249 there aren't enough argument registers to hold it, and so it will
250 need to be passed or returned in memory. If the type isn't a
251 Pure Scalable Type, it's too big to be passed or returned in core
252 or SIMD&FP registers, and so again will need to go in memory. */
253 DOESNT_MATTER
256 /* Aggregates of 17 bytes or more are normally passed and returned
257 in memory, so aggregates of that size can safely be analyzed as
258 DOESNT_MATTER. We need to be able to collect enough pieces to
259 represent a PST that is smaller than that. Since predicates are
260 2 bytes in size for -msve-vector-bits=128, that means we need to be
261 able to store at least 8 pieces.
263 We also need to be able to store enough pieces to represent
264 a single vector in each vector argument register and a single
265 predicate in each predicate argument register. This means that
266 we need at least 12 pieces. */
267 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
268 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
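/* With the current PCS register counts this works out as 8 vector argument
   registers (v0-v7) plus 4 predicate argument registers (p0-p3), i.e.
   MAX_PIECES == 12, matching the "at least 12 pieces" requirement above.  */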
270 /* Describes one piece of a PST. Each piece is one of:
272 - a single Scalable Vector Type (SVT)
273 - a single Scalable Predicate Type (SPT)
274 - a PST containing 2, 3 or 4 SVTs, with no padding
276 It either represents a single built-in type or a PST formed from
277 multiple homogeneous built-in types. */
278 struct piece
280 rtx get_rtx (unsigned int, unsigned int) const;
282 /* The number of vector and predicate registers that the piece
283 occupies. One of the two is always zero. */
284 unsigned int num_zr;
285 unsigned int num_pr;
287 /* The mode of the registers described above. */
288 machine_mode mode;
290 /* If this piece is formed from multiple homogeneous built-in types,
291 this is the mode of the built-in types, otherwise it is MODE. */
292 machine_mode orig_mode;
294 /* The offset in bytes of the piece from the start of the type. */
295 poly_uint64 offset;
298 /* Divides types analyzed as IS_PST into individual pieces. The pieces
299 are in memory order. */
300 auto_vec<piece, MAX_PIECES> pieces;
302 unsigned int num_zr () const;
303 unsigned int num_pr () const;
305 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
307 analysis_result analyze (const_tree);
308 bool analyze_registers (const_tree);
310 private:
311 analysis_result analyze_array (const_tree);
312 analysis_result analyze_record (const_tree);
313 void add_piece (const piece &);
317 /* The current code model. */
318 enum aarch64_code_model aarch64_cmodel;
320 enum aarch64_tp_reg aarch64_tpidr_register;
322 /* The number of 64-bit elements in an SVE vector. */
323 poly_uint16 aarch64_sve_vg;
325 #ifdef HAVE_AS_TLS
326 #undef TARGET_HAVE_TLS
327 #define TARGET_HAVE_TLS 1
328 #endif
330 static bool aarch64_composite_type_p (const_tree, machine_mode);
331 static bool aarch64_return_in_memory_1 (const_tree);
332 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
333 const_tree,
334 machine_mode *, int *,
335 bool *, bool);
336 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
337 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
338 static void aarch64_override_options_after_change (void);
339 static bool aarch64_vector_mode_supported_p (machine_mode);
340 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
341 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
342 const_tree type,
343 int misalignment,
344 bool is_packed);
345 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
346 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
347 aarch64_addr_query_type);
349 /* The processor for which instructions should be scheduled. */
350 enum aarch64_processor aarch64_tune = cortexa53;
352 /* Mask to specify which instruction scheduling options should be used. */
353 uint64_t aarch64_tune_flags = 0;
355 /* Global flag for PC relative loads. */
356 bool aarch64_pcrelative_literal_loads;
358 /* Global flag for whether frame pointer is enabled. */
359 bool aarch64_use_frame_pointer;
361 /* Support for command line parsing of boolean flags in the tuning
362 structures. */
363 struct aarch64_flag_desc
365 const char* name;
366 unsigned int flag;
369 #define AARCH64_FUSION_PAIR(name, internal_name) \
370 { name, AARCH64_FUSE_##internal_name },
371 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
373 { "none", AARCH64_FUSE_NOTHING },
374 #include "aarch64-fusion-pairs.def"
375 { "all", AARCH64_FUSE_ALL },
376 { NULL, AARCH64_FUSE_NOTHING }
379 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
380 { name, AARCH64_EXTRA_TUNE_##internal_name },
381 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
383 { "none", AARCH64_EXTRA_TUNE_NONE },
384 #include "aarch64-tuning-flags.def"
385 { "all", AARCH64_EXTRA_TUNE_ALL },
386 { NULL, AARCH64_EXTRA_TUNE_NONE }
389 /* Tuning parameters. */
390 #include "tuning_models/generic.h"
391 #include "tuning_models/generic_armv8_a.h"
392 #include "tuning_models/generic_armv9_a.h"
393 #include "tuning_models/cortexa35.h"
394 #include "tuning_models/cortexa53.h"
395 #include "tuning_models/cortexa57.h"
396 #include "tuning_models/cortexa72.h"
397 #include "tuning_models/cortexa73.h"
398 #include "tuning_models/exynosm1.h"
399 #include "tuning_models/thunderxt88.h"
400 #include "tuning_models/thunderx.h"
401 #include "tuning_models/tsv110.h"
402 #include "tuning_models/xgene1.h"
403 #include "tuning_models/emag.h"
404 #include "tuning_models/qdf24xx.h"
405 #include "tuning_models/saphira.h"
406 #include "tuning_models/thunderx2t99.h"
407 #include "tuning_models/thunderx3t110.h"
408 #include "tuning_models/neoversen1.h"
409 #include "tuning_models/ampere1.h"
410 #include "tuning_models/ampere1a.h"
411 #include "tuning_models/ampere1b.h"
412 #include "tuning_models/neoversev1.h"
413 #include "tuning_models/neoverse512tvb.h"
414 #include "tuning_models/neoversen2.h"
415 #include "tuning_models/neoversev2.h"
416 #include "tuning_models/a64fx.h"
418 /* Support for fine-grained override of the tuning structures. */
419 struct aarch64_tuning_override_function
421 const char* name;
422 void (*parse_override)(const char*, struct tune_params*);
425 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
426 static void aarch64_parse_tune_string (const char*, struct tune_params*);
427 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
429 static const struct aarch64_tuning_override_function
430 aarch64_tuning_override_functions[] =
432 { "fuse", aarch64_parse_fuse_string },
433 { "tune", aarch64_parse_tune_string },
434 { "sve_width", aarch64_parse_sve_width_string },
435 { NULL, NULL }
438 /* A processor implementing AArch64. */
439 struct processor
441 const char *name;
442 aarch64_processor ident;
443 aarch64_processor sched_core;
444 aarch64_arch arch;
445 aarch64_feature_flags flags;
446 const tune_params *tune;
449 /* Architectures implementing AArch64. */
450 static CONSTEXPR const processor all_architectures[] =
452 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
453 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
454 feature_deps::ARCH_IDENT ().enable, NULL},
455 #include "aarch64-arches.def"
456 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
459 /* Processor cores implementing AArch64. */
460 static const struct processor all_cores[] =
462 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
463 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
464 feature_deps::cpu_##IDENT, &COSTS##_tunings},
465 #include "aarch64-cores.def"
466 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
468 /* Internal representation of system registers. */
469 typedef struct {
470 const char *name;
471 /* Stringified sysreg encoding values, represented as
472 s<sn>_<op1>_c<cn>_c<cm>_<op2>. */
473 const char *encoding;
474 /* Flags affecting sysreg usage, such as read/write-only. */
475 unsigned properties;
476 /* Architectural features implied by sysreg. */
477 aarch64_feature_flags arch_reqs;
478 } sysreg_t;
480 /* An aarch64_feature_set initializer for a single feature,
481 AARCH64_FEATURE_<FEAT>. */
482 #define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
484 /* Used by AARCH64_FEATURES. */
485 #define AARCH64_OR_FEATURES_1(X, F1) \
486 AARCH64_FEATURE (F1)
487 #define AARCH64_OR_FEATURES_2(X, F1, F2) \
488 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
489 #define AARCH64_OR_FEATURES_3(X, F1, ...) \
490 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
492 /* An aarch64_feature_set initializer for the N features listed in "...". */
493 #define AARCH64_FEATURES(N, ...) \
494 AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
496 #define AARCH64_NO_FEATURES 0
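/* For example, AARCH64_FEATURES (2, FP, SIMD) expands via
   AARCH64_OR_FEATURES_2 to (AARCH64_FL_FP | AARCH64_FL_SIMD); the feature
   names here are only illustrative placeholders for whatever AARCH64_FL_*
   flags the .def files define.  */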
498 /* Flags associated with the properties of system registers. They mainly
499 serve to mark particular registers as read-only or write-only. */
500 #define F_DEPRECATED (1 << 1)
501 #define F_REG_READ (1 << 2)
502 #define F_REG_WRITE (1 << 3)
503 #define F_ARCHEXT (1 << 4)
504 /* Flag indicating register name is alias for another system register. */
505 #define F_REG_ALIAS (1 << 5)
506 /* Flag indicating registers which may be implemented with 128 bits. */
507 #define F_REG_128 (1 << 6)
509 /* Database of system registers, their encodings and architectural
510 requirements. */
511 const sysreg_t aarch64_sysregs[] =
513 #define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
514 #define SYSREG(NAME, ENC, FLAGS, ARCH) \
515 { NAME, ENC, FLAGS, ARCH },
516 #include "aarch64-sys-regs.def"
517 #undef CPENC
520 #undef AARCH64_NO_FEATURES
522 using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
523 static sysreg_map_t *sysreg_map = nullptr;
525 /* Map system register names to their hardware metadata: encoding,
526 feature flags and architectural feature requirements, all of which
527 are encoded in a sysreg_t struct. */
528 void
529 aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
531 bool dup = sysreg_map->put (name, metadata);
532 gcc_checking_assert (!dup);
535 /* Lazily initialize the hash table used for system register validation,
536 which checks the validity of a supplied register name and returns the
537 register's associated metadata. */
538 static void
539 aarch64_init_sysregs (void)
541 gcc_assert (!sysreg_map);
542 sysreg_map = new sysreg_map_t;
545 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
547 const sysreg_t *reg = aarch64_sysregs + i;
548 aarch64_register_sysreg (reg->name, reg);
552 /* No direct access to the sysreg hash map should be made: doing so
553 risks trying to access an uninitialized hash map, and dereferencing
554 the returned double pointer without due care risks dereferencing a
555 null pointer. */
556 const sysreg_t *
557 aarch64_lookup_sysreg_map (const char *regname)
559 if (!sysreg_map)
560 aarch64_init_sysregs ();
562 const sysreg_t **sysreg_entry = sysreg_map->get (regname);
563 if (sysreg_entry != NULL)
564 return *sysreg_entry;
565 return NULL;
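/* Typical (illustrative) use:
     const sysreg_t *reg = aarch64_lookup_sysreg_map ("tpidr_el0");
   A null result simply means the name is not listed in aarch64-sys-regs.def;
   "tpidr_el0" is assumed here to be one of the registers that .def file
   provides.  */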
568 /* The current tuning set. */
569 struct tune_params aarch64_tune_params = generic_tunings;
571 /* If NAME is the name of an arm:: attribute that describes shared state,
572 return its associated AARCH64_STATE_* flags, otherwise return 0. */
573 static unsigned int
574 aarch64_attribute_shared_state_flags (const char *name)
576 if (strcmp (name, "in") == 0)
577 return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
578 if (strcmp (name, "inout") == 0)
579 return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
580 if (strcmp (name, "out") == 0)
581 return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
582 if (strcmp (name, "preserves") == 0)
583 return AARCH64_STATE_SHARED;
584 return 0;
587 /* See whether attribute list ATTRS has any sharing information
588 for state STATE_NAME. Return the associated state flags if so,
589 otherwise return 0. */
590 static unsigned int
591 aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
593 for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
595 if (!cxx11_attribute_p (attr))
596 continue;
598 auto ns = IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr)));
599 if (strcmp (ns, "arm") != 0)
600 continue;
602 auto attr_name = IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr)));
603 auto flags = aarch64_attribute_shared_state_flags (attr_name);
604 if (!flags)
605 continue;
607 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
609 tree value = TREE_VALUE (arg);
610 if (TREE_CODE (value) == STRING_CST
611 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
612 return flags;
615 return 0;
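/* For instance, given a type carrying [[arm::inout("za")]], looking up "za"
   in its attribute list returns AARCH64_STATE_SHARED | AARCH64_STATE_IN
   | AARCH64_STATE_OUT, while looking up "zt0" returns 0.  */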
618 /* Return true if DECL creates a new scope for state STATE_STRING. */
619 static bool
620 aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
622 if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
623 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
625 tree value = TREE_VALUE (arg);
626 if (TREE_CODE (value) == STRING_CST
627 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
628 return true;
630 return false;
633 /* Return true if attribute argument VALUE is a recognized state string,
634 otherwise report an error. NAME is the name of the attribute to which
635 VALUE is being passed. */
636 static bool
637 aarch64_check_state_string (tree name, tree value)
639 if (TREE_CODE (value) != STRING_CST)
641 error ("the arguments to %qE must be constant strings", name);
642 return false;
645 const char *state_name = TREE_STRING_POINTER (value);
646 if (strcmp (state_name, "za") != 0
647 && strcmp (state_name, "zt0") != 0)
649 error ("unrecognized state string %qs", state_name);
650 return false;
653 return true;
656 /* qsort callback to compare two STRING_CSTs. */
657 static int
658 cmp_string_csts (const void *a, const void *b)
660 return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
661 TREE_STRING_POINTER (*(const_tree const *) b));
664 /* Canonicalize a list of state strings. ARGS contains the arguments to
665 a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
666 of the same type. If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
667 arguments and drop the new attribute. Otherwise, the new attribute must
668 be kept and ARGS must include the information in OLD_ATTR.
670 In both cases, the new arguments must be a sorted list of state strings
671 with duplicates removed.
673 Return true if new attribute should be kept, false if it should be
674 dropped. */
675 static bool
676 aarch64_merge_string_arguments (tree args, tree old_attr,
677 bool can_merge_in_place)
679 /* Get a sorted list of all state strings (including duplicates). */
680 auto add_args = [](vec<tree> &strings, const_tree args)
682 for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
683 if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
684 strings.safe_push (TREE_VALUE (arg));
686 auto_vec<tree, 16> strings;
687 add_args (strings, args);
688 if (old_attr)
689 add_args (strings, TREE_VALUE (old_attr));
690 strings.qsort (cmp_string_csts);
692 /* The list can be empty if there was no previous attribute and if all
693 the new arguments are erroneous. Drop the attribute in that case. */
694 if (strings.is_empty ())
695 return false;
697 /* Destructively modify one of the argument lists, removing duplicates
698 on the fly. */
699 bool use_old_attr = old_attr && can_merge_in_place;
700 tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
701 tree prev = NULL_TREE;
702 for (tree arg : strings)
704 if (prev && simple_cst_equal (arg, prev))
705 continue;
706 prev = arg;
707 if (!*end)
708 *end = tree_cons (NULL_TREE, arg, NULL_TREE);
709 else
710 TREE_VALUE (*end) = arg;
711 end = &TREE_CHAIN (*end);
713 *end = NULL_TREE;
714 return !use_old_attr;
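/* Example: merging a new arm::new("zt0", "za") with an existing
   arm::new("za") yields the sorted, duplicate-free argument list
   ("za", "zt0"); with CAN_MERGE_IN_PLACE the old attribute is updated
   in place and the new one dropped (the function returns false).  */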
717 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
719 static tree
720 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
721 int, bool *no_add_attrs)
723 /* Since we set fn_type_req to true, the caller should have checked
724 this for us. */
725 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
726 switch ((arm_pcs) fntype_abi (*node).id ())
728 case ARM_PCS_AAPCS64:
729 case ARM_PCS_SIMD:
730 return NULL_TREE;
732 case ARM_PCS_SVE:
733 error ("the %qE attribute cannot be applied to an SVE function type",
734 name);
735 *no_add_attrs = true;
736 return NULL_TREE;
738 case ARM_PCS_TLSDESC:
739 case ARM_PCS_UNKNOWN:
740 break;
742 gcc_unreachable ();
745 /* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
746 otherwise report an error. */
747 static bool
748 aarch64_check_arm_new_against_type (tree args, tree decl)
750 tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
751 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
753 tree value = TREE_VALUE (arg);
754 if (TREE_CODE (value) == STRING_CST)
756 const char *state_name = TREE_STRING_POINTER (value);
757 if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
759 error_at (DECL_SOURCE_LOCATION (decl),
760 "cannot create a new %qs scope since %qs is shared"
761 " with callers", state_name, state_name);
762 return false;
766 return true;
769 /* Callback for arm::new attributes. */
770 static tree
771 handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
773 tree decl = *node;
774 if (TREE_CODE (decl) != FUNCTION_DECL)
776 error ("%qE attribute applies only to function definitions", name);
777 *no_add_attrs = true;
778 return NULL_TREE;
780 if (TREE_TYPE (decl) == error_mark_node)
782 *no_add_attrs = true;
783 return NULL_TREE;
786 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
787 aarch64_check_state_string (name, TREE_VALUE (arg));
789 if (!aarch64_check_arm_new_against_type (args, decl))
791 *no_add_attrs = true;
792 return NULL_TREE;
795 /* If there is an old attribute, we should try to update it in-place,
796 so that there is only one (definitive) arm::new attribute on the decl. */
797 tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
798 if (!aarch64_merge_string_arguments (args, old_attr, true))
799 *no_add_attrs = true;
801 return NULL_TREE;
804 /* Callback for arm::{in,out,inout,preserves} attributes. */
805 static tree
806 handle_arm_shared (tree *node, tree name, tree args,
807 int, bool *no_add_attrs)
809 tree type = *node;
810 tree old_attrs = TYPE_ATTRIBUTES (type);
811 auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
812 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
814 tree value = TREE_VALUE (arg);
815 if (aarch64_check_state_string (name, value))
817 const char *state_name = TREE_STRING_POINTER (value);
818 auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
819 state_name);
820 if (old_flags && old_flags != flags)
822 error ("inconsistent attributes for state %qs", state_name);
823 *no_add_attrs = true;
824 return NULL_TREE;
829 /* We can't update an old attribute in-place, since types are shared.
830 Instead make sure that this new attribute contains all the
831 information, so that the old attribute becomes redundant. */
832 tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
833 old_attrs);
834 if (!aarch64_merge_string_arguments (args, old_attr, false))
835 *no_add_attrs = true;
837 return NULL_TREE;
840 /* Mutually-exclusive function type attributes for controlling PSTATE.SM. */
841 static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
843 /* Attribute name exclusion applies to:
844 function, type, variable */
845 { "streaming", false, true, false },
846 { "streaming_compatible", false, true, false },
847 { NULL, false, false, false }
850 /* Table of machine attributes. */
851 static const attribute_spec aarch64_gnu_attributes[] =
853 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
854 affects_type_identity, handler, exclude } */
855 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
856 handle_aarch64_vector_pcs_attribute, NULL },
857 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
858 aarch64_sve::handle_arm_sve_vector_bits_attribute,
859 NULL },
860 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
861 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
862 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
863 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
864 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
865 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
866 #endif
867 #ifdef SUBTARGET_ATTRIBUTE_TABLE
868 SUBTARGET_ATTRIBUTE_TABLE
869 #endif
872 static const scoped_attribute_specs aarch64_gnu_attribute_table =
874 "gnu", { aarch64_gnu_attributes }
877 static const attribute_spec aarch64_arm_attributes[] =
879 { "streaming", 0, 0, false, true, true, true,
880 NULL, attr_streaming_exclusions },
881 { "streaming_compatible", 0, 0, false, true, true, true,
882 NULL, attr_streaming_exclusions },
883 { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
884 { "new", 1, -1, true, false, false, false,
885 handle_arm_new, NULL },
886 { "preserves", 1, -1, false, true, true, true,
887 handle_arm_shared, NULL },
888 { "in", 1, -1, false, true, true, true,
889 handle_arm_shared, NULL },
890 { "out", 1, -1, false, true, true, true,
891 handle_arm_shared, NULL },
892 { "inout", 1, -1, false, true, true, true,
893 handle_arm_shared, NULL }
896 static const scoped_attribute_specs aarch64_arm_attribute_table =
898 "arm", { aarch64_arm_attributes }
901 static const scoped_attribute_specs *const aarch64_attribute_table[] =
903 &aarch64_gnu_attribute_table,
904 &aarch64_arm_attribute_table
907 typedef enum aarch64_cond_code
909 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
910 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
911 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
913 aarch64_cc;
915 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
918 /* The condition codes of the processor, and the inverse function. */
919 static const char * const aarch64_condition_codes[] =
921 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
922 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
925 /* The preferred condition codes for SVE conditions. */
926 static const char *const aarch64_sve_condition_codes[] =
928 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
929 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
932 /* Return the assembly token for svpattern value VALUE. */
934 static const char *
935 svpattern_token (enum aarch64_svpattern pattern)
937 switch (pattern)
939 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
940 AARCH64_FOR_SVPATTERN (CASE)
941 #undef CASE
942 case AARCH64_NUM_SVPATTERNS:
943 break;
945 gcc_unreachable ();
948 /* Return the location of a piece that is known to be passed or returned
949 in registers. FIRST_ZR is the first unused vector argument register
950 and FIRST_PR is the first unused predicate argument register. */
953 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
954 unsigned int first_pr) const
956 gcc_assert (VECTOR_MODE_P (mode)
957 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
958 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
960 if (num_zr > 0 && num_pr == 0)
961 return gen_rtx_REG (mode, first_zr);
963 if (num_zr == 0 && num_pr <= 2)
964 return gen_rtx_REG (mode, first_pr);
966 gcc_unreachable ();
969 /* Return the total number of vector registers required by the PST. */
971 unsigned int
972 pure_scalable_type_info::num_zr () const
974 unsigned int res = 0;
975 for (unsigned int i = 0; i < pieces.length (); ++i)
976 res += pieces[i].num_zr;
977 return res;
980 /* Return the total number of predicate registers required by the PST. */
982 unsigned int
983 pure_scalable_type_info::num_pr () const
985 unsigned int res = 0;
986 for (unsigned int i = 0; i < pieces.length (); ++i)
987 res += pieces[i].num_pr;
988 return res;
991 /* Return the location of a PST that is known to be passed or returned
992 in registers. FIRST_ZR is the first unused vector argument register
993 and FIRST_PR is the first unused predicate argument register. */
996 pure_scalable_type_info::get_rtx (machine_mode mode,
997 unsigned int first_zr,
998 unsigned int first_pr) const
1000 /* Try to return a single REG if possible. This leads to better
1001 code generation; it isn't required for correctness. */
1002 if (mode == pieces[0].mode)
1004 gcc_assert (pieces.length () == 1);
1005 return pieces[0].get_rtx (first_zr, first_pr);
1008 /* Build up a PARALLEL that contains the individual pieces. */
1009 rtvec rtxes = rtvec_alloc (pieces.length ());
1010 for (unsigned int i = 0; i < pieces.length (); ++i)
1012 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1013 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1014 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1015 first_zr += pieces[i].num_zr;
1016 first_pr += pieces[i].num_pr;
1018 return gen_rtx_PARALLEL (mode, rtxes);
1021 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1022 in the AAPCS64. */
1024 pure_scalable_type_info::analysis_result
1025 pure_scalable_type_info::analyze (const_tree type)
1027 /* Prevent accidental reuse. */
1028 gcc_assert (pieces.is_empty ());
1030 /* No code will be generated for erroneous types, so we won't establish
1031 an ABI mapping. */
1032 if (type == error_mark_node)
1033 return NO_ABI_IDENTITY;
1035 /* Zero-sized types disappear in the language->ABI mapping. */
1036 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1037 return NO_ABI_IDENTITY;
1039 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1040 piece p = {};
1041 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1043 machine_mode mode = TYPE_MODE_RAW (type);
1044 gcc_assert (VECTOR_MODE_P (mode)
1045 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1047 p.mode = p.orig_mode = mode;
1048 add_piece (p);
1049 return IS_PST;
1052 /* Check for user-defined PSTs. */
1053 if (TREE_CODE (type) == ARRAY_TYPE)
1054 return analyze_array (type);
1055 if (TREE_CODE (type) == RECORD_TYPE)
1056 return analyze_record (type);
1058 return ISNT_PST;
1061 /* Analyze a type that is known not to be passed or returned in memory.
1062 Return true if it has an ABI identity and is a Pure Scalable Type. */
1064 bool
1065 pure_scalable_type_info::analyze_registers (const_tree type)
1067 analysis_result result = analyze (type);
1068 gcc_assert (result != DOESNT_MATTER);
1069 return result == IS_PST;
1072 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1074 pure_scalable_type_info::analysis_result
1075 pure_scalable_type_info::analyze_array (const_tree type)
1077 /* Analyze the element type. */
1078 pure_scalable_type_info element_info;
1079 analysis_result result = element_info.analyze (TREE_TYPE (type));
1080 if (result != IS_PST)
1081 return result;
1083 /* An array of unknown, flexible or variable length will be passed and
1084 returned by reference whatever we do. */
1085 tree nelts_minus_one = array_type_nelts (type);
1086 if (!tree_fits_uhwi_p (nelts_minus_one))
1087 return DOESNT_MATTER;
1089 /* Likewise if the array is constant-sized but too big to be interesting.
1090 The double checks against MAX_PIECES are to protect against overflow. */
1091 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1092 if (count > MAX_PIECES)
1093 return DOESNT_MATTER;
1094 count += 1;
1095 if (count * element_info.pieces.length () > MAX_PIECES)
1096 return DOESNT_MATTER;
1098 /* The above checks should have weeded out elements of unknown size. */
1099 poly_uint64 element_bytes;
1100 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1101 gcc_unreachable ();
1103 /* Build up the list of individual vectors and predicates. */
1104 gcc_assert (!element_info.pieces.is_empty ());
1105 for (unsigned int i = 0; i < count; ++i)
1106 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1108 piece p = element_info.pieces[j];
1109 p.offset += i * element_bytes;
1110 add_piece (p);
1112 return IS_PST;
1115 /* Subroutine of analyze for handling RECORD_TYPEs. */
1117 pure_scalable_type_info::analysis_result
1118 pure_scalable_type_info::analyze_record (const_tree type)
1120 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1122 if (TREE_CODE (field) != FIELD_DECL)
1123 continue;
1125 /* Zero-sized fields disappear in the language->ABI mapping. */
1126 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1127 continue;
1129 /* All fields with an ABI identity must be PSTs for the record as
1130 a whole to be a PST. If any individual field is too big to be
1131 interesting then the record is too. */
1132 pure_scalable_type_info field_info;
1133 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1134 if (subresult == NO_ABI_IDENTITY)
1135 continue;
1136 if (subresult != IS_PST)
1137 return subresult;
1139 /* Since all previous fields are PSTs, we ought to be able to track
1140 the field offset using poly_ints. */
1141 tree bitpos = bit_position (field);
1142 gcc_assert (poly_int_tree_p (bitpos));
1144 /* For the same reason, it shouldn't be possible to create a PST field
1145 whose offset isn't byte-aligned. */
1146 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1147 BITS_PER_UNIT);
1149 /* Punt if the record is too big to be interesting. */
1150 poly_uint64 bytepos;
1151 if (!wide_bytepos.to_uhwi (&bytepos)
1152 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1153 return DOESNT_MATTER;
1155 /* Add the individual vectors and predicates in the field to the
1156 record's list. */
1157 gcc_assert (!field_info.pieces.is_empty ());
1158 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1160 piece p = field_info.pieces[i];
1161 p.offset += bytepos;
1162 add_piece (p);
1165 /* Empty structures disappear in the language->ABI mapping. */
1166 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1169 /* Add P to the list of pieces in the type. */
1171 void
1172 pure_scalable_type_info::add_piece (const piece &p)
1174 /* Try to fold the new piece into the previous one to form a
1175 single-mode PST. For example, if we see three consecutive vectors
1176 of the same mode, we can represent them using the corresponding
1177 3-tuple mode.
1179 This is purely an optimization. */
1180 if (!pieces.is_empty ())
1182 piece &prev = pieces.last ();
1183 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1184 unsigned int nelems1, nelems2;
1185 if (prev.orig_mode == p.orig_mode
1186 && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
1187 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1188 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1189 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1190 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1191 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1192 && targetm.array_mode (p.orig_mode,
1193 nelems1 + nelems2).exists (&prev.mode))
1195 prev.num_zr += p.num_zr;
1196 prev.num_pr += p.num_pr;
1197 return;
1200 pieces.quick_push (p);
1203 /* Return true if at least one possible value of type TYPE includes at
1204 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1206 This is a relatively expensive test for some types, so it should
1207 generally be made as late as possible. */
1209 static bool
1210 aarch64_some_values_include_pst_objects_p (const_tree type)
1212 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1213 return false;
1215 if (aarch64_sve::builtin_type_p (type))
1216 return true;
1218 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1219 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1221 if (RECORD_OR_UNION_TYPE_P (type))
1222 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1223 if (TREE_CODE (field) == FIELD_DECL
1224 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1225 return true;
1227 return false;
1230 /* Return the descriptor of the SIMD ABI. */
1232 static const predefined_function_abi &
1233 aarch64_simd_abi (void)
1235 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1236 if (!simd_abi.initialized_p ())
1238 HARD_REG_SET full_reg_clobbers
1239 = default_function_abi.full_reg_clobbers ();
1240 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1241 if (FP_SIMD_SAVED_REGNUM_P (regno))
1242 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1243 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1245 return simd_abi;
1248 /* Return the descriptor of the SVE PCS. */
1250 static const predefined_function_abi &
1251 aarch64_sve_abi (void)
1253 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1254 if (!sve_abi.initialized_p ())
1256 HARD_REG_SET full_reg_clobbers
1257 = default_function_abi.full_reg_clobbers ();
1258 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1259 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1260 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1261 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1262 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1264 return sve_abi;
1267 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
1268 wraps, otherwise return X itself. */
1270 static rtx
1271 strip_salt (rtx x)
1273 rtx search = x;
1274 if (GET_CODE (search) == CONST)
1275 search = XEXP (search, 0);
1276 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
1277 x = XVECEXP (search, 0, 0);
1278 return x;
1281 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
1282 expression. */
1284 static rtx
1285 strip_offset_and_salt (rtx addr, poly_int64 *offset)
1287 return strip_salt (strip_offset (addr, offset));
1290 /* Generate code to enable conditional branches in functions over 1 MiB. */
1291 const char *
1292 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1293 const char * branch_format)
1295 rtx_code_label * tmp_label = gen_label_rtx ();
1296 char label_buf[256];
1297 char buffer[128];
1298 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1299 CODE_LABEL_NUMBER (tmp_label));
1300 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1301 rtx dest_label = operands[pos_label];
1302 operands[pos_label] = tmp_label;
1304 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1305 output_asm_insn (buffer, operands);
1307 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1308 operands[pos_label] = dest_label;
1309 output_asm_insn (buffer, operands);
1310 return "";
1313 void
1314 aarch64_err_no_fpadvsimd (machine_mode mode)
1316 if (TARGET_GENERAL_REGS_ONLY)
1317 if (FLOAT_MODE_P (mode))
1318 error ("%qs is incompatible with the use of floating-point types",
1319 "-mgeneral-regs-only");
1320 else
1321 error ("%qs is incompatible with the use of vector types",
1322 "-mgeneral-regs-only");
1323 else
1324 if (FLOAT_MODE_P (mode))
1325 error ("%qs feature modifier is incompatible with the use of"
1326 " floating-point types", "+nofp");
1327 else
1328 error ("%qs feature modifier is incompatible with the use of"
1329 " vector types", "+nofp");
1332 /* Report when we try to do something that requires SVE when SVE is disabled.
1333 This is an error of last resort and isn't very high-quality. It usually
1334 involves attempts to measure the vector length in some way. */
1335 static void
1336 aarch64_report_sve_required (void)
1338 static bool reported_p = false;
1340 /* Avoid reporting a slew of messages for a single oversight. */
1341 if (reported_p)
1342 return;
1344 error ("this operation requires the SVE ISA extension");
1345 inform (input_location, "you can enable SVE using the command-line"
1346 " option %<-march%>, or by using the %<target%>"
1347 " attribute or pragma");
1348 reported_p = true;
1351 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1352 registers. */
1353 inline bool
1354 pr_or_ffr_regnum_p (unsigned int regno)
1356 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1359 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1360 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1361 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1362 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1363 and GENERAL_REGS is lower than the memory cost (in this case the best class
1364 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1365 cost results in bad allocations with many redundant int<->FP moves which
1366 are expensive on various cores.
1367 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1368 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1369 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1370 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1371 The result of this is that it is no longer inefficient to have a higher
1372 memory move cost than the register move cost.
1375 static reg_class_t
1376 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1377 reg_class_t best_class)
1379 machine_mode mode;
1381 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1382 || !reg_class_subset_p (FP_REGS, allocno_class))
1383 return allocno_class;
1385 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1386 || !reg_class_subset_p (FP_REGS, best_class))
1387 return best_class;
1389 mode = PSEUDO_REGNO_MODE (regno);
1390 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1393 static unsigned int
1394 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1396 if (GET_MODE_UNIT_SIZE (mode) == 4)
1397 return aarch64_tune_params.min_div_recip_mul_sf;
1398 return aarch64_tune_params.min_div_recip_mul_df;
1401 /* Return the reassociation width of treeop OPC with mode MODE. */
1402 static int
1403 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1405 if (VECTOR_MODE_P (mode))
1406 return aarch64_tune_params.vec_reassoc_width;
1407 if (INTEGRAL_MODE_P (mode))
1408 return aarch64_tune_params.int_reassoc_width;
1409 /* Reassociation reduces the number of FMAs which may result in worse
1410 performance. Use a per-CPU setting for FMA reassociation which allows
1411 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
1412 CPUs with many FP pipes to enable reassociation.
1413 Since the reassociation pass doesn't understand FMA at all, assume
1414 that any FP addition might turn into FMA. */
1415 if (FLOAT_MODE_P (mode))
1416 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
1417 : aarch64_tune_params.fp_reassoc_width;
1418 return 1;
1421 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1422 unsigned
1423 aarch64_debugger_regno (unsigned regno)
1425 if (GP_REGNUM_P (regno))
1426 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1427 else if (regno == SP_REGNUM)
1428 return AARCH64_DWARF_SP;
1429 else if (FP_REGNUM_P (regno))
1430 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1431 else if (PR_REGNUM_P (regno))
1432 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1433 else if (regno == VG_REGNUM)
1434 return AARCH64_DWARF_VG;
1436 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1437 equivalent DWARF register. */
1438 return DWARF_FRAME_REGISTERS;
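/* For example, general register x5 maps to AARCH64_DWARF_R0 + 5 and vector
   register v5 to AARCH64_DWARF_V0 + 5; any register not handled above is
   reported as DWARF_FRAME_REGISTERS, i.e. as having no DWARF equivalent.  */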
1441 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
1442 static machine_mode
1443 aarch64_dwarf_frame_reg_mode (int regno)
1445 /* Predicate registers are call-clobbered in the EH ABI (which is
1446 ARM_PCS_AAPCS64), so they should not be described by CFI.
1447 Their size changes as VL changes, so any values computed by
1448 __builtin_init_dwarf_reg_size_table might not be valid for
1449 all frames. */
1450 if (PR_REGNUM_P (regno))
1451 return VOIDmode;
1452 return default_dwarf_frame_reg_mode (regno);
1455 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1456 integer, otherwise return X unmodified. */
1457 static rtx
1458 aarch64_bit_representation (rtx x)
1460 if (CONST_DOUBLE_P (x))
1461 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1462 return x;
1465 /* Return an estimate for the number of quadwords in an SVE vector. This is
1466 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
1467 static unsigned int
1468 aarch64_estimated_sve_vq ()
1470 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
1473 /* Return true if MODE is an SVE predicate mode. */
1474 static bool
1475 aarch64_sve_pred_mode_p (machine_mode mode)
1477 return (TARGET_SVE
1478 && (mode == VNx16BImode
1479 || mode == VNx8BImode
1480 || mode == VNx4BImode
1481 || mode == VNx2BImode));
1484 /* Three mutually-exclusive flags describing a vector or predicate type. */
1485 const unsigned int VEC_ADVSIMD = 1;
1486 const unsigned int VEC_SVE_DATA = 2;
1487 const unsigned int VEC_SVE_PRED = 4;
1488 /* Indicates a structure of 2, 3 or 4 vectors or predicates. */
1489 const unsigned int VEC_STRUCT = 8;
1490 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1491 vector has fewer significant bytes than a full SVE vector. */
1492 const unsigned int VEC_PARTIAL = 16;
1493 /* Useful combinations of the above. */
1494 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1495 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1497 /* Return a set of flags describing the vector properties of mode MODE.
1498 If ANY_TARGET_P is false (the default), ignore modes that are not supported
1499 by the current target. Otherwise categorize the modes that can be used
1500 with the set of all targets supported by the port. */
1502 static unsigned int
1503 aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
1505 if (aarch64_sve_pred_mode_p (mode))
1506 return VEC_SVE_PRED;
1508 /* Make the decision based on the mode's enum value rather than its
1509 properties, so that we keep the correct classification regardless
1510 of -msve-vector-bits. */
1511 switch (mode)
1513 /* Partial SVE QI vectors. */
1514 case E_VNx2QImode:
1515 case E_VNx4QImode:
1516 case E_VNx8QImode:
1517 /* Partial SVE HI vectors. */
1518 case E_VNx2HImode:
1519 case E_VNx4HImode:
1520 /* Partial SVE SI vector. */
1521 case E_VNx2SImode:
1522 /* Partial SVE HF vectors. */
1523 case E_VNx2HFmode:
1524 case E_VNx4HFmode:
1525 /* Partial SVE BF vectors. */
1526 case E_VNx2BFmode:
1527 case E_VNx4BFmode:
1528 /* Partial SVE SF vector. */
1529 case E_VNx2SFmode:
1530 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1532 case E_VNx16QImode:
1533 case E_VNx8HImode:
1534 case E_VNx4SImode:
1535 case E_VNx2DImode:
1536 case E_VNx8BFmode:
1537 case E_VNx8HFmode:
1538 case E_VNx4SFmode:
1539 case E_VNx2DFmode:
1540 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;
1542 /* x2 SVE vectors. */
1543 case E_VNx32QImode:
1544 case E_VNx16HImode:
1545 case E_VNx8SImode:
1546 case E_VNx4DImode:
1547 case E_VNx16BFmode:
1548 case E_VNx16HFmode:
1549 case E_VNx8SFmode:
1550 case E_VNx4DFmode:
1551 /* x3 SVE vectors. */
1552 case E_VNx48QImode:
1553 case E_VNx24HImode:
1554 case E_VNx12SImode:
1555 case E_VNx6DImode:
1556 case E_VNx24BFmode:
1557 case E_VNx24HFmode:
1558 case E_VNx12SFmode:
1559 case E_VNx6DFmode:
1560 /* x4 SVE vectors. */
1561 case E_VNx64QImode:
1562 case E_VNx32HImode:
1563 case E_VNx16SImode:
1564 case E_VNx8DImode:
1565 case E_VNx32BFmode:
1566 case E_VNx32HFmode:
1567 case E_VNx16SFmode:
1568 case E_VNx8DFmode:
1569 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;
1571 case E_OImode:
1572 case E_CImode:
1573 case E_XImode:
1574 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1576 /* Structures of 64-bit Advanced SIMD vectors. */
1577 case E_V2x8QImode:
1578 case E_V2x4HImode:
1579 case E_V2x2SImode:
1580 case E_V2x1DImode:
1581 case E_V2x4BFmode:
1582 case E_V2x4HFmode:
1583 case E_V2x2SFmode:
1584 case E_V2x1DFmode:
1585 case E_V3x8QImode:
1586 case E_V3x4HImode:
1587 case E_V3x2SImode:
1588 case E_V3x1DImode:
1589 case E_V3x4BFmode:
1590 case E_V3x4HFmode:
1591 case E_V3x2SFmode:
1592 case E_V3x1DFmode:
1593 case E_V4x8QImode:
1594 case E_V4x4HImode:
1595 case E_V4x2SImode:
1596 case E_V4x1DImode:
1597 case E_V4x4BFmode:
1598 case E_V4x4HFmode:
1599 case E_V4x2SFmode:
1600 case E_V4x1DFmode:
1601 return (TARGET_FLOAT || any_target_p)
1602 ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
1604 /* Structures of 128-bit Advanced SIMD vectors. */
1605 case E_V2x16QImode:
1606 case E_V2x8HImode:
1607 case E_V2x4SImode:
1608 case E_V2x2DImode:
1609 case E_V2x8BFmode:
1610 case E_V2x8HFmode:
1611 case E_V2x4SFmode:
1612 case E_V2x2DFmode:
1613 case E_V3x16QImode:
1614 case E_V3x8HImode:
1615 case E_V3x4SImode:
1616 case E_V3x2DImode:
1617 case E_V3x8BFmode:
1618 case E_V3x8HFmode:
1619 case E_V3x4SFmode:
1620 case E_V3x2DFmode:
1621 case E_V4x16QImode:
1622 case E_V4x8HImode:
1623 case E_V4x4SImode:
1624 case E_V4x2DImode:
1625 case E_V4x8BFmode:
1626 case E_V4x8HFmode:
1627 case E_V4x4SFmode:
1628 case E_V4x2DFmode:
1629 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1631 /* 64-bit Advanced SIMD vectors. */
1632 case E_V8QImode:
1633 case E_V4HImode:
1634 case E_V2SImode:
1635 case E_V1DImode:
1636 case E_V4HFmode:
1637 case E_V4BFmode:
1638 case E_V2SFmode:
1639 case E_V1DFmode:
1640 /* 128-bit Advanced SIMD vectors. */
1641 case E_V16QImode:
1642 case E_V8HImode:
1643 case E_V4SImode:
1644 case E_V2DImode:
1645 case E_V8HFmode:
1646 case E_V8BFmode:
1647 case E_V4SFmode:
1648 case E_V2DFmode:
1649 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;
1651 case E_VNx32BImode:
1652 return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;
1654 default:
1655 return 0;
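/* A few illustrative classifications, assuming the relevant target features
   are enabled: V4SImode -> VEC_ADVSIMD, VNx4SImode -> VEC_SVE_DATA,
   VNx2SImode -> VEC_SVE_DATA | VEC_PARTIAL,
   VNx8SImode -> VEC_SVE_DATA | VEC_STRUCT,
   V2x4SImode -> VEC_ADVSIMD | VEC_STRUCT,
   and unsupported or scalar modes classify as 0.  */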
1659 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1660 bool
1661 aarch64_advsimd_struct_mode_p (machine_mode mode)
1663 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1664 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
1667 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1668 static bool
1669 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
1671 return (aarch64_classify_vector_mode (mode)
1672 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
1675 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1676 static bool
1677 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
1679 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
1682 /* Return true if MODE is any of the data vector modes, including
1683 structure modes. */
1684 static bool
1685 aarch64_vector_data_mode_p (machine_mode mode)
1687 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1690 /* Return true if MODE is any form of SVE mode, including predicates,
1691 vectors and structures. */
1692 bool
1693 aarch64_sve_mode_p (machine_mode mode)
1695 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1698 /* Return true if MODE is an SVE data vector mode; either a single vector
1699 or a structure of vectors. */
1700 static bool
1701 aarch64_sve_data_mode_p (machine_mode mode)
1703 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1706 /* Return the number of defined bytes in one constituent vector of
1707 SVE mode MODE, which has vector flags VEC_FLAGS. */
1708 static poly_int64
1709 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1711 if (vec_flags & VEC_PARTIAL)
1712 /* A single partial vector. */
1713 return GET_MODE_SIZE (mode);
1715 if (vec_flags & VEC_SVE_DATA)
1716 /* A single vector or a tuple. */
1717 return BYTES_PER_SVE_VECTOR;
1719 /* A single predicate. */
1720 gcc_assert (vec_flags & VEC_SVE_PRED);
1721 return BYTES_PER_SVE_PRED;
1724 /* If MODE holds an array of vectors, return the number of vectors
1725 in the array, otherwise return 1. */
1727 static unsigned int
1728 aarch64_ldn_stn_vectors (machine_mode mode)
1730 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1731 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
1732 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
1733 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
1734 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
1735 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
1736 return exact_div (GET_MODE_SIZE (mode),
1737 BYTES_PER_SVE_VECTOR).to_constant ();
1738 return 1;
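/* For example, V2x8QImode (two 8-byte Advanced SIMD vectors) gives 2,
   V3x16QImode (three 16-byte vectors) gives 3, VNx8SImode (two SVE
   vectors) gives 2, and any single-vector mode gives 1.  */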
1741 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1742 corresponding vector structure mode. */
1743 static opt_machine_mode
1744 aarch64_advsimd_vector_array_mode (machine_mode mode,
1745 unsigned HOST_WIDE_INT nelems)
1747 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
1748 if (known_eq (GET_MODE_SIZE (mode), 8))
1749 flags |= VEC_PARTIAL;
1751 machine_mode struct_mode;
1752 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
1753 if (aarch64_classify_vector_mode (struct_mode) == flags
1754 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
1755 && known_eq (GET_MODE_NUNITS (struct_mode),
1756 GET_MODE_NUNITS (mode) * nelems))
1757 return struct_mode;
1758 return opt_machine_mode ();
1761 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1763 opt_machine_mode
1764 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1766 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1767 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1768 machine_mode mode;
1769 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1770 if (inner_mode == GET_MODE_INNER (mode)
1771 && known_eq (nunits, GET_MODE_NUNITS (mode))
1772 && aarch64_sve_data_mode_p (mode))
1773 return mode;
1774 return opt_machine_mode ();
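/* A worked example for the function above (illustrative): with SImode
   elements, asking for the full number of 32-bit lanes should give
   VNx4SImode, while asking for half that number should give the partial
   vector mode VNx2SImode, in which each 32-bit value occupies a 64-bit
   container.  */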
1777 /* Implement target hook TARGET_ARRAY_MODE. */
1778 static opt_machine_mode
1779 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1781 if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1783 /* Use VNx32BI for pairs of predicates, but explicitly reject giving
1784 a mode to other array sizes. Using integer modes requires a round
1785 trip through memory and generates terrible code. */
1786 if (nelems == 1)
1787 return mode;
1788 if (mode == VNx16BImode && nelems == 2)
1789 return VNx32BImode;
1790 return BLKmode;
1793 auto flags = aarch64_classify_vector_mode (mode);
1794 if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
1795 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
1796 GET_MODE_NUNITS (mode) * nelems);
1798 if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
1799 return aarch64_advsimd_vector_array_mode (mode, nelems);
1801 return opt_machine_mode ();
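/* Illustrative examples: an array of two VNx4SImode vectors maps to the
   SVE tuple mode VNx8SImode, an array of two V4SImode vectors maps to
   V2x4SImode, and a pair of VNx16BImode predicates maps to VNx32BImode,
   as handled explicitly above.  */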
1804 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1805 static bool
1806 aarch64_array_mode_supported_p (machine_mode mode,
1807 unsigned HOST_WIDE_INT nelems)
1809 if (TARGET_BASE_SIMD
1810 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1811 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1812 && (nelems >= 2 && nelems <= 4))
1813 return true;
1815 return false;
1818 /* MODE is some form of SVE vector mode. For data modes, return the number
1819 of vector register bits that each element of MODE occupies, such as 64
1820 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1821 in a 64-bit container). For predicate modes, return the number of
1822 data bits controlled by each significant predicate bit. */
1824 static unsigned int
1825 aarch64_sve_container_bits (machine_mode mode)
1827 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1828 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1829 ? BITS_PER_SVE_VECTOR
1830 : GET_MODE_BITSIZE (mode));
1831 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
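/* Worked examples (illustrative): VNx4SImode gives 32, the partial
   vector mode VNx2SImode gives 64 (each 32-bit value lives in a 64-bit
   container), and the predicate mode VNx2BImode gives 64 (each
   significant predicate bit controls 64 data bits).  */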
1834 /* Return the SVE predicate mode to use for elements that have
1835 ELEM_NBYTES bytes, if such a mode exists. */
1837 opt_machine_mode
1838 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1840 if (TARGET_SVE)
1842 if (elem_nbytes == 1)
1843 return VNx16BImode;
1844 if (elem_nbytes == 2)
1845 return VNx8BImode;
1846 if (elem_nbytes == 4)
1847 return VNx4BImode;
1848 if (elem_nbytes == 8)
1849 return VNx2BImode;
1851 return opt_machine_mode ();
1854 /* Return the SVE predicate mode that should be used to control
1855 SVE mode MODE. */
1857 machine_mode
1858 aarch64_sve_pred_mode (machine_mode mode)
1860 unsigned int bits = aarch64_sve_container_bits (mode);
1861 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1864 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1866 static opt_machine_mode
1867 aarch64_get_mask_mode (machine_mode mode)
1869 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1870 if (vec_flags & VEC_SVE_DATA)
1871 return aarch64_sve_pred_mode (mode);
1873 return default_get_mask_mode (mode);
1876 /* Return the integer element mode associated with SVE mode MODE. */
1878 static scalar_int_mode
1879 aarch64_sve_element_int_mode (machine_mode mode)
1881 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1882 ? BITS_PER_SVE_VECTOR
1883 : GET_MODE_BITSIZE (mode));
1884 unsigned int elt_bits = vector_element_size (vector_bits,
1885 GET_MODE_NUNITS (mode));
1886 return int_mode_for_size (elt_bits, 0).require ();
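/* Illustrative examples: VNx4SFmode maps to SImode, VNx8HFmode maps to
   HImode, and the predicate mode VNx8BImode also maps to HImode, since
   each predicate element stands for 16 data bits.  */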
1889 /* Return an integer element mode that contains exactly
1890 aarch64_sve_container_bits (MODE) bits. This is wider than
1891 aarch64_sve_element_int_mode if MODE is a partial vector,
1892 otherwise it's the same. */
1894 static scalar_int_mode
1895 aarch64_sve_container_int_mode (machine_mode mode)
1897 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1900 /* Return the integer vector mode associated with SVE mode MODE.
1901 Unlike related_int_vector_mode, this can handle the case in which
1902 MODE is a predicate (and thus has a different total size). */
1904 machine_mode
1905 aarch64_sve_int_mode (machine_mode mode)
1907 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1908 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1911 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1913 static opt_machine_mode
1914 aarch64_vectorize_related_mode (machine_mode vector_mode,
1915 scalar_mode element_mode,
1916 poly_uint64 nunits)
1918 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1920 /* If we're operating on SVE vectors, try to return an SVE mode. */
1921 poly_uint64 sve_nunits;
1922 if ((vec_flags & VEC_SVE_DATA)
1923 && multiple_p (BYTES_PER_SVE_VECTOR,
1924 GET_MODE_SIZE (element_mode), &sve_nunits))
1926 machine_mode sve_mode;
1927 if (maybe_ne (nunits, 0U))
1929 /* Try to find a full or partial SVE mode with exactly
1930 NUNITS units. */
1931 if (multiple_p (sve_nunits, nunits)
1932 && aarch64_sve_data_mode (element_mode,
1933 nunits).exists (&sve_mode))
1934 return sve_mode;
1936 else
1938 /* Take the preferred number of units from the number of bytes
1939 that fit in VECTOR_MODE. We always start by "autodetecting"
1940 a full vector mode with preferred_simd_mode, so vectors
1941 chosen here will also be full vector modes. Then
1942 autovectorize_vector_modes tries smaller starting modes
1943 and thus smaller preferred numbers of units. */
1944 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1945 if (aarch64_sve_data_mode (element_mode,
1946 sve_nunits).exists (&sve_mode))
1947 return sve_mode;
1951 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1952 if (TARGET_SIMD
1953 && (vec_flags & VEC_ADVSIMD)
1954 && known_eq (nunits, 0U)
1955 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1956 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1957 * GET_MODE_NUNITS (vector_mode), 128U))
1959 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1960 if (VECTOR_MODE_P (res))
1961 return res;
1964 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1967 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
1969 static bool
1970 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
1972 machine_mode mode = TYPE_MODE (type);
1973 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1974 bool sve_p = (vec_flags & VEC_ANY_SVE);
1975 bool simd_p = (vec_flags & VEC_ADVSIMD);
1977 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
1980 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1981 prefer to use the first arithmetic operand as the else value if
1982 the else value doesn't matter, since that exactly matches the SVE
1983 destructive merging form. For ternary operations we could either
1984 pick the first operand and use FMAD-like instructions or the last
1985 operand and use FMLA-like instructions; the latter seems more
1986 natural. */
1988 static tree
1989 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1991 return nops == 3 ? ops[2] : ops[0];
1994 /* Implement TARGET_HARD_REGNO_NREGS. */
1996 static unsigned int
1997 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1999 /* ??? Logically we should only need to provide a value when
2000 HARD_REGNO_MODE_OK says that the combination is valid,
2001 but at the moment we need to handle all modes. Just ignore
2002 any runtime parts for registers that can't store them. */
2003 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2004 switch (aarch64_regno_regclass (regno))
2006 case FP_REGS:
2007 case FP_LO_REGS:
2008 case FP_LO8_REGS:
2010 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2011 if (vec_flags & VEC_SVE_DATA)
2012 return exact_div (GET_MODE_SIZE (mode),
2013 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2014 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
2015 return GET_MODE_SIZE (mode).to_constant () / 8;
2016 return CEIL (lowest_size, UNITS_PER_VREG);
2019 case PR_REGS:
2020 case PR_LO_REGS:
2021 case PR_HI_REGS:
2022 return mode == VNx32BImode ? 2 : 1;
2024 case FFR_REGS:
2025 case PR_AND_FFR_REGS:
2026 case FAKE_REGS:
2027 return 1;
2029 default:
2030 return CEIL (lowest_size, UNITS_PER_WORD);
2032 gcc_unreachable ();
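/* Worked examples for the function above (illustrative): TImode in
   general registers needs CEIL (16, UNITS_PER_WORD) == 2 registers,
   V16QImode in an FP register needs 1, the SVE tuple mode VNx32QImode
   needs 2 vector registers, and VNx32BImode needs 2 predicate
   registers.  */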
2035 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2037 static bool
2038 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2040 if (mode == V8DImode)
2041 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
2042 && multiple_p (regno - R0_REGNUM, 2);
2044 if (GET_MODE_CLASS (mode) == MODE_CC)
2045 return regno == CC_REGNUM;
2047 if (regno == VG_REGNUM)
2048 /* This must have the same size as _Unwind_Word. */
2049 return mode == DImode;
2051 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2052 if (vec_flags == VEC_SVE_PRED)
2053 return pr_or_ffr_regnum_p (regno);
2055 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
2056 return PR_REGNUM_P (regno);
2058 if (pr_or_ffr_regnum_p (regno))
2059 return false;
2061 /* These registers are abstract; their modes don't matter. */
2062 if (FAKE_REGNUM_P (regno))
2063 return true;
2065 if (regno == SP_REGNUM)
2066 /* The purpose of comparing with ptr_mode is to support the
2067 global register variable associated with the stack pointer
2068 register via the syntax of asm ("wsp") in ILP32. */
2069 return mode == Pmode || mode == ptr_mode;
2071 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2072 return mode == Pmode;
2074 if (GP_REGNUM_P (regno))
2076 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
2077 return false;
2078 if (known_le (GET_MODE_SIZE (mode), 8))
2079 return true;
2080 if (known_le (GET_MODE_SIZE (mode), 16))
2081 return (regno & 1) == 0;
2083 else if (FP_REGNUM_P (regno))
2085 if (vec_flags & VEC_STRUCT)
2086 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2087 else
2088 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2091 return false;
2094 /* Return true if a function with type FNTYPE returns its value in
2095 SVE vector or predicate registers. */
2097 static bool
2098 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2100 tree return_type = TREE_TYPE (fntype);
2102 pure_scalable_type_info pst_info;
2103 switch (pst_info.analyze (return_type))
2105 case pure_scalable_type_info::IS_PST:
2106 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2107 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2109 case pure_scalable_type_info::DOESNT_MATTER:
2110 gcc_assert (aarch64_return_in_memory_1 (return_type));
2111 return false;
2113 case pure_scalable_type_info::NO_ABI_IDENTITY:
2114 case pure_scalable_type_info::ISNT_PST:
2115 return false;
2117 gcc_unreachable ();
2120 /* Return true if a function with type FNTYPE takes arguments in
2121 SVE vector or predicate registers. */
2123 static bool
2124 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2126 CUMULATIVE_ARGS args_so_far_v;
2127 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2128 NULL_TREE, 0, true);
2129 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2131 for (tree chain = TYPE_ARG_TYPES (fntype);
2132 chain && chain != void_list_node;
2133 chain = TREE_CHAIN (chain))
2135 tree arg_type = TREE_VALUE (chain);
2136 if (arg_type == error_mark_node)
2137 return false;
2139 function_arg_info arg (arg_type, /*named=*/true);
2140 apply_pass_by_reference_rules (&args_so_far_v, arg);
2141 pure_scalable_type_info pst_info;
2142 if (pst_info.analyze_registers (arg.type))
2144 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2145 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2146 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2147 return true;
2150 targetm.calls.function_arg_advance (args_so_far, arg);
2152 return false;
2155 /* Implement TARGET_FNTYPE_ABI. */
2157 static const predefined_function_abi &
2158 aarch64_fntype_abi (const_tree fntype)
2160 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2161 return aarch64_simd_abi ();
2163 if (aarch64_returns_value_in_sve_regs_p (fntype)
2164 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2165 return aarch64_sve_abi ();
2167 return default_function_abi;
2170 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2172 static aarch64_feature_flags
2173 aarch64_fntype_pstate_sm (const_tree fntype)
2175 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
2176 return AARCH64_FL_SM_ON;
2178 if (lookup_attribute ("arm", "streaming_compatible",
2179 TYPE_ATTRIBUTES (fntype)))
2180 return 0;
2182 return AARCH64_FL_SM_OFF;
2185 /* Return state flags that describe whether and how functions of type
2186 FNTYPE share state STATE_NAME with their callers. */
2188 static unsigned int
2189 aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
2191 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
2192 state_name);
2195 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2197 static aarch64_feature_flags
2198 aarch64_fntype_pstate_za (const_tree fntype)
2200 if (aarch64_fntype_shared_flags (fntype, "za")
2201 || aarch64_fntype_shared_flags (fntype, "zt0"))
2202 return AARCH64_FL_ZA_ON;
2204 return 0;
2207 /* Return the ISA mode on entry to functions of type FNTYPE. */
2209 static aarch64_feature_flags
2210 aarch64_fntype_isa_mode (const_tree fntype)
2212 return (aarch64_fntype_pstate_sm (fntype)
2213 | aarch64_fntype_pstate_za (fntype));
2216 /* Return true if FNDECL uses streaming mode internally, as an
2217 implementation choice. */
2219 static bool
2220 aarch64_fndecl_is_locally_streaming (const_tree fndecl)
2222 return lookup_attribute ("arm", "locally_streaming",
2223 DECL_ATTRIBUTES (fndecl));
2226 /* Return the state of PSTATE.SM when compiling the body of
2227 function FNDECL. This might be different from the state of
2228 PSTATE.SM on entry. */
2230 static aarch64_feature_flags
2231 aarch64_fndecl_pstate_sm (const_tree fndecl)
2233 if (aarch64_fndecl_is_locally_streaming (fndecl))
2234 return AARCH64_FL_SM_ON;
2236 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
2239 /* Return true if function FNDECL has state STATE_NAME, either by creating
2240 new state itself or by sharing state with callers. */
2242 static bool
2243 aarch64_fndecl_has_state (tree fndecl, const char *state_name)
2245 return (aarch64_fndecl_has_new_state (fndecl, state_name)
2246 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
2247 state_name) != 0);
2250 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2251 This might be different from the state of PSTATE.ZA on entry. */
2253 static aarch64_feature_flags
2254 aarch64_fndecl_pstate_za (const_tree fndecl)
2256 if (aarch64_fndecl_has_new_state (fndecl, "za")
2257 || aarch64_fndecl_has_new_state (fndecl, "zt0"))
2258 return AARCH64_FL_ZA_ON;
2260 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
2263 /* Return the ISA mode that should be used to compile the body of
2264 function FNDECL. */
2266 static aarch64_feature_flags
2267 aarch64_fndecl_isa_mode (const_tree fndecl)
2269 return (aarch64_fndecl_pstate_sm (fndecl)
2270 | aarch64_fndecl_pstate_za (fndecl));
2273 /* Return the state of PSTATE.SM on entry to the current function.
2274 This might be different from the state of PSTATE.SM in the function
2275 body. */
2277 static aarch64_feature_flags
2278 aarch64_cfun_incoming_pstate_sm ()
2280 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
2283 /* Return the state of PSTATE.ZA on entry to the current function.
2284 This might be different from the state of PSTATE.ZA in the function
2285 body. */
2287 static aarch64_feature_flags
2288 aarch64_cfun_incoming_pstate_za ()
2290 return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
2293 /* Return state flags that describe whether and how the current function shares
2294 state STATE_NAME with callers. */
2296 static unsigned int
2297 aarch64_cfun_shared_flags (const char *state_name)
2299 return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
2302 /* Return true if the current function creates new state of type STATE_NAME
2303 (as opposed to sharing the state with its callers or ignoring the state
2304 altogether). */
2306 static bool
2307 aarch64_cfun_has_new_state (const char *state_name)
2309 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
2312 /* Return true if PSTATE.SM is 1 in the body of the current function,
2313 but is not guaranteed to be 1 on entry. */
2315 static bool
2316 aarch64_cfun_enables_pstate_sm ()
2318 return (aarch64_fndecl_is_locally_streaming (cfun->decl)
2319 && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
2322 /* Return true if the current function has state STATE_NAME, either by
2323 creating new state itself or by sharing state with callers. */
2325 static bool
2326 aarch64_cfun_has_state (const char *state_name)
2328 return aarch64_fndecl_has_state (cfun->decl, state_name);
2331 /* Return true if a call from the current function to a function with
2332 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2333 the BL instruction. */
2335 static bool
2336 aarch64_call_switches_pstate_sm (aarch64_feature_flags callee_mode)
2338 return (callee_mode & ~AARCH64_ISA_MODE & AARCH64_FL_SM_STATE) != 0;
2341 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2343 static bool
2344 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2346 return (aarch64_sve::builtin_type_p (type1)
2347 == aarch64_sve::builtin_type_p (type2));
2350 /* Return true if we should emit CFI for register REGNO. */
2352 static bool
2353 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2355 return (GP_REGNUM_P (regno)
2356 || !default_function_abi.clobbers_full_reg_p (regno));
2359 /* Return the mode we should use to save and restore register REGNO. */
2361 static machine_mode
2362 aarch64_reg_save_mode (unsigned int regno)
2364 if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
2365 return DImode;
2367 if (FP_REGNUM_P (regno))
2368 switch (crtl->abi->id ())
2370 case ARM_PCS_AAPCS64:
2371 /* Only the low 64 bits are saved by the base PCS. */
2372 return DFmode;
2374 case ARM_PCS_SIMD:
2375 /* The vector PCS saves the low 128 bits (which is the full
2376 register on non-SVE targets). */
2377 return V16QImode;
2379 case ARM_PCS_SVE:
2380 /* Use vectors of DImode for registers that need frame
2381 information, so that the first 64 bits of the save slot
2382 are always the equivalent of what storing D<n> would give. */
2383 if (aarch64_emit_cfi_for_reg_p (regno))
2384 return VNx2DImode;
2386 /* Use vectors of bytes otherwise, so that the layout is
2387 endian-agnostic, and so that we can use LDR and STR for
2388 big-endian targets. */
2389 return VNx16QImode;
2391 case ARM_PCS_TLSDESC:
2392 case ARM_PCS_UNKNOWN:
2393 break;
2396 if (PR_REGNUM_P (regno))
2397 /* Save the full predicate register. */
2398 return VNx16BImode;
2400 gcc_unreachable ();
2403 /* Given the ISA mode on entry to a callee and the ABI of the callee,
2404 return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx. */
2407 aarch64_gen_callee_cookie (aarch64_feature_flags isa_mode, arm_pcs pcs_variant)
2409 return gen_int_mode ((unsigned int) isa_mode
2410 | (unsigned int) pcs_variant << AARCH64_NUM_ISA_MODES,
2411 DImode);
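/* Illustrative sketch of the cookie layout built above: the low
   AARCH64_NUM_ISA_MODES bits hold the callee's ISA mode and the
   remaining bits hold the arm_pcs value, so aarch64_callee_abi and
   aarch64_callee_isa_mode below recover their results by shifting and
   masking the same CONST_INT.  */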
2414 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2415 callee's ABI. */
2417 static const predefined_function_abi &
2418 aarch64_callee_abi (rtx cookie)
2420 return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
2423 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2424 required ISA mode on entry to the callee, which is also the ISA
2425 mode on return from the callee. */
2427 static aarch64_feature_flags
2428 aarch64_callee_isa_mode (rtx cookie)
2430 return UINTVAL (cookie) & AARCH64_FL_ISA_MODES;
2433 /* INSN is a call instruction. Return the CONST_INT stored in its
2434 UNSPEC_CALLEE_ABI rtx. */
2436 static rtx
2437 aarch64_insn_callee_cookie (const rtx_insn *insn)
2439 rtx pat = PATTERN (insn);
2440 gcc_assert (GET_CODE (pat) == PARALLEL);
2441 rtx unspec = XVECEXP (pat, 0, 1);
2442 gcc_assert (GET_CODE (unspec) == UNSPEC
2443 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2444 return XVECEXP (unspec, 0, 0);
2447 /* Implement TARGET_INSN_CALLEE_ABI. */
2449 const predefined_function_abi &
2450 aarch64_insn_callee_abi (const rtx_insn *insn)
2452 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
2455 /* INSN is a call instruction. Return the required ISA mode on entry to
2456 the callee, which is also the ISA mode on return from the callee. */
2458 static aarch64_feature_flags
2459 aarch64_insn_callee_isa_mode (const rtx_insn *insn)
2461 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
2464 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2465 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2466 clobbers the top 64 bits when restoring the bottom 64 bits. */
2468 static bool
2469 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2470 unsigned int regno,
2471 machine_mode mode)
2473 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2475 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2476 unsigned int nregs = hard_regno_nregs (regno, mode);
2477 if (nregs > 1)
2478 per_register_size = exact_div (per_register_size, nregs);
2479 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2480 return maybe_gt (per_register_size, 16);
2481 return maybe_gt (per_register_size, 8);
2483 return false;
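/* For example (illustrative): under the base PCS, V16QImode in a vector
   register is partially clobbered because only the low 64 bits are
   preserved, whereas under ARM_PCS_SIMD the low 128 bits are preserved
   and the function above returns false for the same mode.  */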
2486 /* Implement REGMODE_NATURAL_SIZE. */
2487 poly_uint64
2488 aarch64_regmode_natural_size (machine_mode mode)
2490 /* The natural size for SVE data modes is one SVE data vector,
2491 and similarly for predicates. We can't independently modify
2492 anything smaller than that. */
2493 /* ??? For now, only do this for variable-width SVE registers.
2494 Doing it for constant-sized registers breaks lower-subreg.cc. */
2495 /* ??? And once that's fixed, we should probably have similar
2496 code for Advanced SIMD. */
2497 if (!aarch64_sve_vg.is_constant ())
2499 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2500 if (vec_flags & VEC_SVE_PRED)
2501 return BYTES_PER_SVE_PRED;
2502 if (vec_flags & VEC_SVE_DATA)
2503 return BYTES_PER_SVE_VECTOR;
2505 return UNITS_PER_WORD;
2508 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2509 machine_mode
2510 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2511 machine_mode mode)
2513 /* The predicate mode determines which bits are significant and
2514 which are "don't care". Decreasing the number of lanes would
2515 lose data while increasing the number of lanes would make bits
2516 unnecessarily significant. */
2517 if (PR_REGNUM_P (regno))
2518 return mode;
2519 if (known_ge (GET_MODE_SIZE (mode), 4))
2520 return mode;
2521 else
2522 return SImode;
2525 /* Return true if I's bits are consecutive ones from the MSB. */
2526 bool
2527 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2529 return exact_log2 (-i) != HOST_WIDE_INT_M1;
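/* A worked example (illustrative): for I == 0xffff000000000000, -I is
   0x0001000000000000, a power of two, so the function above returns
   true; for I == 0xff0f000000000000, -I is not a power of two and the
   result is false.  */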
2532 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2533 that strcpy from constants will be faster. */
2535 static HOST_WIDE_INT
2536 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2538 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2539 return MAX (align, BITS_PER_WORD);
2540 return align;
2543 /* Return true if calls to DECL should be treated as
2544 long-calls (i.e. called via a register). */
2545 static bool
2546 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2548 return false;
2551 /* Return true if calls to symbol-ref SYM should be treated as
2552 long-calls (i.e. called via a register). */
2553 bool
2554 aarch64_is_long_call_p (rtx sym)
2556 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2559 /* Return true if calls to symbol-ref SYM should not go through
2560 plt stubs. */
2562 bool
2563 aarch64_is_noplt_call_p (rtx sym)
2565 const_tree decl = SYMBOL_REF_DECL (sym);
2567 if (flag_pic
2568 && decl
2569 && (!flag_plt
2570 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2571 && !targetm.binds_local_p (decl))
2572 return true;
2574 return false;
2577 /* Emit an insn that's a simple single-set. Both the operands must be
2578 known to be valid. */
2579 inline static rtx_insn *
2580 emit_set_insn (rtx x, rtx y)
2582 return emit_insn (gen_rtx_SET (x, y));
2585 /* X and Y are two things to compare using CODE. Emit the compare insn and
2586 return the rtx for the CC register in the proper mode. */
2588 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2590 machine_mode cmp_mode = GET_MODE (x);
2591 machine_mode cc_mode;
2592 rtx cc_reg;
2594 if (cmp_mode == TImode)
2596 gcc_assert (code == NE);
2598 cc_mode = CCmode;
2599 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2601 rtx x_lo = operand_subword (x, 0, 0, TImode);
2602 rtx y_lo = operand_subword (y, 0, 0, TImode);
2603 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2605 rtx x_hi = operand_subword (x, 1, 0, TImode);
2606 rtx y_hi = operand_subword (y, 1, 0, TImode);
2607 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2608 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2609 GEN_INT (AARCH64_EQ)));
2611 else
2613 cc_mode = SELECT_CC_MODE (code, x, y);
2614 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2615 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2617 return cc_reg;
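/* Illustrative note on the TImode path above: an NE comparison of two
   TImode values is expanded as a CMP of the low halves followed by a
   CCMP of the high halves, leaving the combined result in the condition
   flags rather than using a wider compare.  */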
2620 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2622 static rtx
2623 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2624 machine_mode y_mode)
2626 if (y_mode == E_QImode || y_mode == E_HImode)
2628 if (CONST_INT_P (y))
2630 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2631 y_mode = SImode;
2633 else
2635 rtx t, cc_reg;
2636 machine_mode cc_mode;
2638 t = gen_rtx_ZERO_EXTEND (SImode, y);
2639 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2640 cc_mode = CC_SWPmode;
2641 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2642 emit_set_insn (cc_reg, t);
2643 return cc_reg;
2647 if (!aarch64_plus_operand (y, y_mode))
2648 y = force_reg (y_mode, y);
2650 return aarch64_gen_compare_reg (code, x, y);
2653 /* Generate conditional branch to LABEL, comparing X to 0 using CODE.
2654 Return the jump instruction. */
2656 static rtx
2657 aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
2658 rtx_code_label *label)
2660 if (aarch64_track_speculation)
2662 /* Emit an explicit compare instruction, so that we can correctly
2663 track the condition codes. */
2664 rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
2665 x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
2667 else
2668 x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);
2670 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
2671 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
2672 return gen_rtx_SET (pc_rtx, x);
2675 /* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
2676 If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
2677 it branches to LABEL when the bit is clear. */
2679 static rtx
2680 aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
2681 rtx_code_label *label)
2683 auto mode = GET_MODE (x);
2684 if (aarch64_track_speculation)
2686 auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
2687 emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
2688 rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
2689 rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
2690 return gen_condjump (x, cc_reg, label);
2692 return gen_aarch64_tb (code, mode, mode,
2693 x, gen_int_mode (bitnum, mode), label);
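/* For example (illustrative): without -mtrack-speculation this would
   typically emit a single TBZ/TBNZ on bit BITNUM, whereas with
   speculation tracking enabled it emits an explicit TST-style AND that
   sets the flags, followed by a conditional branch.  */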
2696 /* Consider the operation:
2698 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2700 where:
2702 - CODE is [SU]MAX or [SU]MIN
2703 - OPERANDS[2] and OPERANDS[3] are constant integers
2704 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2705 - all operands have mode MODE
2707 Decide whether it is possible to implement the operation using:
2709 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2711 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2713 followed by:
2715 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2717 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2718 If GENERATE_P is true, also update OPERANDS as follows:
2720 OPERANDS[4] = -OPERANDS[3]
2721 OPERANDS[5] = the rtl condition representing <cond>
2722 OPERANDS[6] = <tmp>
2723 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2724 bool
2725 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
2727 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
2728 rtx dst = operands[0];
2729 rtx maxmin_op = operands[2];
2730 rtx add_op = operands[3];
2731 machine_mode mode = GET_MODE (dst);
2733 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2734 == (x >= y ? x : y) - z
2735 == (x > y ? x : y) - z
2736 == (x > y - 1 ? x : y) - z
2738 min (x, y) - z == (x <= y - 1 ? x : y) - z
2739 == (x <= y ? x : y) - z
2740 == (x < y ? x : y) - z
2741 == (x < y + 1 ? x : y) - z
2743 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2744 which x is compared with z. Set DIFF to y - z. Thus the supported
2745 combinations are as follows, with DIFF being the value after the ":":
2747 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2748 == x >= y ? x - y : 0 [z == y]
2749 == x > y ? x - y : 0 [z == y]
2750 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2752 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2753 == x <= y ? x - y : 0 [z == y]
2754 == x < y ? x - y : 0 [z == y]
2755 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2756 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
2757 auto add_val = rtx_mode_t (add_op, mode);
2758 auto sub_val = wi::neg (add_val);
2759 auto diff = wi::sub (maxmin_val, sub_val);
2760 if (!(diff == 0
2761 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
2762 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
2763 return false;
2765 if (!generate_p)
2766 return true;
2768 rtx_code cmp;
2769 switch (code)
2771 case SMAX:
2772 cmp = diff == 1 ? GT : GE;
2773 break;
2774 case UMAX:
2775 cmp = diff == 1 ? GTU : GEU;
2776 break;
2777 case SMIN:
2778 cmp = diff == -1 ? LT : LE;
2779 break;
2780 case UMIN:
2781 cmp = diff == -1 ? LTU : LEU;
2782 break;
2783 default:
2784 gcc_unreachable ();
2786 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
2788 operands[4] = immed_wide_int_const (sub_val, mode);
2789 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
2790 if (can_create_pseudo_p ())
2791 operands[6] = gen_reg_rtx (mode);
2792 else
2793 operands[6] = dst;
2794 operands[7] = immed_wide_int_const (diff, mode);
2796 return true;
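/* A worked example (illustrative): for SMAX (x, 3) - 4 we have
   z == y + 1, so DIFF is -1 and the expected sequence is:

	subs	<tmp>, x, #4
	csinv	<dest>, <tmp>, xzr, ge

   giving x - 4 when x >= 4 and -1 otherwise, matching the table
   above.  */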
2800 /* Build the SYMBOL_REF for __tls_get_addr. */
2802 static GTY(()) rtx tls_get_addr_libfunc;
2805 aarch64_tls_get_addr (void)
2807 if (!tls_get_addr_libfunc)
2808 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2809 return tls_get_addr_libfunc;
2812 /* Return the TLS model to use for ADDR. */
2814 static enum tls_model
2815 tls_symbolic_operand_type (rtx addr)
2817 enum tls_model tls_kind = TLS_MODEL_NONE;
2818 poly_int64 offset;
2819 addr = strip_offset_and_salt (addr, &offset);
2820 if (SYMBOL_REF_P (addr))
2821 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2823 return tls_kind;
2826 /* We allow LO_SUMs in our legitimate addresses so that combine can
2827 take care of combining addresses where necessary, but for generation
2828 purposes, we generate the address as:
2830 RTL Absolute
2831 tmp = hi (symbol_ref); adrp x1, foo
2832 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
2835 PIC TLS
2836 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2837 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2838 bl __tls_get_addr
2841 Load TLS symbol, depending on TLS mechanism and TLS access model.
2843 Global Dynamic - Traditional TLS:
2844 adrp tmp, :tlsgd:imm
2845 add dest, tmp, #:tlsgd_lo12:imm
2846 bl __tls_get_addr
2848 Global Dynamic - TLS Descriptors:
2849 adrp dest, :tlsdesc:imm
2850 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2851 add dest, dest, #:tlsdesc_lo12:imm
2852 blr tmp
2853 mrs tp, tpidr_el0
2854 add dest, dest, tp
2856 Initial Exec:
2857 mrs tp, tpidr_el0
2858 adrp tmp, :gottprel:imm
2859 ldr dest, [tmp, #:gottprel_lo12:imm]
2860 add dest, dest, tp
2862 Local Exec:
2863 mrs tp, tpidr_el0
2864 add t0, tp, #:tprel_hi12:imm, lsl #12
2865 add t0, t0, #:tprel_lo12_nc:imm
2868 static void
2869 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2870 enum aarch64_symbol_type type)
2872 #if TARGET_PECOFF
2873 rtx tmp = legitimize_pe_coff_symbol (imm, true);
2874 if (tmp)
2876 emit_insn (gen_rtx_SET (dest, tmp));
2877 return;
2879 #endif
2881 switch (type)
2883 case SYMBOL_SMALL_ABSOLUTE:
2885 /* In ILP32, the mode of dest can be either SImode or DImode. */
2886 rtx tmp_reg = dest;
2887 machine_mode mode = GET_MODE (dest);
2889 gcc_assert (mode == Pmode || mode == ptr_mode);
2891 if (can_create_pseudo_p ())
2892 tmp_reg = gen_reg_rtx (mode);
2894 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
2895 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2896 return;
2899 case SYMBOL_TINY_ABSOLUTE:
2900 emit_insn (gen_rtx_SET (dest, imm));
2901 return;
2903 case SYMBOL_SMALL_GOT_28K:
2905 machine_mode mode = GET_MODE (dest);
2906 rtx gp_rtx = pic_offset_table_rtx;
2907 rtx insn;
2908 rtx mem;
2910 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2911 here before RTL expansion. Tree IVOPTS will generate an rtl pattern
2912 to decide rtx costs, in which case pic_offset_table_rtx is not
2913 initialized. In that case there is no need to generate the first
2914 adrp instruction, as the final cost for a global variable access
2915 is one instruction. */
2916 if (gp_rtx != NULL)
2918 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since
2919 we use the page base as the GOT base, the first page may be wasted;
2920 in the worst case there is only 28K of space for the GOT).
2922 The generated instruction sequence for accessing a global variable is:
2925 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2927 Only one instruction is needed. But we must initialize
2928 pic_offset_table_rtx properly. We generate an initialization insn
2929 for every global access, and allow CSE to remove all redundant copies.
2931 The final instruction sequence will look like the following
2932 for multiple global variable accesses.
2934 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2936 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2937 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2938 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2939 ... */
2941 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2942 crtl->uses_pic_offset_table = 1;
2943 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2945 if (mode != GET_MODE (gp_rtx))
2946 gp_rtx = gen_lowpart (mode, gp_rtx);
2950 if (mode == ptr_mode)
2952 if (mode == DImode)
2953 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2954 else
2955 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2957 mem = XVECEXP (SET_SRC (insn), 0, 0);
2959 else
2961 gcc_assert (mode == Pmode);
2963 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2964 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2967 /* The operand is expected to be a MEM. Whenever the related insn
2968 pattern changes, the above code which calculates MEM should be
2969 updated. */
2970 gcc_assert (MEM_P (mem));
2971 MEM_READONLY_P (mem) = 1;
2972 MEM_NOTRAP_P (mem) = 1;
2973 emit_insn (insn);
2974 return;
2977 case SYMBOL_SMALL_GOT_4G:
2978 emit_insn (gen_rtx_SET (dest, imm));
2979 return;
2981 case SYMBOL_SMALL_TLSGD:
2983 rtx_insn *insns;
2984 /* The return type of __tls_get_addr is the C pointer type
2985 so use ptr_mode. */
2986 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
2987 rtx tmp_reg = dest;
2989 if (GET_MODE (dest) != ptr_mode)
2990 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
2992 start_sequence ();
2993 if (ptr_mode == SImode)
2994 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2995 else
2996 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2997 insns = get_insns ();
2998 end_sequence ();
3000 RTL_CONST_CALL_P (insns) = 1;
3001 emit_libcall_block (insns, tmp_reg, result, imm);
3002 /* Convert back to the mode of the dest, adding a zero_extend
3003 from SImode (ptr_mode) to DImode (Pmode). */
3004 if (dest != tmp_reg)
3005 convert_move (dest, tmp_reg, true);
3006 return;
3009 case SYMBOL_SMALL_TLSDESC:
3011 machine_mode mode = GET_MODE (dest);
3012 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3013 rtx tp;
3015 gcc_assert (mode == Pmode || mode == ptr_mode);
3017 /* In ILP32, the got entry is always of SImode size. Unlike
3018 small GOT, the dest is fixed at reg 0. */
3019 if (TARGET_ILP32)
3020 emit_insn (gen_tlsdesc_small_si (imm));
3021 else
3022 emit_insn (gen_tlsdesc_small_di (imm));
3023 tp = aarch64_load_tp (NULL);
3025 if (mode != Pmode)
3026 tp = gen_lowpart (mode, tp);
3028 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3029 if (REG_P (dest))
3030 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3031 return;
3034 case SYMBOL_SMALL_TLSIE:
3036 /* In ILP32, the mode of dest can be either SImode or DImode,
3037 while the got entry is always of SImode size. The mode of
3038 dest depends on how dest is used: if dest is assigned to a
3039 pointer (e.g. stored in memory), it has SImode; it may have
3040 DImode if dest is dereferenced to access the memory.
3041 This is why we have to handle three different tlsie_small
3042 patterns here (two patterns for ILP32). */
3043 machine_mode mode = GET_MODE (dest);
3044 rtx tmp_reg = gen_reg_rtx (mode);
3045 rtx tp = aarch64_load_tp (NULL);
3047 if (mode == ptr_mode)
3049 if (mode == DImode)
3050 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3051 else
3053 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3054 tp = gen_lowpart (mode, tp);
3057 else
3059 gcc_assert (mode == Pmode);
3060 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3063 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3064 if (REG_P (dest))
3065 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3066 return;
3069 case SYMBOL_TLSLE12:
3070 case SYMBOL_TLSLE24:
3071 case SYMBOL_TLSLE32:
3072 case SYMBOL_TLSLE48:
3074 machine_mode mode = GET_MODE (dest);
3075 rtx tp = aarch64_load_tp (NULL);
3077 if (mode != Pmode)
3078 tp = gen_lowpart (mode, tp);
3080 switch (type)
3082 case SYMBOL_TLSLE12:
3083 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3084 (dest, tp, imm));
3085 break;
3086 case SYMBOL_TLSLE24:
3087 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3088 (dest, tp, imm));
3089 break;
3090 case SYMBOL_TLSLE32:
3091 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3092 (dest, imm));
3093 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3094 (dest, dest, tp));
3095 break;
3096 case SYMBOL_TLSLE48:
3097 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3098 (dest, imm));
3099 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3100 (dest, dest, tp));
3101 break;
3102 default:
3103 gcc_unreachable ();
3106 if (REG_P (dest))
3107 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3108 return;
3111 case SYMBOL_TINY_GOT:
3113 rtx insn;
3114 machine_mode mode = GET_MODE (dest);
3116 if (mode == ptr_mode)
3117 insn = gen_ldr_got_tiny (mode, dest, imm);
3118 else
3120 gcc_assert (mode == Pmode);
3121 insn = gen_ldr_got_tiny_sidi (dest, imm);
3124 emit_insn (insn);
3125 return;
3128 case SYMBOL_TINY_TLSIE:
3130 machine_mode mode = GET_MODE (dest);
3131 rtx tp = aarch64_load_tp (NULL);
3133 if (mode == ptr_mode)
3135 if (mode == DImode)
3136 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3137 else
3139 tp = gen_lowpart (mode, tp);
3140 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3143 else
3145 gcc_assert (mode == Pmode);
3146 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3149 if (REG_P (dest))
3150 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3151 return;
3154 default:
3155 gcc_unreachable ();
3159 /* Emit a move from SRC to DEST. Assume that the move expanders can
3160 handle all moves if !can_create_pseudo_p (). The distinction is
3161 important because, unlike emit_move_insn, the move expanders know
3162 how to force Pmode objects into the constant pool even when the
3163 constant pool address is not itself legitimate. */
3164 static rtx
3165 aarch64_emit_move (rtx dest, rtx src)
3167 return (can_create_pseudo_p ()
3168 ? emit_move_insn (dest, src)
3169 : emit_move_insn_1 (dest, src));
3172 /* Apply UNOPTAB to OP and store the result in DEST. */
3174 static void
3175 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3177 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3178 if (dest != tmp)
3179 emit_move_insn (dest, tmp);
3182 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3184 static void
3185 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3187 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3188 OPTAB_DIRECT);
3189 if (dest != tmp)
3190 emit_move_insn (dest, tmp);
3193 /* Split a move from SRC to DST into two moves of mode SINGLE_MODE. */
3195 void
3196 aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
3198 machine_mode mode = GET_MODE (dst);
3200 rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
3201 rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
3202 GET_MODE_SIZE (single_mode));
3203 rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
3204 rtx src1 = simplify_gen_subreg (single_mode, src, mode,
3205 GET_MODE_SIZE (single_mode));
3207 /* At most one pairing may overlap. */
3208 if (reg_overlap_mentioned_p (dst0, src1))
3210 aarch64_emit_move (dst1, src1);
3211 aarch64_emit_move (dst0, src0);
3213 else
3215 aarch64_emit_move (dst0, src0);
3216 aarch64_emit_move (dst1, src1);
3220 /* Split a 128-bit move operation into two 64-bit move operations,
3221 taking care to handle partial overlap of register to register
3222 copies. Special cases are needed when moving between GP regs and
3223 FP regs. SRC can be a register, constant or memory; DST a register
3224 or memory. If either operand is memory it must not have any side
3225 effects. */
3226 void
3227 aarch64_split_128bit_move (rtx dst, rtx src)
3229 machine_mode mode = GET_MODE (dst);
3231 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
3232 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3233 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3235 if (REG_P (dst) && REG_P (src))
3237 int src_regno = REGNO (src);
3238 int dst_regno = REGNO (dst);
3240 /* Handle FP <-> GP regs. */
3241 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3243 rtx src_lo = gen_lowpart (word_mode, src);
3244 rtx src_hi = gen_highpart (word_mode, src);
3246 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3247 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3248 return;
3250 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3252 rtx dst_lo = gen_lowpart (word_mode, dst);
3253 rtx dst_hi = gen_highpart (word_mode, dst);
3255 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3256 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3257 return;
3261 aarch64_split_double_move (dst, src, word_mode);
3264 /* Return true if we should split a move from 128-bit value SRC
3265 to 128-bit register DEST. */
3267 bool
3268 aarch64_split_128bit_move_p (rtx dst, rtx src)
3270 if (FP_REGNUM_P (REGNO (dst)))
3271 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3272 /* All moves to GPRs need to be split. */
3273 return true;
3276 /* Split a complex SIMD move. */
3278 void
3279 aarch64_split_simd_move (rtx dst, rtx src)
3281 machine_mode src_mode = GET_MODE (src);
3282 machine_mode dst_mode = GET_MODE (dst);
3284 gcc_assert (VECTOR_MODE_P (dst_mode));
3286 if (REG_P (dst) && REG_P (src))
3288 gcc_assert (VECTOR_MODE_P (src_mode));
3289 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3293 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3294 The semantics are those of svreinterpret rather than those of subregs;
3295 see the comment at the head of aarch64-sve.md for details about the
3296 difference. */
3299 aarch64_sve_reinterpret (machine_mode mode, rtx x)
3301 if (GET_MODE (x) == mode)
3302 return x;
3304 /* can_change_mode_class must only return true if subregs and svreinterprets
3305 have the same semantics. */
3306 if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
3307 return force_lowpart_subreg (mode, x, GET_MODE (x));
3309 rtx res = gen_reg_rtx (mode);
3310 x = force_reg (GET_MODE (x), x);
3311 emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
3312 return res;
3315 bool
3316 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3317 machine_mode ymode, rtx y)
3319 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3320 gcc_assert (r != NULL);
3321 return rtx_equal_p (x, r);
3324 /* Return TARGET if it is nonnull and a register of mode MODE.
3325 Otherwise, return a fresh register of mode MODE if we can,
3326 or TARGET reinterpreted as MODE if we can't. */
3328 static rtx
3329 aarch64_target_reg (rtx target, machine_mode mode)
3331 if (target && REG_P (target) && GET_MODE (target) == mode)
3332 return target;
3333 if (!can_create_pseudo_p ())
3335 gcc_assert (target);
3336 return gen_lowpart (mode, target);
3338 return gen_reg_rtx (mode);
3341 /* Return a register that contains the constant in BUILDER, given that
3342 the constant is a legitimate move operand. Use TARGET as the register
3343 if it is nonnull and convenient. */
3345 static rtx
3346 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3348 rtx src = builder.build ();
3349 target = aarch64_target_reg (target, GET_MODE (src));
3350 emit_insn (gen_rtx_SET (target, src));
3351 return target;
3354 static rtx
3355 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3357 if (can_create_pseudo_p ())
3358 return force_reg (mode, value);
3359 else
3361 gcc_assert (x);
3362 aarch64_emit_move (x, value);
3363 return x;
3367 /* Return true if predicate value X is a constant in which every element
3368 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3369 value, i.e. as a predicate in which all bits are significant. */
3371 static bool
3372 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3374 if (!CONST_VECTOR_P (x))
3375 return false;
3377 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3378 GET_MODE_NUNITS (GET_MODE (x)));
3379 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3380 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3381 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3383 unsigned int nelts = const_vector_encoded_nelts (x);
3384 for (unsigned int i = 0; i < nelts; ++i)
3386 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3387 if (!CONST_INT_P (elt))
3388 return false;
3390 builder.quick_push (elt);
3391 for (unsigned int j = 1; j < factor; ++j)
3392 builder.quick_push (const0_rtx);
3394 builder.finalize ();
3395 return true;
3398 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3399 widest predicate element size it can have (that is, the largest size
3400 for which each element would still be 0 or 1). */
3402 unsigned int
3403 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3405 /* Start with the most optimistic assumption: that we only need
3406 one bit per pattern. This is what we will use if only the first
3407 bit in each pattern is ever set. */
3408 unsigned int mask = GET_MODE_SIZE (DImode);
3409 mask |= builder.npatterns ();
3411 /* Look for set bits. */
3412 unsigned int nelts = builder.encoded_nelts ();
3413 for (unsigned int i = 1; i < nelts; ++i)
3414 if (INTVAL (builder.elt (i)) != 0)
3416 if (i & 1)
3417 return 1;
3418 mask |= i;
3420 return mask & -mask;
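/* A worked example (illustrative): for the constant { 1, 0, 0, 0,
   1, 0, 0, 0, ... } (four patterns of one element each, with only the
   first element set), MASK becomes 8 | 4 and the function above
   returns 4, i.e. the predicate can be viewed as having 4-byte
   elements.  */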
3423 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3424 return that predicate mode, otherwise return opt_machine_mode (). */
3426 opt_machine_mode
3427 aarch64_ptrue_all_mode (rtx x)
3429 gcc_assert (GET_MODE (x) == VNx16BImode);
3430 if (!CONST_VECTOR_P (x)
3431 || !CONST_VECTOR_DUPLICATE_P (x)
3432 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3433 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3434 return opt_machine_mode ();
3436 unsigned int nelts = const_vector_encoded_nelts (x);
3437 for (unsigned int i = 1; i < nelts; ++i)
3438 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3439 return opt_machine_mode ();
3441 return aarch64_sve_pred_mode (nelts);
3444 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3445 that the constant would have with predicate element size ELT_SIZE
3446 (ignoring the upper bits in each element) and return:
3448 * -1 if all bits are set
3449 * N if the predicate has N leading set bits followed by all clear bits
3450 * 0 if the predicate does not have any of these forms. */
3453 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3454 unsigned int elt_size)
3456 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3457 followed by set bits. */
3458 if (builder.nelts_per_pattern () == 3)
3459 return 0;
3461 /* Skip over leading set bits. */
3462 unsigned int nelts = builder.encoded_nelts ();
3463 unsigned int i = 0;
3464 for (; i < nelts; i += elt_size)
3465 if (INTVAL (builder.elt (i)) == 0)
3466 break;
3467 unsigned int vl = i / elt_size;
3469 /* Check for the all-true case. */
3470 if (i == nelts)
3471 return -1;
3473 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3474 repeating pattern of set bits followed by clear bits. */
3475 if (builder.nelts_per_pattern () != 2)
3476 return 0;
3478 /* We have a "foreground" value and a duplicated "background" value.
3479 If the background might repeat and the last set bit belongs to it,
3480 we might have set bits followed by clear bits followed by set bits. */
3481 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3482 return 0;
3484 /* Make sure that the rest are all clear. */
3485 for (; i < nelts; i += elt_size)
3486 if (INTVAL (builder.elt (i)) != 0)
3487 return 0;
3489 return vl;
3492 /* See if there is an svpattern that encodes an SVE predicate of mode
3493 PRED_MODE in which the first VL bits are set and the rest are clear.
3494 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3495 A VL of -1 indicates an all-true vector. */
3497 aarch64_svpattern
3498 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3500 if (vl < 0)
3501 return AARCH64_SV_ALL;
3503 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3504 return AARCH64_NUM_SVPATTERNS;
3506 if (vl >= 1 && vl <= 8)
3507 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3509 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3510 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3512 int max_vl;
3513 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3515 if (vl == (max_vl / 3) * 3)
3516 return AARCH64_SV_MUL3;
3517 /* These would only trigger for non-power-of-2 lengths. */
3518 if (vl == (max_vl & -4))
3519 return AARCH64_SV_MUL4;
3520 if (vl == (1 << floor_log2 (max_vl)))
3521 return AARCH64_SV_POW2;
3522 if (vl == max_vl)
3523 return AARCH64_SV_ALL;
3525 return AARCH64_NUM_SVPATTERNS;
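/* Illustrative examples: VL == 7 gives AARCH64_SV_VL7, VL == 32 gives
   AARCH64_SV_VL32, VL == -1 gives AARCH64_SV_ALL, and a VL larger than
   the number of elements in PRED_MODE gives AARCH64_NUM_SVPATTERNS.  */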
3528 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3529 bits has the lowest bit set and the upper bits clear. This is the
3530 VNx16BImode equivalent of a PTRUE for controlling elements of
3531 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3532 all bits are significant, even the upper zeros. */
3535 aarch64_ptrue_all (unsigned int elt_size)
3537 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3538 builder.quick_push (const1_rtx);
3539 for (unsigned int i = 1; i < elt_size; ++i)
3540 builder.quick_push (const0_rtx);
3541 return builder.build ();
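/* For example (illustrative): aarch64_ptrue_all (4) builds the
   VNx16BImode constant { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the
   all-true predicate for 4-byte elements with the upper bits of each
   element explicitly zero.  */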
3544 /* Return an all-true predicate register of mode MODE. */
3547 aarch64_ptrue_reg (machine_mode mode)
3549 gcc_assert (aarch64_sve_pred_mode_p (mode));
3550 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3551 return gen_lowpart (mode, reg);
3554 /* Return an all-false predicate register of mode MODE. */
3557 aarch64_pfalse_reg (machine_mode mode)
3559 gcc_assert (aarch64_sve_pred_mode_p (mode));
3560 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3561 return gen_lowpart (mode, reg);
3564 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3565 for it. PRED2[0] is the predicate for the instruction whose result
3566 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3567 for it. Return true if we can prove that the two predicates are
3568 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3569 with PRED1[0] without changing behavior. */
3571 bool
3572 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3574 machine_mode mode = GET_MODE (pred1[0]);
3575 gcc_assert (aarch64_sve_pred_mode_p (mode)
3576 && mode == GET_MODE (pred2[0])
3577 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3578 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3580 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3581 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3582 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3583 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3584 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3587 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3588 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3589 Use TARGET as the target register if nonnull and convenient. */
3591 static rtx
3592 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3593 machine_mode data_mode, rtx op1, rtx op2)
3595 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3596 expand_operand ops[5];
3597 create_output_operand (&ops[0], target, pred_mode);
3598 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3599 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3600 create_input_operand (&ops[3], op1, data_mode);
3601 create_input_operand (&ops[4], op2, data_mode);
3602 expand_insn (icode, 5, ops);
3603 return ops[0].value;
3606 /* Use a comparison to convert integer vector SRC into MODE, which is
3607 the corresponding SVE predicate mode. Use TARGET for the result
3608 if it's nonnull and convenient. */
3611 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3613 machine_mode src_mode = GET_MODE (src);
3614 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3615 src, CONST0_RTX (src_mode));
3618 /* Return the assembly token for svprfop value PRFOP. */
3620 static const char *
3621 svprfop_token (enum aarch64_svprfop prfop)
3623 switch (prfop)
3625 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3626 AARCH64_FOR_SVPRFOP (CASE)
3627 #undef CASE
3628 case AARCH64_NUM_SVPRFOPS:
3629 break;
3631 gcc_unreachable ();
3634 /* Return the assembly string for an SVE prefetch operation with
3635 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3636 and that SUFFIX is the format for the remaining operands. */
3638 char *
3639 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3640 const char *suffix)
3642 static char buffer[128];
3643 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3644 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3645 mnemonic, svprfop_token (prfop), suffix);
3646 gcc_assert (written < sizeof (buffer));
3647 return buffer;
3650 /* Check whether we can calculate the number of elements in PATTERN
3651 at compile time, given that there are NELTS_PER_VQ elements per
3652 128-bit block. Return the value if so, otherwise return -1. */
3654 HOST_WIDE_INT
3655 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3657 unsigned int vl, const_vg;
3658 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3659 vl = 1 + (pattern - AARCH64_SV_VL1);
3660 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3661 vl = 16 << (pattern - AARCH64_SV_VL16);
3662 else if (aarch64_sve_vg.is_constant (&const_vg))
3664 /* There are two vector granules per quadword. */
3665 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3666 switch (pattern)
3668 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3669 case AARCH64_SV_MUL4: return nelts & -4;
3670 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3671 case AARCH64_SV_ALL: return nelts;
3672 default: gcc_unreachable ();
3675 else
3676 return -1;
3678 /* There are two vector granules per quadword. */
3679 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3680 if (known_le (vl, nelts_all))
3681 return vl;
3683 /* Requesting more elements than are available results in a PFALSE. */
3684 if (known_gt (vl, nelts_all))
3685 return 0;
3687 return -1;
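/* As an illustration, assuming a fixed 256-bit vector length (two
   quadwords per vector) and NELTS_PER_VQ == 4 (32-bit elements), there
   are 8 elements in total and:

     AARCH64_SV_POW2 -> 8     AARCH64_SV_MUL3 -> 6
     AARCH64_SV_MUL4 -> 8     AARCH64_SV_ALL  -> 8
     AARCH64_SV_VL7  -> 7     AARCH64_SV_VL16 -> 0 (more than available)

   With a variable vector length, the VL* patterns still fold when the
   requested count is known to fit within the minimum vector size.  */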
3690 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3691 by the number of 128-bit quadwords in an SVE vector. */
3693 static bool
3694 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
3696 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3697 return (IN_RANGE (factor, 2, 16 * 16)
3698 && (factor & 1) == 0
3699 && factor <= 16 * (factor & -factor));
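/* For example, a factor of 6 is fine (CNTD, mul #3), as is 96
   (CNTB, mul #6), but 34 is rejected because it would need a
   multiplier of 17, which is out of range.  */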
3702 /* Return true if we can move VALUE into a register using a single
3703 CNT[BHWD] instruction. */
3705 static bool
3706 aarch64_sve_cnt_immediate_p (poly_int64 value)
3708 HOST_WIDE_INT factor = value.coeffs[0];
3709 return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
3712 /* Likewise for rtx X. */
3714 bool
3715 aarch64_sve_cnt_immediate_p (rtx x)
3717 poly_int64 value;
3718 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3721 /* Return the asm string for an instruction with a CNT-like vector size
3722 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3723 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3724 first part of the operands template (the part that comes before the
3725 vector size itself). PATTERN is the pattern to use. FACTOR is the
3726 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3727 in each quadword. If it is zero, we can use any element size. */
3729 static char *
3730 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3731 aarch64_svpattern pattern,
3732 unsigned int factor,
3733 unsigned int nelts_per_vq)
3735 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3737 if (nelts_per_vq == 0)
3738 /* There is some overlap in the ranges of the four CNT instructions.
3739 Here we always use the smallest possible element size, so that the
3740 multiplier is 1 wherever possible. */
3741 nelts_per_vq = factor & -factor;
3742 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3743 gcc_assert (IN_RANGE (shift, 1, 4));
3744 char suffix = "dwhb"[shift - 1];
3746 factor >>= shift;
3747 unsigned int written;
3748 if (pattern == AARCH64_SV_ALL && factor == 1)
3749 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3750 prefix, suffix, operands);
3751 else if (factor == 1)
3752 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3753 prefix, suffix, operands, svpattern_token (pattern));
3754 else
3755 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3756 prefix, suffix, operands, svpattern_token (pattern),
3757 factor);
3758 gcc_assert (written < sizeof (buffer));
3759 return buffer;
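/* For instance, with PREFIX "inc", OPERANDS "%x0", PATTERN
   AARCH64_SV_ALL and NELTS_PER_VQ 0, a FACTOR of 16 is printed as
   "incb\t%x0", while a FACTOR of 32 is printed as
   "incb\t%x0, all, mul #2"; choosing the byte form keeps the
   multiplier as small as possible.  */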
3762 /* Return the asm string for an instruction with a CNT-like vector size
3763 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3764 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3765 first part of the operands template (the part that comes before the
3766 vector size itself). X is the value of the vector size operand,
3767 as a polynomial integer rtx; we need to convert this into an "all"
3768 pattern with a multiplier. */
3770 char *
3771 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3772 rtx x)
3774 poly_int64 value = rtx_to_poly_int64 (x);
3775 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3776 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3777 value.coeffs[1], 0);
3780 /* Return the asm string for an instruction with a CNT-like vector size
3781 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3782 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3783 first part of the operands template (the part that comes before the
3784 vector size itself). CNT_PAT[0..2] are the operands of the
3785 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3787 char *
3788 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3789 const char *operands, rtx *cnt_pat)
3791 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3792 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3793 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3794 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3795 factor, nelts_per_vq);
3798 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3800 bool
3801 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3803 poly_int64 value;
3804 return (poly_int_rtx_p (x, &value)
3805 && (aarch64_sve_cnt_immediate_p (value)
3806 || aarch64_sve_cnt_immediate_p (-value)));
3809 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3810 operand 0. */
3812 char *
3813 aarch64_output_sve_scalar_inc_dec (rtx offset)
3815 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3816 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3817 if (offset_value.coeffs[1] > 0)
3818 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3819 offset_value.coeffs[1], 0);
3820 else
3821 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3822 -offset_value.coeffs[1], 0);
3825 /* Return true if a single RDVL instruction can multiply FACTOR by the
3826 number of 128-bit quadwords in an SVE vector. This is also the
3827 range of ADDVL. */
3829 static bool
3830 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
3832 return (multiple_p (factor, 16)
3833 && IN_RANGE (factor, -32 * 16, 31 * 16));
3836 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
3837 of quadwords in an SVE vector. */
3839 static bool
3840 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
3842 return (multiple_p (factor, 2)
3843 && IN_RANGE (factor, -32 * 2, 31 * 2));
3846 /* Return true if we can move VALUE into a register using a single
3847 RDVL instruction. */
3849 static bool
3850 aarch64_sve_rdvl_immediate_p (poly_int64 value)
3852 HOST_WIDE_INT factor = value.coeffs[0];
3853 return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
3856 /* Likewise for rtx X. */
3858 bool
3859 aarch64_sve_rdvl_immediate_p (rtx x)
3861 poly_int64 value;
3862 return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
3865 /* Return the asm string for moving RDVL immediate OFFSET into register
3866 operand 0. */
3868 char *
3869 aarch64_output_sve_rdvl (rtx offset)
3871 static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
3872 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3873 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
3875 int factor = offset_value.coeffs[1];
3876 snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
3877 return buffer;
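/* For example, an OFFSET equal to twice the vector length in bytes
   (a factor of 32) is printed as "rdvl\t%x0, #2".  */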
3880 /* Return true if we can add VALUE to a register using a single ADDVL
3881 or ADDPL instruction. */
3883 static bool
3884 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3886 HOST_WIDE_INT factor = value.coeffs[0];
3887 if (factor == 0 || value.coeffs[1] != factor)
3888 return false;
3889 return (aarch64_sve_rdvl_addvl_factor_p (factor)
3890 || aarch64_sve_addpl_factor_p (factor));
3893 /* Likewise for rtx X. */
3895 bool
3896 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3898 poly_int64 value;
3899 return (poly_int_rtx_p (x, &value)
3900 && aarch64_sve_addvl_addpl_immediate_p (value));
3903 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3904 to operand 1 and storing the result in operand 0. */
3906 char *
3907 aarch64_output_sve_addvl_addpl (rtx offset)
3909 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3910 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3911 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3913 int factor = offset_value.coeffs[1];
3914 if ((factor & 15) == 0)
3915 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3916 else
3917 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3918 return buffer;
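/* For example, a factor of 32 (two vector lengths in bytes) is printed
   as "addvl\t%x0, %x1, #2", while a factor of 6 (three predicate
   lengths) is printed as "addpl\t%x0, %x1, #3".  */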
3921 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3922 instruction. If it is, store the number of elements in each vector
3923 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3924 factor in *FACTOR_OUT (if nonnull). */
3926 bool
3927 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3928 unsigned int *nelts_per_vq_out)
3930 rtx elt;
3931 poly_int64 value;
3933 if (!const_vec_duplicate_p (x, &elt)
3934 || !poly_int_rtx_p (elt, &value))
3935 return false;
3937 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3938 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3939 /* There's no vector INCB. */
3940 return false;
3942 HOST_WIDE_INT factor = value.coeffs[0];
3943 if (value.coeffs[1] != factor)
3944 return false;
3946 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3947 if ((factor % nelts_per_vq) != 0
3948 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3949 return false;
3951 if (factor_out)
3952 *factor_out = factor;
3953 if (nelts_per_vq_out)
3954 *nelts_per_vq_out = nelts_per_vq;
3955 return true;
3958 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3959 instruction. */
3961 bool
3962 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3964 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3967 /* Return the asm template for an SVE vector INC or DEC instruction.
3968 OPERANDS gives the operands before the vector count and X is the
3969 value of the vector count operand itself. */
3971 char *
3972 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3974 int factor;
3975 unsigned int nelts_per_vq;
3976 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3977 gcc_unreachable ();
3978 if (factor < 0)
3979 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3980 -factor, nelts_per_vq);
3981 else
3982 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3983 factor, nelts_per_vq);
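/* For example, a VNx4SI constant in which each element equals eight
   times the number of quadwords has FACTOR 8 and NELTS_PER_VQ 4, and
   is printed as "incw\t<operands>, all, mul #2".  */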
3986 /* Return a constant that represents FACTOR multiplied by the
3987 number of 128-bit quadwords in an SME vector. ISA_MODE is the
3988 ISA mode in which the calculation is being performed. */
3991 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
3992 aarch64_feature_flags isa_mode)
3994 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
3995 if (isa_mode & AARCH64_FL_SM_ON)
3996 /* We're in streaming mode, so we can use normal poly-int values. */
3997 return gen_int_mode ({ factor, factor }, mode);
3999 rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
4000 rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
4001 return gen_rtx_CONST (mode, unspec);
4004 /* Return true if X is a constant that represents some number Y
4005 multiplied by the number of quadwords in an SME vector. Store this Y
4006 in *FACTOR if so. */
4008 static bool
4009 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
4011 if (!TARGET_SME || GET_CODE (x) != CONST)
4012 return false;
4014 x = XEXP (x, 0);
4015 if (GET_CODE (x) != UNSPEC
4016 || XINT (x, 1) != UNSPEC_SME_VQ
4017 || XVECLEN (x, 0) != 1)
4018 return false;
4020 x = XVECEXP (x, 0, 0);
4021 if (!CONST_INT_P (x))
4022 return false;
4024 *factor = INTVAL (x);
4025 return true;
4028 /* Return true if X is a constant that represents some number Y
4029 multiplied by the number of quadwords in an SME vector, and if
4030 that Y is in the range of RDSVL. */
4032 bool
4033 aarch64_rdsvl_immediate_p (const_rtx x)
4035 HOST_WIDE_INT factor;
4036 return (aarch64_sme_vq_unspec_p (x, &factor)
4037 && aarch64_sve_rdvl_addvl_factor_p (factor));
4040 /* Return the asm string for an RDSVL instruction that calculates X,
4041 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
4043 char *
4044 aarch64_output_rdsvl (const_rtx x)
4046 gcc_assert (aarch64_rdsvl_immediate_p (x));
4047 static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4048 x = XVECEXP (XEXP (x, 0), 0, 0);
4049 snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
4050 (int) INTVAL (x) / 16);
4051 return buffer;
4054 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
4056 bool
4057 aarch64_addsvl_addspl_immediate_p (const_rtx x)
4059 HOST_WIDE_INT factor;
4060 return (aarch64_sme_vq_unspec_p (x, &factor)
4061 && (aarch64_sve_rdvl_addvl_factor_p (factor)
4062 || aarch64_sve_addpl_factor_p (factor)));
4065 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4066 Return the asm string for the associated instruction. */
4068 char *
4069 aarch64_output_addsvl_addspl (rtx x)
4071 static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4072 HOST_WIDE_INT factor;
4073 if (!aarch64_sme_vq_unspec_p (x, &factor))
4074 gcc_unreachable ();
4075 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4076 snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4077 (int) factor / 16);
4078 else if (aarch64_sve_addpl_factor_p (factor))
4079 snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4080 (int) factor / 2);
4081 else
4082 gcc_unreachable ();
4083 return buffer;
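/* For example, a factor of 32 is printed as "addsvl\t%x0, %x1, #2" and
   a factor of 6 as "addspl\t%x0, %x1, #3", mirroring ADDVL and ADDPL
   above but measured against the streaming vector length.  */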
4086 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4088 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4090 0x0000000100000001ull,
4091 0x0001000100010001ull,
4092 0x0101010101010101ull,
4093 0x1111111111111111ull,
4094 0x5555555555555555ull,
4099 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4100 static bool
4101 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4103 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4104 int bits;
4106 /* Check for a single sequence of one bits and return quickly if so.
4107 The special cases of all ones and all zeroes return false. */
4108 tmp = val + (val & -val);
4110 if (tmp == (tmp & -tmp))
4111 return (val + 1) > 1;
4113 /* Invert if the immediate doesn't start with a zero bit - this means we
4114 only need to search for sequences of one bits. */
4115 if (val & 1)
4116 val = ~val;
4118 /* Find the first set bit and set tmp to val with the first sequence of one
4119 bits removed. Return success if there is a single sequence of ones. */
4120 first_one = val & -val;
4121 tmp = val & (val + first_one);
4123 if (tmp == 0)
4124 return true;
4126 /* Find the next set bit and compute the difference in bit position. */
4127 next_one = tmp & -tmp;
4128 bits = clz_hwi (first_one) - clz_hwi (next_one);
4129 mask = val ^ tmp;
4131 /* Check the bit position difference is a power of 2, and that the first
4132 sequence of one bits fits within 'bits' bits. */
4133 if ((mask >> bits) != 0 || bits != (bits & -bits))
4134 return false;
4136 /* Check the sequence of one bits is repeated 64/bits times. */
4137 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
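/* For example, 0x5555555555555555 (a single set bit repeated every
   2 bits) and 0x00ff00ff00ff00ff (an 8-bit run repeated every 16 bits)
   are valid bitmask immediates, whereas 0, ~0 and 0x0000000000012345
   (several separate runs of ones) are not.  */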
4141 /* Return true if VAL is a valid bitmask immediate for MODE. */
4142 bool
4143 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4145 if (mode == DImode)
4146 return aarch64_bitmask_imm (val);
4148 if (mode == SImode)
4149 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4151 /* Replicate small immediates to fit 64 bits. */
4152 int size = GET_MODE_UNIT_PRECISION (mode);
4153 val &= (HOST_WIDE_INT_1U << size) - 1;
4154 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4156 return aarch64_bitmask_imm (val);
4160 /* Return true if the immediate VAL can be a bitmask immediate
4161 by changing the given MASK bits in VAL to zeroes, ones or bits
4162 from the other half of VAL. Return the new immediate in VAL2. */
4163 static inline bool
4164 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4165 unsigned HOST_WIDE_INT &val2,
4166 unsigned HOST_WIDE_INT mask)
4168 val2 = val & ~mask;
4169 if (val2 != val && aarch64_bitmask_imm (val2))
4170 return true;
4171 val2 = val | mask;
4172 if (val2 != val && aarch64_bitmask_imm (val2))
4173 return true;
4174 val = val & ~mask;
4175 val2 = val | (((val >> 32) | (val << 32)) & mask);
4176 if (val2 != val && aarch64_bitmask_imm (val2))
4177 return true;
4178 val2 = val | (((val >> 16) | (val << 48)) & mask);
4179 if (val2 != val && aarch64_bitmask_imm (val2))
4180 return true;
4181 return false;
4185 /* Return true if VAL is a valid MOVZ immediate. */
4186 static inline bool
4187 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4189 return (val >> (ctz_hwi (val) & 48)) < 65536;
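/* The "& 48" masks the trailing-zero count down to one of 0, 16, 32 or
   48, so the test succeeds exactly when the nonzero bits of VAL fit in
   a single 16-bit field at a 16-bit-aligned position, e.g. 0x12340000.  */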
4193 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4194 bool
4195 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4197 return aarch64_is_movz (val) || aarch64_is_movz (~val)
4198 || aarch64_bitmask_imm (val);
4202 /* Return true if VAL is an immediate that can be created by a single
4203 MOV instruction. */
4204 bool
4205 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4207 gcc_assert (mode == SImode || mode == DImode);
4209 if (val < 65536)
4210 return true;
4212 unsigned HOST_WIDE_INT mask =
4213 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4215 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4216 return true;
4218 val = (val & mask) | ((val << 32) & ~mask);
4219 return aarch64_bitmask_imm (val);
4223 static int
4224 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4225 machine_mode mode)
4227 int i;
4228 unsigned HOST_WIDE_INT val, val2, val3, mask;
4229 int one_match, zero_match;
4230 int num_insns;
4232 gcc_assert (mode == SImode || mode == DImode);
4234 val = INTVAL (imm);
4236 if (aarch64_move_imm (val, mode))
4238 if (generate)
4239 emit_insn (gen_rtx_SET (dest, imm));
4240 return 1;
4243 if ((val >> 32) == 0 || mode == SImode)
4245 if (generate)
4247 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4248 if (mode == SImode)
4249 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4250 GEN_INT ((val >> 16) & 0xffff)));
4251 else
4252 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4253 GEN_INT ((val >> 16) & 0xffff)));
4255 return 2;
4258 /* Remaining cases are all for DImode. */
4260 mask = 0xffff;
4261 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4262 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4263 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4264 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4266 /* Try a bitmask immediate and a movk to generate the immediate
4267 in 2 instructions. */
4269 if (zero_match < 2 && one_match < 2)
4271 for (i = 0; i < 64; i += 16)
4273 if (aarch64_check_bitmask (val, val2, mask << i))
4274 break;
4276 val2 = val & ~(mask << i);
4277 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4278 break;
4281 if (i != 64)
4283 if (generate)
4285 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4286 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4287 GEN_INT ((val >> i) & 0xffff)));
4289 return 2;
4292 /* Try 2 bitmask immediates which are xor'd together. */
4293 for (i = 0; i < 64; i += 16)
4295 val2 = (val >> i) & mask;
4296 val2 |= val2 << 16;
4297 val2 |= val2 << 32;
4298 if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4299 break;
4302 if (i != 64)
4304 if (generate)
4306 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4307 emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4309 return 2;
4313 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4314 if (zero_match + one_match == 0)
4316 for (i = 0; i < 48; i += 16)
4317 for (int j = i + 16; j < 64; j += 16)
4318 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4320 if (generate)
4322 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4323 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4324 GEN_INT ((val >> i) & 0xffff)));
4325 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4326 GEN_INT ((val >> j) & 0xffff)));
4328 return 3;
4331 /* Try shifting and inserting the bottom 32 bits into the top bits. */
4332 val2 = val & 0xffffffff;
4333 val3 = 0xffffffff;
4334 val3 = val2 | (val3 << 32);
4335 for (i = 17; i < 48; i++)
4336 if ((val2 | (val2 << i)) == val)
4338 if (generate)
4340 emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4341 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4342 GEN_INT (val2 >> 16)));
4343 emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4345 return 3;
4347 else if ((val3 & ~(val3 << i)) == val)
4349 if (generate)
4351 emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4352 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4353 GEN_INT (val2 >> 16)));
4354 emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4355 dest));
4357 return 3;
4361 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4362 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4363 otherwise skip zero bits. */
4365 num_insns = 1;
4366 mask = 0xffff;
4367 val2 = one_match > zero_match ? ~val : val;
4368 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4370 if (generate)
4371 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4372 ? (val | ~(mask << i))
4373 : (val & (mask << i)))));
4374 for (i += 16; i < 64; i += 16)
4376 if ((val2 & (mask << i)) == 0)
4377 continue;
4378 if (generate)
4379 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4380 GEN_INT ((val >> i) & 0xffff)));
4381 num_insns ++;
4384 return num_insns;
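/* A few illustrative cases: 0x1234 needs a single MOVZ and
   0xffffffffffff1234 a single MOVN; 0x0000ffff0000ffff is a bitmask
   immediate and so also takes one instruction; 0x12345678 needs a MOVZ
   plus a MOVK, so the function returns 2 for it.  */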
4387 /* Return whether IMM is a 128-bit immediate that is simple enough to
4388 expand inline. */
4389 bool
4390 aarch64_mov128_immediate (rtx imm)
4392 if (CONST_INT_P (imm))
4393 return true;
4395 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4397 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4398 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4400 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4401 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4405 /* Return true if VAL can be encoded as a 12-bit unsigned immediate with
4406 a left shift of 0 or 12 bits. */
4407 bool
4408 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4410 return val < 4096 || (val & 0xfff000) == val;
4413 /* Return the largest value no greater than VAL that can be encoded as a
4414 12-bit unsigned immediate with a left shift of 0 or 12. */
4415 static HOST_WIDE_INT
4416 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4418 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4419 handle correctly. */
4420 gcc_assert (val < 0x1000000);
4422 if (val < 4096)
4423 return val;
4425 return val & 0xfff000;
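/* For example, 4095 is returned unchanged, while 0x123456 is clamped
   down to 0x123000 (0x123 shifted left by 12).  */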
4429 /* Test whether:
4431 X = (X & AND_VAL) | IOR_VAL;
4433 can be implemented using:
4435 MOVK X, #(IOR_VAL >> shift), LSL #shift
4437 Return the shift if so, otherwise return -1. */
4439 aarch64_movk_shift (const wide_int_ref &and_val,
4440 const wide_int_ref &ior_val)
4442 unsigned int precision = and_val.get_precision ();
4443 unsigned HOST_WIDE_INT mask = 0xffff;
4444 for (unsigned int shift = 0; shift < precision; shift += 16)
4446 if (and_val == ~mask && (ior_val & mask) == ior_val)
4447 return shift;
4448 mask <<= 16;
4450 return -1;
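/* For example, AND_VAL 0xffffffff0000ffff with IOR_VAL 0x12340000
   returns 16, since the combination is equivalent to
   MOVK X, #0x1234, LSL #16.  */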
4453 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
4454 Assumed precondition: VAL_IN is not zero. */
4456 unsigned HOST_WIDE_INT
4457 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4459 int lowest_bit_set = ctz_hwi (val_in);
4460 int highest_bit_set = floor_log2 (val_in);
4461 gcc_assert (val_in != 0);
4463 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4464 (HOST_WIDE_INT_1U << lowest_bit_set));
4467 /* Create a constant in which the bits outside the range from the lowest to the
4468 highest bit set in VAL_IN are 1, and the bits inside that range come from VAL_IN. */
4470 unsigned HOST_WIDE_INT
4471 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4473 return val_in | ~aarch64_and_split_imm1 (val_in);
4476 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4478 bool
4479 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4481 scalar_int_mode int_mode;
4482 if (!is_a <scalar_int_mode> (mode, &int_mode))
4483 return false;
4485 if (aarch64_bitmask_imm (val_in, int_mode))
4486 return false;
4488 if (aarch64_move_imm (val_in, int_mode))
4489 return false;
4491 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4493 return aarch64_bitmask_imm (imm2, int_mode);
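/* For example, 0x0000ff00ffff0000 is neither a bitmask immediate nor a
   MOV immediate, but aarch64_and_split_imm1 gives 0x0000ffffffff0000
   and aarch64_and_split_imm2 gives 0xffffff00ffffffff, both of which
   are valid bitmask immediates, so an AND with the original value can
   be split into two AND-immediate instructions.  */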
4496 /* Return the number of temporary registers that aarch64_add_offset_1
4497 would need to add OFFSET to a register. */
4499 static unsigned int
4500 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4502 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4505 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4506 a non-polynomial OFFSET. MODE is the mode of the addition.
4507 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4508 be set and CFA adjustments added to the generated instructions.
4510 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4511 temporary if register allocation is already complete. This temporary
4512 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4513 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4514 the immediate again.
4516 Since this function may be used to adjust the stack pointer, we must
4517 ensure that it cannot cause transient stack deallocation (for example
4518 by first incrementing SP and then decrementing when adjusting by a
4519 large immediate). */
4521 static void
4522 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4523 rtx src, HOST_WIDE_INT offset, rtx temp1,
4524 bool frame_related_p, bool emit_move_imm)
4526 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4527 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4529 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4530 rtx_insn *insn;
4532 if (!moffset)
4534 if (!rtx_equal_p (dest, src))
4536 insn = emit_insn (gen_rtx_SET (dest, src));
4537 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4539 return;
4542 /* Single instruction adjustment. */
4543 if (aarch64_uimm12_shift (moffset))
4545 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4546 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4547 return;
4550 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4551 and either:
4553 a) the offset cannot be loaded by a 16-bit move or
4554 b) there is no spare register into which we can move it. */
4555 if (moffset < 0x1000000
4556 && ((!temp1 && !can_create_pseudo_p ())
4557 || !aarch64_move_imm (moffset, mode)))
4559 HOST_WIDE_INT low_off = moffset & 0xfff;
4561 low_off = offset < 0 ? -low_off : low_off;
4562 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4563 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4564 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4565 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4566 return;
4569 /* Emit a move immediate if required and an addition/subtraction. */
4570 if (emit_move_imm)
4572 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4573 temp1 = aarch64_force_temporary (mode, temp1,
4574 gen_int_mode (moffset, mode));
4576 insn = emit_insn (offset < 0
4577 ? gen_sub3_insn (dest, src, temp1)
4578 : gen_add3_insn (dest, src, temp1));
4579 if (frame_related_p)
4581 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4582 rtx adj = plus_constant (mode, src, offset);
4583 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4587 /* Return the number of temporary registers that aarch64_add_offset
4588 would need to move OFFSET into a register or add OFFSET to a register;
4589 ADD_P is true if we want the latter rather than the former. */
4591 static unsigned int
4592 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4594 /* This follows the same structure as aarch64_add_offset. */
4595 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4596 return 0;
4598 unsigned int count = 0;
4599 HOST_WIDE_INT factor = offset.coeffs[1];
4600 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4601 poly_int64 poly_offset (factor, factor);
4602 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4603 /* Need one register for the ADDVL/ADDPL result. */
4604 count += 1;
4605 else if (factor != 0)
4607 factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4608 if (!IN_RANGE (factor, -32, 31))
4609 /* Need one register for the CNT or RDVL result and one for the
4610 multiplication factor. If necessary, the second temporary
4611 can be reused for the constant part of the offset. */
4612 return 2;
4613 /* Need one register for the CNT or RDVL result (which might then
4614 be shifted). */
4615 count += 1;
4617 return count + aarch64_add_offset_1_temporaries (constant);
4620 /* If X can be represented as a poly_int64, return the number
4621 of temporaries that are required to add it to a register.
4622 Return -1 otherwise. */
4625 aarch64_add_offset_temporaries (rtx x)
4627 poly_int64 offset;
4628 if (!poly_int_rtx_p (x, &offset))
4629 return -1;
4630 return aarch64_offset_temporaries (true, offset);
4633 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4634 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4635 be set and CFA adjustments added to the generated instructions.
4637 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4638 temporary if register allocation is already complete. This temporary
4639 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4640 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4641 false to avoid emitting the immediate again.
4643 TEMP2, if nonnull, is a second temporary register that doesn't
4644 overlap either DEST or SRC.
4646 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
4647 is measured relative to the SME vector length instead of the current
4648 prevailing vector length. It is 0 otherwise.
4650 Since this function may be used to adjust the stack pointer, we must
4651 ensure that it cannot cause transient stack deallocation (for example
4652 by first incrementing SP and then decrementing when adjusting by a
4653 large immediate). */
4655 static void
4656 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4657 poly_int64 offset, rtx temp1, rtx temp2,
4658 aarch64_feature_flags force_isa_mode,
4659 bool frame_related_p, bool emit_move_imm = true)
4661 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4662 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4663 gcc_assert (temp1 == NULL_RTX
4664 || !frame_related_p
4665 || !reg_overlap_mentioned_p (temp1, dest));
4666 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4668 /* Try using ADDVL or ADDPL to add the whole value. */
4669 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4671 gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
4672 rtx offset_rtx;
4673 if (force_isa_mode == 0)
4674 offset_rtx = gen_int_mode (offset, mode);
4675 else
4676 offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
4677 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4678 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4679 if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
4680 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4681 gen_rtx_SET (dest, plus_constant (Pmode, src,
4682 offset)));
4683 return;
4686 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4687 SVE vector register, over and above the minimum size of 128 bits.
4688 This is equivalent to half the value returned by CNTD with a
4689 vector shape of ALL. */
4690 HOST_WIDE_INT factor = offset.coeffs[1];
4691 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4693 /* Try using ADDVL or ADDPL to add the VG-based part. */
4694 poly_int64 poly_offset (factor, factor);
4695 if (src != const0_rtx
4696 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4698 rtx offset_rtx;
4699 if (force_isa_mode == 0)
4700 offset_rtx = gen_int_mode (poly_offset, mode);
4701 else
4702 offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
4703 if (frame_related_p)
4705 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4706 RTX_FRAME_RELATED_P (insn) = true;
4707 if (force_isa_mode & AARCH64_FL_SM_ON)
4708 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4709 gen_rtx_SET (dest, plus_constant (Pmode, src,
4710 poly_offset)));
4711 src = dest;
4713 else
4715 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4716 src = aarch64_force_temporary (mode, temp1, addr);
4717 temp1 = temp2;
4718 temp2 = NULL_RTX;
4721 /* Otherwise use a CNT-based sequence. */
4722 else if (factor != 0)
4724 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4725 with negative shifts indicating a shift right. */
4726 HOST_WIDE_INT low_bit = least_bit_hwi (factor);
4727 HOST_WIDE_INT rel_factor = factor / low_bit;
4728 int shift = exact_log2 (low_bit) - 4;
4729 gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
4731 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4732 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4734 We can avoid a multiplication if REL_FACTOR is in the range
4735 of RDVL, although there are then various optimizations that
4736 we can try on top. */
4737 rtx_code code = PLUS;
4738 rtx val;
4739 if (IN_RANGE (rel_factor, -32, 31))
4741 if (force_isa_mode & AARCH64_FL_SM_ON)
4743 /* Try to use an unshifted RDSVL, otherwise fall back on
4744 a shifted RDSVL #1. */
4745 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4746 shift = 0;
4747 else
4748 factor = rel_factor * 16;
4749 val = aarch64_sme_vq_immediate (mode, factor, 0);
4751 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4752 else if (aarch64_sve_cnt_factor_p (factor)
4753 || aarch64_sve_rdvl_addvl_factor_p (factor))
4755 val = gen_int_mode (poly_int64 (factor, factor), mode);
4756 shift = 0;
4758 /* Try to subtract an unshifted CNT[BHWD]. */
4759 else if (aarch64_sve_cnt_factor_p (-factor))
4761 code = MINUS;
4762 val = gen_int_mode (poly_int64 (-factor, -factor), mode);
4763 shift = 0;
4765 /* If subtraction is free, prefer to load a positive constant.
4766 In the best case this will fit a shifted CNTB. */
4767 else if (src != const0_rtx && rel_factor < 0)
4769 code = MINUS;
4770 val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
4772 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4773 else
4774 val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
4776 else
4778 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4779 since it should increase the chances of being able to use
4780 a shift and add sequence for the multiplication.
4781 If CNTB << SHIFT is out of range, stick with the current
4782 shift factor. */
4783 if (force_isa_mode == 0
4784 && IN_RANGE (low_bit, 2, 16 * 16))
4786 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
4787 shift = 0;
4789 else if ((force_isa_mode & AARCH64_FL_SM_ON)
4790 && aarch64_sve_rdvl_addvl_factor_p (low_bit))
4792 val = aarch64_sme_vq_immediate (mode, low_bit, 0);
4793 shift = 0;
4795 else
4796 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
4798 val = aarch64_force_temporary (mode, temp1, val);
4800 /* Prefer to multiply by a positive factor and subtract rather
4801 than multiply by a negative factor and add, since positive
4802 values are usually easier to move. */
4803 if (rel_factor < 0 && src != const0_rtx)
4805 rel_factor = -rel_factor;
4806 code = MINUS;
4809 if (can_create_pseudo_p ())
4811 rtx coeff1 = gen_int_mode (rel_factor, mode);
4812 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
4814 else
4816 rtx coeff1 = gen_int_mode (rel_factor, mode);
4817 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4818 val = gen_rtx_MULT (mode, val, coeff1);
4822 /* Multiply by 2 ** SHIFT. */
4823 if (shift > 0)
4825 val = aarch64_force_temporary (mode, temp1, val);
4826 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4828 else if (shift < 0)
4830 val = aarch64_force_temporary (mode, temp1, val);
4831 val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
4834 /* Add the result to SRC or subtract the result from SRC. */
4835 if (src != const0_rtx)
4837 val = aarch64_force_temporary (mode, temp1, val);
4838 val = gen_rtx_fmt_ee (code, mode, src, val);
4840 else if (code == MINUS)
4842 val = aarch64_force_temporary (mode, temp1, val);
4843 val = gen_rtx_NEG (mode, val);
4846 if (constant == 0 || frame_related_p)
4848 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4849 if (frame_related_p)
4851 RTX_FRAME_RELATED_P (insn) = true;
4852 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4853 gen_rtx_SET (dest, plus_constant (Pmode, src,
4854 poly_offset)));
4856 src = dest;
4857 if (constant == 0)
4858 return;
4860 else
4862 src = aarch64_force_temporary (mode, temp1, val);
4863 temp1 = temp2;
4864 temp2 = NULL_RTX;
4867 emit_move_imm = true;
4870 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4871 frame_related_p, emit_move_imm);
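/* For example, an OFFSET of two vector lengths plus 16 bytes
   (poly_int64 (48, 32)) adds the VG-based part first, typically as
   "addvl\tx0, x1, #2", and then the remaining constant 16 with a
   single "add\tx0, x0, #16".  */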
4874 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4875 than a poly_int64. */
4877 void
4878 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4879 rtx offset_rtx, rtx temp1, rtx temp2)
4881 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4882 temp1, temp2, 0, false);
4885 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4886 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4887 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4888 contains abs (DELTA). */
4890 static inline void
4891 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
4892 aarch64_feature_flags force_isa_mode, bool emit_move_imm)
4894 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4895 temp1, temp2, force_isa_mode, true, emit_move_imm);
4898 /* Subtract DELTA from the stack pointer, marking the instructions
4899 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4900 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4902 static inline void
4903 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
4904 aarch64_feature_flags force_isa_mode,
4905 bool frame_related_p, bool emit_move_imm = true)
4907 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4908 temp1, temp2, force_isa_mode, frame_related_p,
4909 emit_move_imm);
4912 /* A streaming-compatible function needs to switch temporarily to the known
4913 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4914 the runtime state of PSTATE.SM in the streaming-compatible code, before
4915 the start of the switch to LOCAL_MODE.
4917 Emit instructions to branch around the mode switch if PSTATE.SM already
4918 matches LOCAL_MODE. Return the label that the branch jumps to. */
4920 static rtx_insn *
4921 aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
4923 local_mode &= AARCH64_FL_SM_STATE;
4924 gcc_assert (local_mode != 0);
4925 auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
4926 auto *label = gen_label_rtx ();
4927 auto branch = aarch64_gen_test_and_branch (already_ok_cond, old_svcr, 0,
4928 label);
4929 auto *jump = emit_jump_insn (branch);
4930 JUMP_LABEL (jump) = label;
4931 return label;
4934 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
4935 state in NEW_MODE. This is known to involve either an SMSTART SM or
4936 an SMSTOP SM. */
4938 static void
4939 aarch64_switch_pstate_sm (aarch64_feature_flags old_mode,
4940 aarch64_feature_flags new_mode)
4942 old_mode &= AARCH64_FL_SM_STATE;
4943 new_mode &= AARCH64_FL_SM_STATE;
4944 gcc_assert (old_mode != new_mode);
4946 if ((new_mode & AARCH64_FL_SM_ON)
4947 || (new_mode == 0 && (old_mode & AARCH64_FL_SM_OFF)))
4948 emit_insn (gen_aarch64_smstart_sm ());
4949 else
4950 emit_insn (gen_aarch64_smstop_sm ());
4953 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
4954 FP and predicate registers. This class emits code to preserve any
4955 necessary registers around the mode switch.
4957 The class uses four approaches to saving and restoring contents, enumerated
4958 by group_type:
4960 - GPR: save and restore the contents of FP registers using GPRs.
4961 This is used if the FP register contains no more than 64 significant
4962 bits. The registers used are FIRST_GPR onwards.
4964 - MEM_128: save and restore 128-bit SIMD registers using memory.
4966 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
4968 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
4970 The save slots within each memory group are consecutive, with the
4971 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
4973 There will only be two mode switches for each use of SME, so they should
4974 not be particularly performance-sensitive. It's also rare for SIMD, SVE
4975 or predicate registers to be live across mode switches. We therefore
4976 don't preallocate the save slots but instead allocate them locally on
4977 demand. This makes the code emitted by the class self-contained. */
4979 class aarch64_sme_mode_switch_regs
4981 public:
4982 static const unsigned int FIRST_GPR = R10_REGNUM;
4984 void add_reg (machine_mode, unsigned int);
4985 void add_call_args (rtx_call_insn *);
4986 void add_call_result (rtx_call_insn *);
4987 void add_call_preserved_reg (unsigned int);
4988 void add_call_preserved_regs (bitmap);
4990 void emit_prologue ();
4991 void emit_epilogue ();
4993 /* The number of GPRs needed to save FP registers, starting from
4994 FIRST_GPR. */
4995 unsigned int num_gprs () { return m_group_count[GPR]; }
4997 private:
4998 enum sequence { PROLOGUE, EPILOGUE };
4999 enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
5001 /* Information about the save location for one FP, SIMD, SVE data, or
5002 SVE predicate register. */
5003 struct save_location {
5004 /* The register to be saved. */
5005 rtx reg;
5007 /* Which group the save location belongs to. */
5008 group_type group;
5010 /* A zero-based index of the register within the group. */
5011 unsigned int index;
5014 unsigned int sve_data_headroom ();
5015 rtx get_slot_mem (machine_mode, poly_int64);
5016 void emit_stack_adjust (sequence, poly_int64);
5017 void emit_mem_move (sequence, const save_location &, poly_int64);
5019 void emit_gpr_moves (sequence);
5020 void emit_mem_128_moves (sequence);
5021 void emit_sve_sp_adjust (sequence);
5022 void emit_sve_pred_moves (sequence);
5023 void emit_sve_data_moves (sequence);
5025 /* All save locations, in no particular order. */
5026 auto_vec<save_location, 12> m_save_locations;
5028 /* The number of registers in each group. */
5029 unsigned int m_group_count[NUM_GROUPS] = {};
5032 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5033 switch. */
5035 void
5036 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
5038 if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
5039 return;
5041 unsigned int end_regno = end_hard_regno (mode, regno);
5042 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5043 gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
5044 for (; regno < end_regno; regno++)
5046 /* Force the mode of SVE saves and restores even for single registers.
5047 This is necessary because big-endian targets only allow LDR Z and
5048 STR Z to be used with byte modes. */
5049 machine_mode submode = mode;
5050 if (vec_flags & VEC_SVE_PRED)
5051 submode = VNx16BImode;
5052 else if (vec_flags & VEC_SVE_DATA)
5053 submode = SVE_BYTE_MODE;
5054 else if (vec_flags & VEC_STRUCT)
5056 if (vec_flags & VEC_PARTIAL)
5057 submode = V8QImode;
5058 else
5059 submode = V16QImode;
5061 save_location loc;
5062 loc.reg = gen_rtx_REG (submode, regno);
5063 if (vec_flags & VEC_SVE_PRED)
5065 gcc_assert (PR_REGNUM_P (regno));
5066 loc.group = MEM_SVE_PRED;
5068 else
5070 gcc_assert (FP_REGNUM_P (regno));
5071 if (known_le (GET_MODE_SIZE (submode), 8))
5072 loc.group = GPR;
5073 else if (known_eq (GET_MODE_SIZE (submode), 16))
5074 loc.group = MEM_128;
5075 else
5076 loc.group = MEM_SVE_DATA;
5078 loc.index = m_group_count[loc.group]++;
5079 m_save_locations.quick_push (loc);
5083 /* Record that the arguments to CALL_INSN need to be preserved around
5084 the mode switch. */
5086 void
5087 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
5089 for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
5090 node; node = XEXP (node, 1))
5092 rtx item = XEXP (node, 0);
5093 if (GET_CODE (item) != USE)
5094 continue;
5095 item = XEXP (item, 0);
5096 if (!REG_P (item))
5097 continue;
5098 add_reg (GET_MODE (item), REGNO (item));
5102 /* Record that the return value from CALL_INSN (if any) needs to be
5103 preserved around the mode switch. */
5105 void
5106 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
5108 rtx pat = PATTERN (call_insn);
5109 gcc_assert (GET_CODE (pat) == PARALLEL);
5110 pat = XVECEXP (pat, 0, 0);
5111 if (GET_CODE (pat) == CALL)
5112 return;
5113 rtx dest = SET_DEST (pat);
5114 if (GET_CODE (dest) == PARALLEL)
5115 for (int i = 0; i < XVECLEN (dest, 0); ++i)
5117 rtx x = XVECEXP (dest, 0, i);
5118 gcc_assert (GET_CODE (x) == EXPR_LIST);
5119 rtx reg = XEXP (x, 0);
5120 add_reg (GET_MODE (reg), REGNO (reg));
5122 else
5123 add_reg (GET_MODE (dest), REGNO (dest));
5126 /* REGNO is a register that is call-preserved under the current function's ABI.
5127 Record that it must be preserved around the mode switch. */
5129 void
5130 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno)
5132 if (FP_REGNUM_P (regno))
5133 switch (crtl->abi->id ())
5135 case ARM_PCS_SVE:
5136 add_reg (VNx16QImode, regno);
5137 break;
5138 case ARM_PCS_SIMD:
5139 add_reg (V16QImode, regno);
5140 break;
5141 case ARM_PCS_AAPCS64:
5142 add_reg (DImode, regno);
5143 break;
5144 default:
5145 gcc_unreachable ();
5147 else if (PR_REGNUM_P (regno))
5148 add_reg (VNx16BImode, regno);
5151 /* The hard registers in REGS are call-preserved under the current function's
5152 ABI. Record that they must be preserved around the mode switch. */
5154 void
5155 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs)
5157 bitmap_iterator bi;
5158 unsigned int regno;
5159 EXECUTE_IF_SET_IN_BITMAP (regs, 0, regno, bi)
5160 if (HARD_REGISTER_NUM_P (regno))
5161 add_call_preserved_reg (regno);
5162 else
5163 break;
5166 /* Emit code to save registers before the mode switch. */
5168 void
5169 aarch64_sme_mode_switch_regs::emit_prologue ()
5171 emit_sve_sp_adjust (PROLOGUE);
5172 emit_sve_pred_moves (PROLOGUE);
5173 emit_sve_data_moves (PROLOGUE);
5174 emit_mem_128_moves (PROLOGUE);
5175 emit_gpr_moves (PROLOGUE);
5178 /* Emit code to restore registers after the mode switch. */
5180 void
5181 aarch64_sme_mode_switch_regs::emit_epilogue ()
5183 emit_gpr_moves (EPILOGUE);
5184 emit_mem_128_moves (EPILOGUE);
5185 emit_sve_pred_moves (EPILOGUE);
5186 emit_sve_data_moves (EPILOGUE);
5187 emit_sve_sp_adjust (EPILOGUE);
5190 /* The SVE predicate registers are stored below the SVE data registers,
5191 with the predicate save area being padded to a data-register-sized
5192 boundary. Return the size of this padded area as a whole number
5193 of data register slots. */
5195 unsigned int
5196 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5198 return CEIL (m_group_count[MEM_SVE_PRED], 8);
5201 /* Return a memory reference of mode MODE to OFFSET bytes from the
5202 stack pointer. */
5205 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
5206 poly_int64 offset)
5208 rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
5209 return gen_rtx_MEM (mode, addr);
5212 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5214 void
5215 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
5216 poly_int64 size)
5218 if (seq == PROLOGUE)
5219 size = -size;
5220 emit_insn (gen_rtx_SET (stack_pointer_rtx,
5221 plus_constant (Pmode, stack_pointer_rtx, size)));
5224 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5225 the stack pointer. SEQ chooses between saving and restoring. */
5227 void
5228 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
5229 const save_location &loc,
5230 poly_int64 offset)
5232 rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
5233 if (seq == PROLOGUE)
5234 emit_move_insn (mem, loc.reg);
5235 else
5236 emit_move_insn (loc.reg, mem);
5239 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5240 saving and restoring. */
5242 void
5243 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
5245 for (auto &loc : m_save_locations)
5246 if (loc.group == GPR)
5248 gcc_assert (loc.index < 8);
5249 rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
5250 if (seq == PROLOGUE)
5251 emit_move_insn (gpr, loc.reg);
5252 else
5253 emit_move_insn (loc.reg, gpr);
5257 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5258 between saving and restoring. */
5260 void
5261 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
5263 HOST_WIDE_INT count = m_group_count[MEM_128];
5264 if (count == 0)
5265 return;
5267 auto sp = stack_pointer_rtx;
5268 auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
5270 /* Pick a common mode that supports LDR & STR with pre/post-modification
5271 and LDP & STP with pre/post-modification. */
5272 auto mode = TFmode;
5274 /* An instruction pattern that should be emitted at the end. */
5275 rtx last_pat = NULL_RTX;
5277 /* A previous MEM_128 location that hasn't been handled yet. */
5278 save_location *prev_loc = nullptr;
5280 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5281 for (auto &loc : m_save_locations)
5282 if (loc.group == MEM_128)
5284 if (!prev_loc)
5286 prev_loc = &loc;
5287 continue;
5289 gcc_assert (loc.index == prev_loc->index + 1);
5291 /* The offset of the base of the save area from the current
5292 stack pointer. */
5293 HOST_WIDE_INT bias = 0;
5294 if (prev_loc->index == 0 && seq == PROLOGUE)
5295 bias = sp_adjust;
5297 /* Get the two sets in the LDP/STP. */
5298 rtx ops[] = {
5299 gen_rtx_REG (mode, REGNO (prev_loc->reg)),
5300 get_slot_mem (mode, prev_loc->index * 16 + bias),
5301 gen_rtx_REG (mode, REGNO (loc.reg)),
5302 get_slot_mem (mode, loc.index * 16 + bias)
5304 unsigned int lhs = (seq == PROLOGUE);
5305 rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
5306 rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
5308 /* Combine the sets with any stack allocation/deallocation. */
5309 rtx pat;
5310 if (prev_loc->index == 0)
5312 rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
5313 rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
5314 pat = gen_rtx_PARALLEL (VOIDmode, vec);
5316 else if (seq == PROLOGUE)
5317 pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
5318 else
5319 pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
5321 /* Queue a deallocation to the end, otherwise emit the
5322 instruction now. */
5323 if (seq == EPILOGUE && prev_loc->index == 0)
5324 last_pat = pat;
5325 else
5326 emit_insn (pat);
5327 prev_loc = nullptr;
5330 /* Handle any leftover LDR/STR. */
5331 if (prev_loc)
5333 rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
5334 rtx addr;
5335 if (prev_loc->index != 0)
5336 addr = plus_constant (Pmode, sp, prev_loc->index * 16);
5337 else if (seq == PROLOGUE)
5339 rtx allocate = plus_constant (Pmode, sp, -count * 16);
5340 addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
5342 else
5344 rtx deallocate = plus_constant (Pmode, sp, count * 16);
5345 addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
5347 rtx mem = gen_rtx_MEM (mode, addr);
5348 if (seq == PROLOGUE)
5349 emit_move_insn (mem, reg);
5350 else
5351 emit_move_insn (reg, mem);
5354 if (last_pat)
5355 emit_insn (last_pat);
5358 /* Allocate or deallocate the stack space needed by the SVE groups.
5359 SEQ chooses between allocating and deallocating. */
5361 void
5362 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
5364 if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
5365 emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
5368 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5369 and restoring. */
5371 void
5372 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
5374 for (auto &loc : m_save_locations)
5375 if (loc.group == MEM_SVE_DATA)
5377 auto index = loc.index + sve_data_headroom ();
5378 emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
5382 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5383 and restoring. */
5385 void
5386 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
5388 for (auto &loc : m_save_locations)
5389 if (loc.group == MEM_SVE_PRED)
5390 emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
5393 /* Set DEST to (vec_series BASE STEP). */
5395 static void
5396 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5398 machine_mode mode = GET_MODE (dest);
5399 scalar_mode inner = GET_MODE_INNER (mode);
5401 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5402 if (!aarch64_sve_index_immediate_p (base))
5403 base = force_reg (inner, base);
5404 if (!aarch64_sve_index_immediate_p (step))
5405 step = force_reg (inner, step);
5407 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
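/* On SVE this typically becomes a single INDEX instruction, whose
   immediate forms accept the [-16, 15] range mentioned above, e.g.
   "index\tz0.s, #0, #1" for base 0 and step 1; out-of-range operands
   are first forced into scalar registers by the code above.  */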
5410 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5411 register of mode MODE. Use TARGET for the result if it's nonnull
5412 and convenient.
5414 The two vector modes must have the same element mode. The behavior
5415 is to duplicate architectural lane N of SRC into architectural lanes
5416 N + I * STEP of the result. On big-endian targets, architectural
5417 lane 0 of an Advanced SIMD vector is the last element of the vector
5418 in memory layout, so for big-endian targets this operation has the
5419 effect of reversing SRC before duplicating it. Callers need to
5420 account for this. */
5423 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5425 machine_mode src_mode = GET_MODE (src);
5426 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5427 insn_code icode = (BYTES_BIG_ENDIAN
5428 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5429 : code_for_aarch64_vec_duplicate_vq_le (mode));
5431 unsigned int i = 0;
5432 expand_operand ops[3];
5433 create_output_operand (&ops[i++], target, mode);
5434 create_output_operand (&ops[i++], src, src_mode);
5435 if (BYTES_BIG_ENDIAN)
5437 /* Create a PARALLEL describing the reversal of SRC. */
5438 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5439 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5440 nelts_per_vq - 1, -1);
5441 create_fixed_operand (&ops[i++], sel);
5443 expand_insn (icode, i, ops);
5444 return ops[0].value;
5447 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5448 the memory image into DEST. Return true on success. */
5450 static bool
5451 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5453 src = force_const_mem (GET_MODE (src), src);
5454 if (!src)
5455 return false;
5457 /* Make sure that the address is legitimate. */
5458 if (!aarch64_sve_ld1rq_operand_p (src))
5460 rtx addr = force_reg (Pmode, XEXP (src, 0));
5461 src = replace_equiv_address (src, addr);
5464 machine_mode mode = GET_MODE (dest);
5465 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5466 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5467 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5468 return true;
5471 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5472 by N "background" values. Try to move it into TARGET using:
5474 PTRUE PRED.<T>, VL<N>
5475 MOV TRUE.<T>, #<foreground>
5476 MOV FALSE.<T>, #<background>
5477 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5479 The PTRUE is always a single instruction but the MOVs might need a
5480 longer sequence. If the background value is zero (as it often is),
5481 the sequence can sometimes collapse to a PTRUE followed by a
5482 zero-predicated move.
5484 Return the target on success, otherwise return null. */
5486 static rtx
5487 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5489 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5491 /* Make sure that the PTRUE is valid. */
5492 machine_mode mode = GET_MODE (src);
5493 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5494 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5495 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5496 == AARCH64_NUM_SVPATTERNS)
5497 return NULL_RTX;
5499 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5500 rtx_vector_builder true_builder (mode, npatterns, 1);
5501 rtx_vector_builder false_builder (mode, npatterns, 1);
5502 for (unsigned int i = 0; i < npatterns; ++i)
5504 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5505 pred_builder.quick_push (CONST1_RTX (BImode));
5507 for (unsigned int i = 0; i < npatterns; ++i)
5509 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5510 pred_builder.quick_push (CONST0_RTX (BImode));
5512 expand_operand ops[4];
5513 create_output_operand (&ops[0], target, mode);
5514 create_input_operand (&ops[1], true_builder.build (), mode);
5515 create_input_operand (&ops[2], false_builder.build (), mode);
5516 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5517 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5518 return target;
5521 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5522 SVE data mode and isn't a legitimate constant. Use TARGET for the
5523 result if convenient.
5525 The returned register can have whatever mode seems most natural
5526 given the contents of SRC. */
5528 static rtx
5529 aarch64_expand_sve_const_vector (rtx target, rtx src)
5531 machine_mode mode = GET_MODE (src);
5532 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5533 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5534 scalar_mode elt_mode = GET_MODE_INNER (mode);
5535 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5536 unsigned int container_bits = aarch64_sve_container_bits (mode);
5537 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5539 if (nelts_per_pattern == 1
5540 && encoded_bits <= 128
5541 && container_bits != elt_bits)
5543 /* We have a partial vector mode and a constant whose full-vector
5544 equivalent would occupy a repeating 128-bit sequence. Build that
5545 full-vector equivalent instead, so that we have the option of
5546 using LD1RQ and Advanced SIMD operations. */
5547 unsigned int repeat = container_bits / elt_bits;
5548 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5549 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5550 for (unsigned int i = 0; i < npatterns; ++i)
5551 for (unsigned int j = 0; j < repeat; ++j)
5552 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5553 target = aarch64_target_reg (target, full_mode);
5554 return aarch64_expand_sve_const_vector (target, builder.build ());
5557 if (nelts_per_pattern == 1 && encoded_bits == 128)
5559 /* The constant is a duplicated quadword but can't be narrowed
5560 beyond a quadword. Get the memory image of the first quadword
5561 as a 128-bit vector and try using LD1RQ to load it from memory.
5563 The effect for both endiannesses is to load memory lane N into
5564 architectural lanes N + I * STEP of the result. On big-endian
5565 targets, the layout of the 128-bit vector in an Advanced SIMD
5566 register would be different from its layout in an SVE register,
5567 but this 128-bit vector is a memory value only. */
5568 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5569 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5570 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5571 return target;
5574 if (nelts_per_pattern == 1 && encoded_bits < 128)
5576 /* The vector is a repeating sequence of 64 bits or fewer.
5577      See if we can load it using an Advanced SIMD move and then
5578 duplicate it to fill a vector. This is better than using a GPR
5579 move because it keeps everything in the same register file. */
5580 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5581 rtx_vector_builder builder (vq_mode, npatterns, 1);
5582 for (unsigned int i = 0; i < npatterns; ++i)
5584 /* We want memory lane N to go into architectural lane N,
5585 so reverse for big-endian targets. The DUP .Q pattern
5586 has a compensating reverse built-in. */
5587 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5588 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5590 rtx vq_src = builder.build ();
5591 if (aarch64_simd_valid_immediate (vq_src, NULL))
5593 vq_src = force_reg (vq_mode, vq_src);
5594 return aarch64_expand_sve_dupq (target, mode, vq_src);
5597 /* Get an integer representation of the repeating part of Advanced
5598 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5599 which for big-endian targets is lane-swapped wrt a normal
5600 Advanced SIMD vector. This means that for both endiannesses,
5601 memory lane N of SVE vector SRC corresponds to architectural
5602 lane N of a register holding VQ_SRC. This in turn means that
5603 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5604 as a single 128-bit value) and thus that memory lane 0 of SRC is
5605 in the lsb of the integer. Duplicating the integer therefore
5606 ensures that memory lane N of SRC goes into architectural lane
5607 N + I * INDEX of the SVE register. */
5608 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5609 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5610 if (elt_value)
5612 /* Pretend that we had a vector of INT_MODE to start with. */
5613 elt_mode = int_mode;
5614 mode = aarch64_full_sve_mode (int_mode).require ();
5616 /* If the integer can be moved into a general register by a
5617 single instruction, do that and duplicate the result. */
5618 if (CONST_INT_P (elt_value)
5619 && aarch64_move_imm (INTVAL (elt_value),
5620 encoded_bits <= 32 ? SImode : DImode))
5622 elt_value = force_reg (elt_mode, elt_value);
5623 return expand_vector_broadcast (mode, elt_value);
5626 else if (npatterns == 1)
5627 /* We're duplicating a single value, but can't do better than
5628 force it to memory and load from there. This handles things
5629 like symbolic constants. */
5630 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5632 if (elt_value)
5634 /* Load the element from memory if we can, otherwise move it into
5635 a register and use a DUP. */
5636 rtx op = force_const_mem (elt_mode, elt_value);
5637 if (!op)
5638 op = force_reg (elt_mode, elt_value);
5639 return expand_vector_broadcast (mode, op);
5643 /* Try using INDEX. */
5644 rtx base, step;
5645 if (const_vec_series_p (src, &base, &step))
5647 aarch64_expand_vec_series (target, base, step);
5648 return target;
5651 /* From here on, it's better to force the whole constant to memory
5652 if we can. */
5653 if (GET_MODE_NUNITS (mode).is_constant ())
5654 return NULL_RTX;
5656 if (nelts_per_pattern == 2)
5657 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5658 return res;
5660 /* Expand each pattern individually. */
5661 gcc_assert (npatterns > 1);
5662 rtx_vector_builder builder;
5663 auto_vec<rtx, 16> vectors (npatterns);
5664 for (unsigned int i = 0; i < npatterns; ++i)
5666 builder.new_vector (mode, 1, nelts_per_pattern);
5667 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5668 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5669 vectors.quick_push (force_reg (mode, builder.build ()));
5672 /* Use permutes to interleave the separate vectors. */
5673 while (npatterns > 1)
5675 npatterns /= 2;
5676 for (unsigned int i = 0; i < npatterns; ++i)
5678 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5679 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5680 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5681 vectors[i] = tmp;
5684 gcc_assert (vectors[0] == target);
5685 return target;
5688 /* Use WHILE to set a predicate register of mode MODE in which the first
5689 VL bits are set and the rest are clear. Use TARGET for the register
5690 if it's nonnull and convenient. */
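/* As an illustrative sketch, for MODE == VNx8BImode and VL == 3 the code
   below emits roughly:

	mov	x1, 3
	whilelo	p0.h, xzr, x1

   which leaves the first three .H predicate lanes active; register
   numbers here are arbitrary.  */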
5692 static rtx
5693 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5694 unsigned int vl)
5696 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5697 target = aarch64_target_reg (target, mode);
5698 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5699 target, const0_rtx, limit));
5700 return target;
5703 static rtx
5704 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5706 /* BUILDER is a constant predicate in which the index of every set bit
5707 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5708 by inverting every element at a multiple of ELT_SIZE and EORing the
5709 result with an ELT_SIZE PTRUE.
5711 Return a register that contains the constant on success, otherwise
5712 return null. Use TARGET as the register if it is nonnull and
5713 convenient. */
5715 static rtx
5716 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5717 unsigned int elt_size)
5719 /* Invert every element at a multiple of ELT_SIZE, keeping the
5720 other bits zero. */
5721 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5722 builder.nelts_per_pattern ());
5723 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5724 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5725 inv_builder.quick_push (const1_rtx);
5726 else
5727 inv_builder.quick_push (const0_rtx);
5728 inv_builder.finalize ();
5730 /* See if we can load the constant cheaply. */
5731 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5732 if (!inv)
5733 return NULL_RTX;
5735 /* EOR the result with an ELT_SIZE PTRUE. */
5736 rtx mask = aarch64_ptrue_all (elt_size);
5737 mask = force_reg (VNx16BImode, mask);
5738 inv = gen_lowpart (VNx16BImode, inv);
5739 target = aarch64_target_reg (target, VNx16BImode);
5740 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5741 return target;
5744 /* BUILDER is a constant predicate in which the index of every set bit
5745 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5746 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5747 register on success, otherwise return null. Use TARGET as the register
5748 if nonnull and convenient. */
5750 static rtx
5751 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5752 unsigned int elt_size,
5753 unsigned int permute_size)
5755 /* We're going to split the constant into two new constants A and B,
5756 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5757 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5759 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5760 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5762 where _ indicates elements that will be discarded by the permute.
5764 First calculate the ELT_SIZEs for A and B. */
5765 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5766 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5767 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5768 if (INTVAL (builder.elt (i)) != 0)
5770 if (i & permute_size)
5771 b_elt_size |= i - permute_size;
5772 else
5773 a_elt_size |= i;
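  /* Each x &= -x below isolates the lowest set bit of x, for example
     (0b1100 & -0b1100) == 0b0100.  The result is the largest power of
     two that divides every index ORed in above (and the initial
     DImode size).  */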
5775 a_elt_size &= -a_elt_size;
5776 b_elt_size &= -b_elt_size;
5778 /* Now construct the vectors themselves. */
5779 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5780 builder.nelts_per_pattern ());
5781 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5782 builder.nelts_per_pattern ());
5783 unsigned int nelts = builder.encoded_nelts ();
5784 for (unsigned int i = 0; i < nelts; ++i)
5785 if (i & (elt_size - 1))
5787 a_builder.quick_push (const0_rtx);
5788 b_builder.quick_push (const0_rtx);
5790 else if ((i & permute_size) == 0)
5792 /* The A and B elements are significant. */
5793 a_builder.quick_push (builder.elt (i));
5794 b_builder.quick_push (builder.elt (i + permute_size));
5796 else
5798 /* The A and B elements are going to be discarded, so pick whatever
5799 is likely to give a nice constant. We are targeting element
5800 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5801 with the aim of each being a sequence of ones followed by
5802 a sequence of zeros. So:
5804 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5805 duplicate the last X_ELT_SIZE element, to extend the
5806 current sequence of ones or zeros.
5808 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5809 zero, so that the constant really does have X_ELT_SIZE and
5810 not a smaller size. */
5811 if (a_elt_size > permute_size)
5812 a_builder.quick_push (const0_rtx);
5813 else
5814 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5815 if (b_elt_size > permute_size)
5816 b_builder.quick_push (const0_rtx);
5817 else
5818 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5820 a_builder.finalize ();
5821 b_builder.finalize ();
5823 /* Try loading A into a register. */
5824 rtx_insn *last = get_last_insn ();
5825 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5826 if (!a)
5827 return NULL_RTX;
5829 /* Try loading B into a register. */
5830 rtx b = a;
5831 if (a_builder != b_builder)
5833 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5834 if (!b)
5836 delete_insns_since (last);
5837 return NULL_RTX;
5841 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5842 operands but permutes them as though they had mode MODE. */
5843 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5844 target = aarch64_target_reg (target, GET_MODE (a));
5845 rtx type_reg = CONST0_RTX (mode);
5846 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5847 return target;
5850 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5851 constant in BUILDER into an SVE predicate register. Return the register
5852 on success, otherwise return null. Use TARGET for the register if
5853 nonnull and convenient.
5855 ALLOW_RECURSE_P is true if we can use methods that would call this
5856 function recursively. */
5858 static rtx
5859 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5860 bool allow_recurse_p)
5862 if (builder.encoded_nelts () == 1)
5863 /* A PFALSE or a PTRUE .B ALL. */
5864 return aarch64_emit_set_immediate (target, builder);
5866 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5867 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5869 /* If we can load the constant using PTRUE, use it as-is. */
5870 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5871 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5872 return aarch64_emit_set_immediate (target, builder);
5874 /* Otherwise use WHILE to set the first VL bits. */
5875 return aarch64_sve_move_pred_via_while (target, mode, vl);
5878 if (!allow_recurse_p)
5879 return NULL_RTX;
5881 /* Try inverting the vector in element size ELT_SIZE and then EORing
5882 the result with an ELT_SIZE PTRUE. */
5883 if (INTVAL (builder.elt (0)) == 0)
5884 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5885 elt_size))
5886 return res;
5888 /* Try using TRN1 to permute two simpler constants. */
5889 for (unsigned int i = elt_size; i <= 8; i *= 2)
5890 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5891 elt_size, i))
5892 return res;
5894 return NULL_RTX;
5897 /* Return an SVE predicate register that contains the VNx16BImode
5898 constant in BUILDER, without going through the move expanders.
5900 The returned register can have whatever mode seems most natural
5901 given the contents of BUILDER. Use TARGET for the result if
5902 convenient. */
5904 static rtx
5905 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5907 /* Try loading the constant using pure predicate operations. */
5908 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5909 return res;
5911 /* Try forcing the constant to memory. */
5912 if (builder.full_nelts ().is_constant ())
5913 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5915 target = aarch64_target_reg (target, VNx16BImode);
5916 emit_move_insn (target, mem);
5917 return target;
5920 /* The last resort is to load the constant as an integer and then
5921 compare it against zero. Use -1 for set bits in order to increase
5922    the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
5923 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5924 builder.nelts_per_pattern ());
5925 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5926 int_builder.quick_push (INTVAL (builder.elt (i))
5927 ? constm1_rtx : const0_rtx);
5928 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5929 int_builder.build ());
5932 /* Set DEST to immediate IMM. */
5934 void
5935 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5937 machine_mode mode = GET_MODE (dest);
5939 /* Check on what type of symbol it is. */
5940 scalar_int_mode int_mode;
5941 if ((SYMBOL_REF_P (imm)
5942 || LABEL_REF_P (imm)
5943 || GET_CODE (imm) == CONST
5944 || GET_CODE (imm) == CONST_POLY_INT)
5945 && is_a <scalar_int_mode> (mode, &int_mode))
5947 rtx mem;
5948 poly_int64 offset;
5949 HOST_WIDE_INT const_offset;
5950 enum aarch64_symbol_type sty;
5952 /* If we have (const (plus symbol offset)), separate out the offset
5953 before we start classifying the symbol. */
5954 rtx base = strip_offset (imm, &offset);
5956 /* We must always add an offset involving VL separately, rather than
5957 folding it into the relocation. */
5958 if (!offset.is_constant (&const_offset))
5960 if (!TARGET_SVE)
5962 aarch64_report_sve_required ();
5963 return;
5965 if (base == const0_rtx
5966 && (aarch64_sve_cnt_immediate_p (offset)
5967 || aarch64_sve_rdvl_immediate_p (offset)))
5968 emit_insn (gen_rtx_SET (dest, imm));
5969 else
5971 /* Do arithmetic on 32-bit values if the result is smaller
5972 than that. */
5973 if (partial_subreg_p (int_mode, SImode))
5975 /* It is invalid to do symbol calculations in modes
5976 narrower than SImode. */
5977 gcc_assert (base == const0_rtx);
5978 dest = gen_lowpart (SImode, dest);
5979 int_mode = SImode;
5981 if (base != const0_rtx)
5983 base = aarch64_force_temporary (int_mode, dest, base);
5984 aarch64_add_offset (int_mode, dest, base, offset,
5985 NULL_RTX, NULL_RTX, 0, false);
5987 else
5988 aarch64_add_offset (int_mode, dest, base, offset,
5989 dest, NULL_RTX, 0, false);
5991 return;
5994 if (aarch64_rdsvl_immediate_p (base))
5996 /* We could handle non-constant offsets if they are ever
5997 generated. */
5998 gcc_assert (const_offset == 0);
5999 emit_insn (gen_rtx_SET (dest, imm));
6000 return;
6003 sty = aarch64_classify_symbol (base, const_offset);
6004 switch (sty)
6006 case SYMBOL_FORCE_TO_MEM:
6007 if (int_mode != ptr_mode)
6008 imm = convert_memory_address (ptr_mode, imm);
6010 if (const_offset != 0
6011 && targetm.cannot_force_const_mem (ptr_mode, imm))
6013 gcc_assert (can_create_pseudo_p ());
6014 base = aarch64_force_temporary (int_mode, dest, base);
6015 aarch64_add_offset (int_mode, dest, base, const_offset,
6016 NULL_RTX, NULL_RTX, 0, false);
6017 return;
6020 mem = force_const_mem (ptr_mode, imm);
6021 gcc_assert (mem);
6023 /* If we aren't generating PC relative literals, then
6024 we need to expand the literal pool access carefully.
6025 This is something that needs to be done in a number
6026 of places, so could well live as a separate function. */
6027 if (!aarch64_pcrelative_literal_loads)
6029 gcc_assert (can_create_pseudo_p ());
6030 base = gen_reg_rtx (ptr_mode);
6031 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6032 if (ptr_mode != Pmode)
6033 base = convert_memory_address (Pmode, base);
6034 mem = gen_rtx_MEM (ptr_mode, base);
6037 if (int_mode != ptr_mode)
6038 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6040 emit_insn (gen_rtx_SET (dest, mem));
6042 return;
6044 case SYMBOL_SMALL_TLSGD:
6045 case SYMBOL_SMALL_TLSDESC:
6046 case SYMBOL_SMALL_TLSIE:
6047 case SYMBOL_SMALL_GOT_28K:
6048 case SYMBOL_SMALL_GOT_4G:
6049 case SYMBOL_TINY_GOT:
6050 case SYMBOL_TINY_TLSIE:
6051 if (const_offset != 0)
6053      gcc_assert (can_create_pseudo_p ());
6054 base = aarch64_force_temporary (int_mode, dest, base);
6055 aarch64_add_offset (int_mode, dest, base, const_offset,
6056 NULL_RTX, NULL_RTX, 0, false);
6057 return;
6059 /* FALLTHRU */
6061 case SYMBOL_SMALL_ABSOLUTE:
6062 case SYMBOL_TINY_ABSOLUTE:
6063 case SYMBOL_TLSLE12:
6064 case SYMBOL_TLSLE24:
6065 case SYMBOL_TLSLE32:
6066 case SYMBOL_TLSLE48:
6067 aarch64_load_symref_appropriately (dest, imm, sty);
6068 return;
6070 default:
6071 gcc_unreachable ();
6075 if (!CONST_INT_P (imm))
6077 if (aarch64_sve_pred_mode_p (mode))
6079 /* Only the low bit of each .H, .S and .D element is defined,
6080 so we can set the upper bits to whatever we like. If the
6081 predicate is all-true in MODE, prefer to set all the undefined
6082 bits as well, so that we can share a single .B predicate for
6083 all modes. */
6084 if (imm == CONSTM1_RTX (mode))
6085 imm = CONSTM1_RTX (VNx16BImode);
6087 /* All methods for constructing predicate modes wider than VNx16BI
6088 will set the upper bits of each element to zero. Expose this
6089 by moving such constants as a VNx16BI, so that all bits are
6090 significant and so that constants for different modes can be
6091 shared. The wider constant will still be available as a
6092 REG_EQUAL note. */
6093 rtx_vector_builder builder;
6094 if (aarch64_get_sve_pred_bits (builder, imm))
6096 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6097 if (dest != res)
6098 emit_move_insn (dest, gen_lowpart (mode, res));
6099 return;
6103 if (GET_CODE (imm) == HIGH
6104 || aarch64_simd_valid_immediate (imm, NULL))
6106 emit_insn (gen_rtx_SET (dest, imm));
6107 return;
6110 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6111 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6113 if (dest != res)
6114 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6115 return;
6118 rtx mem = force_const_mem (mode, imm);
6119 gcc_assert (mem);
6120 emit_move_insn (dest, mem);
6121 return;
6124 aarch64_internal_mov_immediate (dest, imm, true, mode);
6127 /* Return the MEM rtx that provides the canary value that should be used
6128 for stack-smashing protection. MODE is the mode of the memory.
6129 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6130 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6131 indicates whether the caller is performing a SET or a TEST operation. */
6133 rtx
6134 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6135 aarch64_salt_type salt_type)
6137 rtx addr;
6138 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6140 gcc_assert (MEM_P (decl_rtl));
6141 addr = XEXP (decl_rtl, 0);
6142 poly_int64 offset;
6143 rtx base = strip_offset_and_salt (addr, &offset);
6144 if (!SYMBOL_REF_P (base))
6145 return decl_rtl;
6147 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6148 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6149 addr = gen_rtx_CONST (Pmode, addr);
6150 addr = plus_constant (Pmode, addr, offset);
6152 else
6154 /* Calculate the address from the system register. */
6155 rtx salt = GEN_INT (salt_type);
6156 addr = gen_reg_rtx (mode);
6157 if (mode == DImode)
6158 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6159 else
6161 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6162 addr = convert_memory_address (Pmode, addr);
6164 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6166 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6169 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6170 that is known to contain PTRUE. */
6172 void
6173 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6175 expand_operand ops[3];
6176 machine_mode mode = GET_MODE (dest);
6177 create_output_operand (&ops[0], dest, mode);
6178   create_input_operand (&ops[1], pred, GET_MODE (pred));
6179 create_input_operand (&ops[2], src, mode);
6180 temporary_volatile_ok v (true);
6181 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6184 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6185 operand is in memory. In this case we need to use the predicated LD1
6186 and ST1 instead of LDR and STR, both for correctness on big-endian
6187 targets and because LD1 and ST1 support a wider range of addressing modes.
6188 PRED_MODE is the mode of the predicate.
6190 See the comment at the head of aarch64-sve.md for details about the
6191 big-endian handling. */
6193 void
6194 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6196 machine_mode mode = GET_MODE (dest);
6197 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6198 if (!register_operand (src, mode)
6199 && !register_operand (dest, mode))
6201 rtx tmp = gen_reg_rtx (mode);
6202 if (MEM_P (src))
6203 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6204 else
6205 emit_move_insn (tmp, src);
6206 src = tmp;
6208 aarch64_emit_sve_pred_move (dest, ptrue, src);
6211 /* Called only on big-endian targets. See whether an SVE vector move
6212 from SRC to DEST is effectively a REV[BHW] instruction, because at
6213 least one operand is a subreg of an SVE vector that has wider or
6214 narrower elements. Return true and emit the instruction if so.
6216 For example:
6218 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6220 represents a VIEW_CONVERT between the following vectors, viewed
6221 in memory order:
6223 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6224 R1: { [0], [1], [2], [3], ... }
6226 The high part of lane X in R2 should therefore correspond to lane X*2
6227 of R1, but the register representations are:
6229 msb lsb
6230 R2: ...... [1].high [1].low [0].high [0].low
6231 R1: ...... [3] [2] [1] [0]
6233 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6234 We therefore need a reverse operation to swap the high and low values
6235 around.
6237 This is purely an optimization. Without it we would spill the
6238 subreg operand to the stack in one mode and reload it in the
6239 other mode, which has the same effect as the REV. */
6241 bool
6242 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6244 gcc_assert (BYTES_BIG_ENDIAN);
6246 /* Do not try to optimize subregs that LRA has created for matched
6247 reloads. These subregs only exist as a temporary measure to make
6248 the RTL well-formed, but they are exempt from the usual
6249 TARGET_CAN_CHANGE_MODE_CLASS rules.
6251 For example, if we have:
6253 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6255 and the constraints require R1 and R2 to be in the same register,
6256 LRA may need to create RTL such as:
6258 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6259 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6260 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6262 which forces both the input and output of the original instruction
6263 to use the same hard register. But for this to work, the normal
6264 rules have to be suppressed on the subreg input, otherwise LRA
6265 would need to reload that input too, meaning that the process
6266 would never terminate. To compensate for this, the normal rules
6267 are also suppressed for the subreg output of the first move.
6268 Ignoring the special case and handling the first move normally
6269 would therefore generate wrong code: we would reverse the elements
6270 for the first subreg but not reverse them back for the second subreg. */
6271 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6272 dest = SUBREG_REG (dest);
6273 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6274 src = SUBREG_REG (src);
6276 /* The optimization handles two single SVE REGs with different element
6277 sizes. */
6278 if (!REG_P (dest)
6279 || !REG_P (src)
6280 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6281 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6282 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6283 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6284 return false;
6286 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6287 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6288 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6289 UNSPEC_REV_SUBREG);
6290 emit_insn (gen_rtx_SET (dest, unspec));
6291 return true;
6294 /* Return a copy of X with mode MODE, without changing its other
6295 attributes. Unlike gen_lowpart, this doesn't care whether the
6296 mode change is valid. */
6298 rtx
6299 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6301 if (GET_MODE (x) == mode)
6302 return x;
6304 x = shallow_copy_rtx (x);
6305 set_mode_and_regno (x, mode, REGNO (x));
6306 return x;
6309 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6310 stored in wider integer containers. */
6312 static unsigned int
6313 aarch64_sve_rev_unspec (machine_mode mode)
6315 switch (GET_MODE_UNIT_SIZE (mode))
6317 case 1: return UNSPEC_REVB;
6318 case 2: return UNSPEC_REVH;
6319 case 4: return UNSPEC_REVW;
6321 gcc_unreachable ();
6324 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6325 operands. */
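/* For example, a big-endian move between a VNx16QI register and a
   VNx8HI view of it is split into a byte reverse within .H containers,
   roughly:  revb  z0.h, p0/m, z1.h  (register numbers illustrative).  */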
6327 void
6328 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6330 /* Decide which REV operation we need. The mode with wider elements
6331 determines the mode of the operands and the mode with the narrower
6332 elements determines the reverse width. */
6333 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6334 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6335 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6336 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6337 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6339 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6340 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6342 /* Get the operands in the appropriate modes and emit the instruction. */
6343 ptrue = gen_lowpart (pred_mode, ptrue);
6344 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6345 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6346 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6347 dest, ptrue, src));
6350 static bool
6351 aarch64_function_ok_for_sibcall (tree, tree exp)
6353 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6354 return false;
6356 tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
6357 if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
6358 return false;
6359 for (auto state : { "za", "zt0" })
6360 if (bool (aarch64_cfun_shared_flags (state))
6361 != bool (aarch64_fntype_shared_flags (fntype, state)))
6362 return false;
6363 return true;
6366 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6367 passed in SVE registers. */
6369 static bool
6370 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6371 const function_arg_info &arg)
6373 HOST_WIDE_INT size;
6374 machine_mode dummymode;
6375 int nregs;
6377 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6378 if (arg.mode == BLKmode && arg.type)
6379 size = int_size_in_bytes (arg.type);
6380 else
6381 /* No frontends can create types with variable-sized modes, so we
6382 shouldn't be asked to pass or return them. */
6383 size = GET_MODE_SIZE (arg.mode).to_constant ();
6385 /* Aggregates are passed by reference based on their size. */
6386 if (arg.aggregate_type_p ())
6387 size = int_size_in_bytes (arg.type);
6389   /* Variable-sized arguments are always passed by reference.  */
6390 if (size < 0)
6391 return true;
6393 /* Can this be a candidate to be passed in fp/simd register(s)? */
6394 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6395 &dummymode, &nregs, NULL,
6396 !pcum || pcum->silent_p))
6397 return false;
6399   /* Arguments that are variable-sized or larger than 2 registers are
6400      passed by reference unless they are a homogeneous floating-point
6401      aggregate.  */
6402 return size > 2 * UNITS_PER_WORD;
6405 /* Implement TARGET_PASS_BY_REFERENCE. */
6407 static bool
6408 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6409 const function_arg_info &arg)
6411 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6413 if (!arg.type)
6414 return aarch64_pass_by_reference_1 (pcum, arg);
6416 pure_scalable_type_info pst_info;
6417 switch (pst_info.analyze (arg.type))
6419 case pure_scalable_type_info::IS_PST:
6420 if (pcum && !pcum->silent_p && !TARGET_SVE)
6421 /* We can't gracefully recover at this point, so make this a
6422 fatal error. */
6423 fatal_error (input_location, "arguments of type %qT require"
6424 " the SVE ISA extension", arg.type);
6426 /* Variadic SVE types are passed by reference. Normal non-variadic
6427 arguments are too if we've run out of registers. */
6428 return (!arg.named
6429 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6430 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6432 case pure_scalable_type_info::DOESNT_MATTER:
6433 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6434 return true;
6436 case pure_scalable_type_info::NO_ABI_IDENTITY:
6437 case pure_scalable_type_info::ISNT_PST:
6438 return aarch64_pass_by_reference_1 (pcum, arg);
6440 gcc_unreachable ();
6443 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6444 static bool
6445 aarch64_return_in_msb (const_tree valtype)
6447 machine_mode dummy_mode;
6448 int dummy_int;
6450 /* Never happens in little-endian mode. */
6451 if (!BYTES_BIG_ENDIAN)
6452 return false;
6454 /* Only composite types smaller than or equal to 16 bytes can
6455 be potentially returned in registers. */
6456 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6457 || int_size_in_bytes (valtype) <= 0
6458 || int_size_in_bytes (valtype) > 16)
6459 return false;
6461 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6462 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6463 is always passed/returned in the least significant bits of fp/simd
6464 register(s). */
6465 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6466 &dummy_mode, &dummy_int, NULL,
6467 false))
6468 return false;
6470 /* Likewise pure scalable types for SVE vector and predicate registers. */
6471 pure_scalable_type_info pst_info;
6472 if (pst_info.analyze_registers (valtype))
6473 return false;
6475 return true;
6478 /* Implement TARGET_FUNCTION_VALUE.
6479 Define how to find the value returned by a function. */
6481 static rtx
6482 aarch64_function_value (const_tree type, const_tree func,
6483 bool outgoing ATTRIBUTE_UNUSED)
6485 machine_mode mode;
6486 int unsignedp;
6488 mode = TYPE_MODE (type);
6489 if (INTEGRAL_TYPE_P (type))
6490 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6492 pure_scalable_type_info pst_info;
6493 if (type && pst_info.analyze_registers (type))
6494 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6496 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6497 are returned in memory, not by value. */
6498 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6499 bool sve_p = (vec_flags & VEC_ANY_SVE);
6501 if (aarch64_return_in_msb (type))
6503 HOST_WIDE_INT size = int_size_in_bytes (type);
6505 if (size % UNITS_PER_WORD != 0)
6507 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6508 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6512 int count;
6513 machine_mode ag_mode;
6514 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6515 NULL, false))
6517 gcc_assert (!sve_p);
6518 if (!aarch64_composite_type_p (type, mode))
6520 gcc_assert (count == 1 && mode == ag_mode);
6521 return gen_rtx_REG (mode, V0_REGNUM);
6523 else if (aarch64_advsimd_full_struct_mode_p (mode)
6524 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6525 return gen_rtx_REG (mode, V0_REGNUM);
6526 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6527 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6528 return gen_rtx_REG (mode, V0_REGNUM);
6529 else
6531 int i;
6532 rtx par;
6534 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6535 for (i = 0; i < count; i++)
6537 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6538 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6539 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6540 XVECEXP (par, 0, i) = tmp;
6542 return par;
6545 else
6547 if (sve_p)
6549 /* Vector types can acquire a partial SVE mode using things like
6550 __attribute__((vector_size(N))), and this is potentially useful.
6551 However, the choice of mode doesn't affect the type's ABI
6552 identity, so we should treat the types as though they had
6553 the associated integer mode, just like they did before SVE
6554 was introduced.
6556 We know that the vector must be 128 bits or smaller,
6557 otherwise we'd have returned it in memory instead. */
6558 gcc_assert (type
6559 && (aarch64_some_values_include_pst_objects_p (type)
6560 || (vec_flags & VEC_PARTIAL)));
6562 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6563 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6564 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6565 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6567 return gen_rtx_REG (mode, R0_REGNUM);
6571 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6572 Return true if REGNO is the number of a hard register in which the values
6573 of called function may come back. */
6575 static bool
6576 aarch64_function_value_regno_p (const unsigned int regno)
6578   /* A maximum of 16 bytes can be returned in the general registers.  Examples
6579 of 16-byte return values are: 128-bit integers and 16-byte small
6580 structures (excluding homogeneous floating-point aggregates). */
6581 if (regno == R0_REGNUM || regno == R1_REGNUM)
6582 return true;
6584 /* Up to four fp/simd registers can return a function value, e.g. a
6585 homogeneous floating-point aggregate having four members. */
6586 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6587 return TARGET_FLOAT;
6589 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
6590 return TARGET_SVE;
6592 return false;
6595 /* Subroutine for aarch64_return_in_memory for types that are not returned
6596 in SVE registers. */
6598 static bool
6599 aarch64_return_in_memory_1 (const_tree type)
6601 HOST_WIDE_INT size;
6602 machine_mode ag_mode;
6603 int count;
6605 if (!AGGREGATE_TYPE_P (type)
6606 && TREE_CODE (type) != BITINT_TYPE
6607 && TREE_CODE (type) != COMPLEX_TYPE
6608 && TREE_CODE (type) != VECTOR_TYPE)
6609     /* Simple scalar types are always returned in registers.  */
6610 return false;
6612 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6613 &ag_mode, &count, NULL, false))
6614 return false;
6616   /* Types larger than 2 registers are returned in memory.  */
6617 size = int_size_in_bytes (type);
6618 return (size < 0 || size > 2 * UNITS_PER_WORD);
6621 /* Implement TARGET_RETURN_IN_MEMORY.
6623 If the type T of the result of a function is such that
6624 void func (T arg)
6625 would require that arg be passed as a value in a register (or set of
6626 registers) according to the parameter passing rules, then the result
6627 is returned in the same registers as would be used for such an
6628 argument. */
6630 static bool
6631 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6633 pure_scalable_type_info pst_info;
6634 switch (pst_info.analyze (type))
6636 case pure_scalable_type_info::IS_PST:
6637 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6638 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6640 case pure_scalable_type_info::DOESNT_MATTER:
6641 gcc_assert (aarch64_return_in_memory_1 (type));
6642 return true;
6644 case pure_scalable_type_info::NO_ABI_IDENTITY:
6645 case pure_scalable_type_info::ISNT_PST:
6646 return aarch64_return_in_memory_1 (type);
6648 gcc_unreachable ();
6651 static bool
6652 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6653 const_tree type, int *nregs)
6655 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6656 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6657 &pcum->aapcs_vfp_rmode,
6658 nregs, NULL, pcum->silent_p);
6661 /* Given MODE and TYPE of a function argument, return the alignment in
6662 bits. The idea is to suppress any stronger alignment requested by
6663 the user and opt for the natural alignment (specified in AAPCS64 \S
6664 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6665 was incorrectly calculated in versions of GCC prior to GCC 9.
6666 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6667 calculated in versions between GCC 9 and GCC 13. If the alignment
6668 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6669 is the old GCC 13 alignment, otherwise it is zero.
6671 This is a helper function for local use only. */
6673 static unsigned int
6674 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6675 unsigned int *abi_break_gcc_9,
6676 unsigned int *abi_break_gcc_13,
6677 unsigned int *abi_break_gcc_14)
6679 *abi_break_gcc_9 = 0;
6680 *abi_break_gcc_13 = 0;
6681 *abi_break_gcc_14 = 0;
6682 if (!type)
6683 return GET_MODE_ALIGNMENT (mode);
6685 if (integer_zerop (TYPE_SIZE (type)))
6686 return 0;
6688 gcc_assert (TYPE_MODE (type) == mode);
6690 if (!AGGREGATE_TYPE_P (type))
6692 /* The ABI alignment is the natural alignment of the type, without
6693 any attributes applied. Normally this is the alignment of the
6694 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6695 For now we just handle the known exceptions explicitly. */
6696 type = TYPE_MAIN_VARIANT (type);
6697 if (POINTER_TYPE_P (type))
6699 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
6700 return POINTER_SIZE;
6702 if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
6704 *abi_break_gcc_14 = TYPE_ALIGN (type);
6705 type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
6707 gcc_assert (!TYPE_USER_ALIGN (type));
6708 return TYPE_ALIGN (type);
6711 if (TREE_CODE (type) == ARRAY_TYPE)
6712 return TYPE_ALIGN (TREE_TYPE (type));
6714 unsigned int alignment = 0;
6715 unsigned int bitfield_alignment_with_packed = 0;
6716 unsigned int bitfield_alignment = 0;
6717 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6718 if (TREE_CODE (field) == FIELD_DECL)
6720 /* Note that we explicitly consider zero-sized fields here,
6721 even though they don't map to AAPCS64 machine types.
6722 For example, in:
6724 struct __attribute__((aligned(8))) empty {};
6726 struct s {
6727 [[no_unique_address]] empty e;
6728 int x;
6731 "s" contains only one Fundamental Data Type (the int field)
6732 but gains 8-byte alignment and size thanks to "e". */
6733 alignment = std::max (alignment, DECL_ALIGN (field));
6734 if (DECL_BIT_FIELD_TYPE (field))
6736 /* Take the bit-field type's alignment into account only
6737 if the user didn't reduce this field's alignment with
6738 the packed attribute. */
6739 if (!DECL_PACKED (field))
6740 bitfield_alignment
6741 = std::max (bitfield_alignment,
6742 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6744 /* Compute the alignment even if the bit-field is
6745 packed, so that we can emit a warning in case the
6746 alignment changed between GCC versions. */
6747 bitfield_alignment_with_packed
6748 = std::max (bitfield_alignment_with_packed,
6749 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6753 /* Emit a warning if the alignment is different when taking the
6754 'packed' attribute into account. */
6755 if (bitfield_alignment != bitfield_alignment_with_packed
6756 && bitfield_alignment_with_packed > alignment)
6757 *abi_break_gcc_13 = bitfield_alignment_with_packed;
6759 if (bitfield_alignment > alignment)
6761 *abi_break_gcc_9 = alignment;
6762 return bitfield_alignment;
6765 return alignment;
6768 /* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
6769 _BitInt(N) type. These include ARRAY_TYPE's with an element that is a
6770 _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
6771 with a field member that is a _BitInt(N) or an aggregate that uses it.
6772 Return false otherwise. */
6774 static bool
6775 bitint_or_aggr_of_bitint_p (tree type)
6777 if (!type)
6778 return false;
6780 if (TREE_CODE (type) == BITINT_TYPE)
6781 return true;
6783   /* If ARRAY_TYPE, check its element type.  */
6784 if (TREE_CODE (type) == ARRAY_TYPE)
6785 return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));
6787 /* If RECORD_TYPE or UNION_TYPE, check the fields' types. */
6788 if (RECORD_OR_UNION_TYPE_P (type))
6789 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
6791 if (TREE_CODE (field) != FIELD_DECL)
6792 continue;
6793 if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
6794 return true;
6796 return false;
6799 /* Layout a function argument according to the AAPCS64 rules. The rule
6800 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6801 mode that was originally given to us by the target hook, whereas the
6802 mode in ARG might be the result of replacing partial SVE modes with
6803 the equivalent integer mode. */
6805 static void
6806 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6808 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6809 tree type = arg.type;
6810 machine_mode mode = arg.mode;
6811 int ncrn, nvrn, nregs;
6812 bool allocate_ncrn, allocate_nvrn;
6813 HOST_WIDE_INT size;
6814 unsigned int abi_break_gcc_9;
6815 unsigned int abi_break_gcc_13;
6816 unsigned int abi_break_gcc_14;
6818 /* We need to do this once per argument. */
6819 if (pcum->aapcs_arg_processed)
6820 return;
6822 bool warn_pcs_change
6823 = (warn_psabi
6824 && !pcum->silent_p
6825 && (currently_expanding_function_start
6826 || currently_expanding_gimple_stmt));
6828 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6830 typedef struct foo {
6831 __Int8x16_t foo[2] __attribute__((aligned(32)));
6832 } foo;
6834 is still a HVA despite its larger-than-normal alignment.
6835 However, such over-aligned HFAs and HVAs are guaranteed to have
6836 no padding.
6838 If we exclude HFAs and HVAs from the discussion below, then there
6839 are several things to note:
6841 - Both the C and AAPCS64 interpretations of a type's alignment should
6842 give a value that is no greater than the type's size.
6844 - Types bigger than 16 bytes are passed indirectly.
6846 - If an argument of type T is passed indirectly, TYPE and MODE describe
6847        a pointer to T rather than T itself.
6849 It follows that the AAPCS64 alignment of TYPE must be no greater
6850 than 16 bytes.
6852 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6853 and so could calculate an alignment that was too small. If this
6854 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6856 Although GCC 9.1 fixed that bug, it introduced a different one:
6857 it would consider the alignment of a bitfield's underlying type even
6858 if the field was packed (which should have the effect of overriding
6859 the alignment of the underlying type). This was fixed in GCC 13.1.
6861 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6862 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6863 this older, too-big alignment.
6865 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6866 alignments meant they could calculate type alignments that were
6867 bigger than the type's size, contrary to the assumption above.
6868 The handling of register arguments was nevertheless (and justifiably)
6869 written to follow the assumption that the alignment can never be
6870 greater than the size. The same was not true for stack arguments;
6871 their alignment was instead handled by MIN bounds in
6872 aarch64_function_arg_boundary.
6874 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6875 an alignment of more than 16 bytes for TYPE then:
6877 - If the argument was passed in registers, these GCC versions
6878 would treat the alignment as though it was *less than* 16 bytes.
6880 - If the argument was passed on the stack, these GCC versions
6881 would treat the alignment as though it was *equal to* 16 bytes.
6883 Both behaviors were wrong, but in different cases. */
6885 pcum->aapcs_arg_processed = true;
6887 pure_scalable_type_info pst_info;
6888 if (type && pst_info.analyze_registers (type))
6890 /* aarch64_function_arg_alignment has never had an effect on
6891 this case. */
6893 /* The PCS says that it is invalid to pass an SVE value to an
6894 unprototyped function. There is no ABI-defined location we
6895 can return in this case, so we have no real choice but to raise
6896 an error immediately, even though this is only a query function. */
6897 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6899 gcc_assert (!pcum->silent_p);
6900 error ("SVE type %qT cannot be passed to an unprototyped function",
6901 arg.type);
6902 /* Avoid repeating the message, and avoid tripping the assert
6903 below. */
6904 pcum->pcs_variant = ARM_PCS_SVE;
6907 /* We would have converted the argument into pass-by-reference
6908 form if it didn't fit in registers. */
6909 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6910 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6911 gcc_assert (arg.named
6912 && pcum->pcs_variant == ARM_PCS_SVE
6913 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6914 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6915 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6916 P0_REGNUM + pcum->aapcs_nprn);
6917 return;
6920 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6921 are passed by reference, not by value. */
6922 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6923 bool sve_p = (vec_flags & VEC_ANY_SVE);
6924 if (sve_p)
6925 /* Vector types can acquire a partial SVE mode using things like
6926 __attribute__((vector_size(N))), and this is potentially useful.
6927 However, the choice of mode doesn't affect the type's ABI
6928 identity, so we should treat the types as though they had
6929 the associated integer mode, just like they did before SVE
6930 was introduced.
6932 We know that the vector must be 128 bits or smaller,
6933 otherwise we'd have passed it in memory instead. */
6934 gcc_assert (type
6935 && (aarch64_some_values_include_pst_objects_p (type)
6936 || (vec_flags & VEC_PARTIAL)));
6938   /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
6939 if (type)
6940 size = int_size_in_bytes (type);
6941 else
6942 /* No frontends can create types with variable-sized modes, so we
6943 shouldn't be asked to pass or return them. */
6944 size = GET_MODE_SIZE (mode).to_constant ();
6945 size = ROUND_UP (size, UNITS_PER_WORD);
6947 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6948 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6949 mode,
6950 type,
6951 &nregs);
6952 gcc_assert (!sve_p || !allocate_nvrn);
6954 unsigned int alignment
6955 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
6956 &abi_break_gcc_13, &abi_break_gcc_14);
6958 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
6959 && (!alignment || abi_break_gcc_9 < alignment)
6960 && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
6962 /* _BitInt(N) was only added in GCC 14. */
6963 bool warn_pcs_change_le_gcc14
6964 = warn_pcs_change && !bitint_or_aggr_of_bitint_p (type);
6966   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6967 The following code thus handles passing by SIMD/FP registers first. */
6969 nvrn = pcum->aapcs_nvrn;
6971   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
6972      and homogeneous short-vector aggregates (HVA).  */
6973 if (allocate_nvrn)
6975 /* aarch64_function_arg_alignment has never had an effect on
6976 this case. */
6977 if (!pcum->silent_p && !TARGET_FLOAT)
6978 aarch64_err_no_fpadvsimd (mode);
6980 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6982 pcum->aapcs_nextnvrn = nvrn + nregs;
6983 if (!aarch64_composite_type_p (type, mode))
6985 gcc_assert (nregs == 1);
6986 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6988 else if (aarch64_advsimd_full_struct_mode_p (mode)
6989 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
6990 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6991 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6992 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
6993 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6994 else
6996 rtx par;
6997 int i;
6998 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6999 for (i = 0; i < nregs; i++)
7001 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7002 V0_REGNUM + nvrn + i);
7003 rtx offset = gen_int_mode
7004 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7005 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7006 XVECEXP (par, 0, i) = tmp;
7008 pcum->aapcs_reg = par;
7010 return;
7012 else
7014 /* C.3 NSRN is set to 8. */
7015 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7016 goto on_stack;
7020 ncrn = pcum->aapcs_ncrn;
7021 nregs = size / UNITS_PER_WORD;
7023   /* C6 - C9, though the sign and zero extension semantics are
7024      handled elsewhere.  This is the case where the argument fits
7025      entirely in general registers.  */
7026 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7028 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7030 /* C.8 if the argument has an alignment of 16 then the NGRN is
7031 rounded up to the next even number. */
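      /* For example, a 16-byte-aligned two-register argument that
	 arrives when NGRN is 1 is passed in x2/x3 instead, and x1 is
	 left unused.  */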
7032 if (nregs == 2
7033 && ncrn % 2)
7035 /* Emit a warning if the alignment changed when taking the
7036 'packed' attribute into account. */
7037 if (warn_pcs_change_le_gcc14
7038 && abi_break_gcc_13
7039 && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
7040 != (alignment == 16 * BITS_PER_UNIT)))
7041 inform (input_location, "parameter passing for argument of type "
7042 "%qT changed in GCC 13.1", type);
7044 if (warn_pcs_change_le_gcc14
7045 && abi_break_gcc_14
7046 && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
7047 != (alignment == 16 * BITS_PER_UNIT)))
7048 inform (input_location, "parameter passing for argument of type "
7049 "%qT changed in GCC 14.1", type);
7051 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7052 comparison is there because for > 16 * BITS_PER_UNIT
7053 alignment nregs should be > 2 and therefore it should be
7054 passed by reference rather than value. */
7055 if (alignment == 16 * BITS_PER_UNIT)
7057 if (warn_pcs_change_le_gcc14
7058 && abi_break_gcc_9)
7059 inform (input_location, "parameter passing for argument of type "
7060 "%qT changed in GCC 9.1", type);
7061 ++ncrn;
7062 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7066 /* If an argument with an SVE mode needs to be shifted up to the
7067 high part of the register, treat it as though it had an integer mode.
7068 Using the normal (parallel [...]) would suppress the shifting. */
7069 if (sve_p
7070 && BYTES_BIG_ENDIAN
7071 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7072 && aarch64_pad_reg_upward (mode, type, false))
7074 mode = int_mode_for_mode (mode).require ();
7075 sve_p = false;
7078 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7079 A reg is still generated for it, but the caller should be smart
7080 enough not to use it. */
7081 if (nregs == 0
7082 || (nregs == 1 && !sve_p)
7083 || GET_MODE_CLASS (mode) == MODE_INT)
7084 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7085 else
7087 rtx par;
7088 int i;
7090 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7091 for (i = 0; i < nregs; i++)
7093 scalar_int_mode reg_mode = word_mode;
7094 if (nregs == 1)
7095 reg_mode = int_mode_for_mode (mode).require ();
7096 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7097 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7098 GEN_INT (i * UNITS_PER_WORD));
7099 XVECEXP (par, 0, i) = tmp;
7101 pcum->aapcs_reg = par;
7104 pcum->aapcs_nextncrn = ncrn + nregs;
7105 return;
7108 /* C.11 */
7109 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7111   /* The argument is passed on the stack; record the needed number of words for
7112 this argument and align the total size if necessary. */
7113 on_stack:
7114 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7116 if (warn_pcs_change_le_gcc14
7117 && abi_break_gcc_13
7118 && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
7119 != (alignment >= 16 * BITS_PER_UNIT)))
7120 inform (input_location, "parameter passing for argument of type "
7121 "%qT changed in GCC 13.1", type);
7123 if (warn_pcs_change_le_gcc14
7124 && abi_break_gcc_14
7125 && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
7126 != (alignment >= 16 * BITS_PER_UNIT)))
7127 inform (input_location, "parameter passing for argument of type "
7128 "%qT changed in GCC 14.1", type);
7130 if (alignment == 16 * BITS_PER_UNIT)
7132 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7133 if (pcum->aapcs_stack_size != new_size)
7135 if (warn_pcs_change_le_gcc14
7136 && abi_break_gcc_9)
7137 inform (input_location, "parameter passing for argument of type "
7138 "%qT changed in GCC 9.1", type);
7139 pcum->aapcs_stack_size = new_size;
7142 return;
7145 /* Add the current argument register to the set of those that need
7146 to be saved and restored around a change to PSTATE.SM. */
7148 static void
7149 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7151 subrtx_var_iterator::array_type array;
7152 FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
7154 rtx x = *iter;
7155 if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
7157 unsigned int i = pcum->num_sme_mode_switch_args++;
7158 gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
7159 pcum->sme_mode_switch_args[i] = x;
7164 /* Return a parallel that contains all the registers that need to be
7165 saved around a change to PSTATE.SM. Return const0_rtx if there is
7166 no such mode switch, or if no registers need to be saved. */
7168 static rtx
7169 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7171 if (!pcum->num_sme_mode_switch_args)
7172 return const0_rtx;
7174 auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
7175 pcum->sme_mode_switch_args);
7176 return gen_rtx_PARALLEL (VOIDmode, argvec);
7179 /* Implement TARGET_FUNCTION_ARG. */
7181 static rtx
7182 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7184 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7185 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7186 || pcum->pcs_variant == ARM_PCS_SIMD
7187 || pcum->pcs_variant == ARM_PCS_SVE);
7189 if (arg.end_marker_p ())
7191 rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
7192 pcum->pcs_variant);
7193 rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
7194 rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
7195 rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
7196 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
7197 sme_mode_switch_args,
7198 shared_za_flags,
7199 shared_zt0_flags));
7202 aarch64_layout_arg (pcum_v, arg);
7203 return pcum->aapcs_reg;
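/* Initialize PCUM for a call to a function of type FNTYPE (or to FNDECL,
   or to the libcall LIBNAME): reset the argument-register counters,
   take the PCS variant and ISA mode from FNTYPE where available, and,
   unless SILENT_P, diagnose FP/SIMD or SVE argument passing when the
   required ISA support is not enabled.  */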
7206 void
7207 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7208 const_tree fntype,
7209 rtx libname ATTRIBUTE_UNUSED,
7210 const_tree fndecl,
7211 unsigned n_named ATTRIBUTE_UNUSED,
7212 bool silent_p)
7214 pcum->aapcs_ncrn = 0;
7215 pcum->aapcs_nvrn = 0;
7216 pcum->aapcs_nprn = 0;
7217 pcum->aapcs_nextncrn = 0;
7218 pcum->aapcs_nextnvrn = 0;
7219 pcum->aapcs_nextnprn = 0;
7220 if (fntype)
7222 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7223 pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
7225 else
7227 pcum->pcs_variant = ARM_PCS_AAPCS64;
7228 pcum->isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
7230 pcum->aapcs_reg = NULL_RTX;
7231 pcum->aapcs_arg_processed = false;
7232 pcum->aapcs_stack_words = 0;
7233 pcum->aapcs_stack_size = 0;
7234 pcum->silent_p = silent_p;
7235 pcum->shared_za_flags
7236 = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
7237 pcum->shared_zt0_flags
7238 = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
7239 pcum->num_sme_mode_switch_args = 0;
7241 if (!silent_p
7242 && !TARGET_FLOAT
7243 && fntype && fntype != error_mark_node)
7245 const_tree type = TREE_TYPE (fntype);
7246 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7247 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7248 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7249 &mode, &nregs, NULL, false))
7250 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7253 if (!silent_p
7254 && !TARGET_SVE
7255 && pcum->pcs_variant == ARM_PCS_SVE)
7257 /* We can't gracefully recover at this point, so make this a
7258 fatal error. */
7259 if (fndecl)
7260 fatal_error (input_location, "%qE requires the SVE ISA extension",
7261 fndecl);
7262 else
7263 fatal_error (input_location, "calls to functions of type %qT require"
7264 " the SVE ISA extension", fntype);
7268 static void
7269 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7270 const function_arg_info &arg)
7272 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7273 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7274 || pcum->pcs_variant == ARM_PCS_SIMD
7275 || pcum->pcs_variant == ARM_PCS_SVE)
7277 aarch64_layout_arg (pcum_v, arg);
7278 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7279 != (pcum->aapcs_stack_words != 0));
7280 if (pcum->aapcs_reg
7281 && aarch64_call_switches_pstate_sm (pcum->isa_mode))
7282 aarch64_record_sme_mode_switch_args (pcum);
7284 pcum->aapcs_arg_processed = false;
7285 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7286 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7287 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7288 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7289 pcum->aapcs_stack_words = 0;
7290 pcum->aapcs_reg = NULL_RTX;
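/* Return true if REGNO is used for passing arguments: one of the first
   NUM_ARG_REGS general registers, the first NUM_FP_ARG_REGS FP/SIMD
   registers, or the first NUM_PR_ARG_REGS predicate registers.  */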
7294 bool
7295 aarch64_function_arg_regno_p (unsigned regno)
7297 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7298 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7299 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7302 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7303 PARM_BOUNDARY bits of alignment, but will be given anything up
7304 to STACK_BOUNDARY bits if the type requires it. This makes sure
7305 that both before and after the layout of each argument, the Next
7306 Stacked Argument Address (NSAA) will have a minimum alignment of
7307 8 bytes. */
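/* For example, assuming the usual AArch64 values of PARM_BOUNDARY (64)
   and STACK_BOUNDARY (128): a 4-byte-aligned int is raised to 64 bits,
   while a 32-byte over-aligned struct is capped at 128 bits, so no
   stack argument ever needs more than 16-byte alignment.  */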
7309 static unsigned int
7310 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7312 unsigned int abi_break_gcc_9;
7313 unsigned int abi_break_gcc_13;
7314 unsigned int abi_break_gcc_14;
7315 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7316 &abi_break_gcc_9,
7317 &abi_break_gcc_13,
7318 &abi_break_gcc_14);
7319 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7320 to emit warnings about ABI incompatibility. */
7321 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7322 return alignment;
7325 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7327 static fixed_size_mode
7328 aarch64_get_reg_raw_mode (int regno)
7330 /* Don't use any non-GP registers for __builtin_apply and
7331 __builtin_return if general registers only mode is requested. */
7332 if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
7333 return as_a <fixed_size_mode> (VOIDmode);
7334 if (TARGET_SVE && FP_REGNUM_P (regno))
7335 /* Don't use the SVE part of the register for __builtin_apply and
7336 __builtin_return. The SVE registers aren't used by the normal PCS,
7337 so using them there would be a waste of time. The PCS extensions
7338 for SVE types are fundamentally incompatible with the
7339 __builtin_return/__builtin_apply interface. */
7340 return as_a <fixed_size_mode> (V16QImode);
7341 if (PR_REGNUM_P (regno))
7342 /* For SVE PR regs, indicate that they should be ignored for
7343 __builtin_apply/__builtin_return. */
7344 return as_a <fixed_size_mode> (VOIDmode);
7345 return default_get_reg_raw_mode (regno);
7348 /* Implement TARGET_FUNCTION_ARG_PADDING.
7350 Small aggregate types are placed at the lowest memory address.
7352 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7354 static pad_direction
7355 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7357 /* On little-endian targets, the least significant byte of every stack
7358 argument is passed at the lowest byte address of the stack slot. */
7359 if (!BYTES_BIG_ENDIAN)
7360 return PAD_UPWARD;
7362 /* Otherwise, integral, floating-point and pointer types are padded downward:
7363 the least significant byte of a stack argument is passed at the highest
7364 byte address of the stack slot. */
7365 if (type
7366 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7367 || POINTER_TYPE_P (type))
7368 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7369 return PAD_DOWNWARD;
7371 /* Everything else padded upward, i.e. data in first byte of stack slot. */
7372 return PAD_UPWARD;
7375 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7377 It specifies padding for the last (may also be the only)
7378 element of a block move between registers and memory. Assuming
7379 the block is in memory, padding upward means that the last
7380 element is padded after its most significant byte, while with
7381 downward padding the last element is padded on its least
7382 significant byte side.
7384 Small aggregates and small complex types are always padded
7385 upwards.
7387 We don't need to worry about homogeneous floating-point or
7388 short-vector aggregates; their move is not affected by the
7389 padding direction determined here. Regardless of endianness,
7390 each element of such an aggregate is put in the least
7391 significant bits of an fp/simd register.
7393 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7394 register has useful data, and return the opposite if the most
7395 significant byte does. */
7397 bool
7398 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7399 bool first ATTRIBUTE_UNUSED)
7402 /* Aside from pure scalable types, small composite types are always
7403 padded upward. */
7404 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7406 HOST_WIDE_INT size;
7407 if (type)
7408 size = int_size_in_bytes (type);
7409 else
7410 /* No frontends can create types with variable-sized modes, so we
7411 shouldn't be asked to pass or return them. */
7412 size = GET_MODE_SIZE (mode).to_constant ();
7413 if (size < 2 * UNITS_PER_WORD)
7415 pure_scalable_type_info pst_info;
7416 if (pst_info.analyze_registers (type))
7417 return false;
7418 return true;
7422 /* Otherwise, use the default padding. */
7423 return !BYTES_BIG_ENDIAN;
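/* Implement TARGET_LIBGCC_CMP_RETURN_MODE: libgcc comparison routines
   return their result in SImode.  */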
7426 static scalar_int_mode
7427 aarch64_libgcc_cmp_return_mode (void)
7429 return SImode;
7432 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7434 /* We use the 12-bit shifted immediate arithmetic instructions so values
7435 must be a multiple of (1 << 12), i.e. 4096. */
7436 #define ARITH_FACTOR 4096
7438 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7439 #error Cannot use simple address calculation for stack probing
7440 #endif
7442 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7443 inclusive. These are offsets from the current stack pointer. */
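/* An illustrative summary of the strategies below, assuming the default
   PROBE_INTERVAL of 4096 bytes (the exact registers and offsets are
   chosen by the code itself):

     - SIZE <= 4096: a single probe at SP - (FIRST + SIZE).
     - SIZE <= 16384: an unrolled sequence of one probe per 4096-byte
       interval, plus a probe for any residual.
     - larger constant sizes: the probe_stack_range loop, which walks a
       temporary pointer down in 4096-byte steps.

   Variable (SVE-sized) allocations are not handled here and trigger a
   sorry () instead.  */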
7445 static void
7446 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7448 HOST_WIDE_INT size;
7449 if (!poly_size.is_constant (&size))
7451 sorry ("stack probes for SVE frames");
7452 return;
7455 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7457 /* See the same assertion on PROBE_INTERVAL above. */
7458 gcc_assert ((first % ARITH_FACTOR) == 0);
7460 /* See if we have a constant small number of probes to generate. If so,
7461 that's the easy case. */
7462 if (size <= PROBE_INTERVAL)
7464 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7466 emit_set_insn (reg1,
7467 plus_constant (Pmode,
7468 stack_pointer_rtx, -(first + base)));
7469 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7472 /* The run-time loop is made up of 8 insns in the generic case while the
7473 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
7474 else if (size <= 4 * PROBE_INTERVAL)
7476 HOST_WIDE_INT i, rem;
7478 emit_set_insn (reg1,
7479 plus_constant (Pmode,
7480 stack_pointer_rtx,
7481 -(first + PROBE_INTERVAL)));
7482 emit_stack_probe (reg1);
7484 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7485 it exceeds SIZE. If only two probes are needed, this will not
7486 generate any code. Then probe at FIRST + SIZE. */
7487 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7489 emit_set_insn (reg1,
7490 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7491 emit_stack_probe (reg1);
7494 rem = size - (i - PROBE_INTERVAL);
7495 if (rem > 256)
7497 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7499 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7500 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7502 else
7503 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7506 /* Otherwise, do the same as above, but in a loop. Note that we must be
7507 extra careful with variables wrapping around because we might be at
7508 the very top (or the very bottom) of the address space and we have
7509 to be able to handle this case properly; in particular, we use an
7510 equality test for the loop condition. */
7511 else
7513 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7515 /* Step 1: round SIZE to the previous multiple of the interval. */
7517 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7520 /* Step 2: compute initial and final value of the loop counter. */
7522 /* TEST_ADDR = SP + FIRST. */
7523 emit_set_insn (reg1,
7524 plus_constant (Pmode, stack_pointer_rtx, -first));
7526 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7527 HOST_WIDE_INT adjustment = - (first + rounded_size);
7528 if (! aarch64_uimm12_shift (adjustment))
7530 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7531 true, Pmode);
7532 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7534 else
7535 emit_set_insn (reg2,
7536 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7538 /* Step 3: the loop
7540 do
7541 {
7542 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7543 probe at TEST_ADDR
7544 }
7545 while (TEST_ADDR != LAST_ADDR)
7547 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7548 until it is equal to ROUNDED_SIZE. */
7550 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7553 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7554 that SIZE is equal to ROUNDED_SIZE. */
7556 if (size != rounded_size)
7558 HOST_WIDE_INT rem = size - rounded_size;
7560 if (rem > 256)
7562 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7564 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7565 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7567 else
7568 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7572 /* Make sure nothing is scheduled before we are done. */
7573 emit_insn (gen_blockage ());
7576 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7577 absolute addresses. */
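/* The emitted loop looks roughly like this (register numbers and the
   probe offset are illustrative only):

	.LPSRL0:
	sub	x9, x9, 4096		// TEST_ADDR -= PROBE_INTERVAL
	str	xzr, [x9, 0]		// probe
	cmp	x9, x10			// reached LAST_ADDR?
	b.ne	.LPSRL0

   With -fstack-clash-protection the step is the guard size and the
   probe is written STACK_CLASH_CALLER_GUARD bytes above TEST_ADDR
   rather than at offset 0.  */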
7579 const char *
7580 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7582 static int labelno = 0;
7583 char loop_lab[32];
7584 rtx xops[2];
7586 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7588 /* Loop. */
7589 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7591 HOST_WIDE_INT stack_clash_probe_interval
7592 = 1 << param_stack_clash_protection_guard_size;
7594 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7595 xops[0] = reg1;
7596 HOST_WIDE_INT interval;
7597 if (flag_stack_clash_protection)
7598 interval = stack_clash_probe_interval;
7599 else
7600 interval = PROBE_INTERVAL;
7602 gcc_assert (aarch64_uimm12_shift (interval));
7603 xops[1] = GEN_INT (interval);
7605 output_asm_insn ("sub\t%0, %0, %1", xops);
7607 /* If doing stack clash protection then we probe up by the ABI-specified
7608 amount. We do this because we're dropping full pages at a time in the
7609 loop. But for non-stack-clash probing, probe at offset 0 from SP. */
7610 if (flag_stack_clash_protection)
7611 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7612 else
7613 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7615 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7616 by this amount for each iteration. */
7617 output_asm_insn ("str\txzr, [%0, %1]", xops);
7619 /* Test if TEST_ADDR == LAST_ADDR. */
7620 xops[1] = reg2;
7621 output_asm_insn ("cmp\t%0, %1", xops);
7623 /* Branch. */
7624 fputs ("\tb.ne\t", asm_out_file);
7625 assemble_name_raw (asm_out_file, loop_lab);
7626 fputc ('\n', asm_out_file);
7628 return "";
7631 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7632 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7633 of GUARD_SIZE. When a probe is emitted it is done at most
7634 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7635 at most MIN_PROBE_THRESHOLD. By the end of this function
7636 BASE = BASE - ADJUSTMENT. */
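/* A sketch of the sequence emitted below, with illustrative register
   names standing in for BASE and ADJUSTMENT:

	.SVLPSPL0:
	cmp	x11, RESIDUAL_PROBE_GUARD
	b.lt	.SVLPEND0
	sub	x10, x10, RESIDUAL_PROBE_GUARD
	str	xzr, [x10, 0]
	sub	x11, x11, RESIDUAL_PROBE_GUARD
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11	// BASE -= remaining ADJUSTMENT  */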
7638 const char *
7639 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7640 rtx min_probe_threshold, rtx guard_size)
7642 /* This function is not allowed to use any instruction generation function
7643 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7644 so instead emit the code you want using output_asm_insn. */
7645 gcc_assert (flag_stack_clash_protection);
7646 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7647 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7649 /* The minimum required allocation before the residual requires probing. */
7650 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7652 /* Clamp the value down to the nearest value that can be used with a cmp. */
7653 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7654 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7656 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7657 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7659 static int labelno = 0;
7660 char loop_start_lab[32];
7661 char loop_end_lab[32];
7662 rtx xops[2];
7664 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7665 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7667 /* Emit loop start label. */
7668 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7670 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7671 xops[0] = adjustment;
7672 xops[1] = probe_offset_value_rtx;
7673 output_asm_insn ("cmp\t%0, %1", xops);
7675 /* Branch to end if not enough adjustment to probe. */
7676 fputs ("\tb.lt\t", asm_out_file);
7677 assemble_name_raw (asm_out_file, loop_end_lab);
7678 fputc ('\n', asm_out_file);
7680 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7681 xops[0] = base;
7682 xops[1] = probe_offset_value_rtx;
7683 output_asm_insn ("sub\t%0, %0, %1", xops);
7685 /* Probe at BASE. */
7686 xops[1] = const0_rtx;
7687 output_asm_insn ("str\txzr, [%0, %1]", xops);
7689 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7690 xops[0] = adjustment;
7691 xops[1] = probe_offset_value_rtx;
7692 output_asm_insn ("sub\t%0, %0, %1", xops);
7694 /* Branch to start if still more bytes to allocate. */
7695 fputs ("\tb\t", asm_out_file);
7696 assemble_name_raw (asm_out_file, loop_start_lab);
7697 fputc ('\n', asm_out_file);
7699 /* No probe needed for the remaining adjustment; leave the loop. */
7700 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7702 /* BASE = BASE - ADJUSTMENT. */
7703 xops[0] = base;
7704 xops[1] = adjustment;
7705 output_asm_insn ("sub\t%0, %0, %1", xops);
7706 return "";
7709 /* Determine whether a frame chain needs to be generated. */
7710 static bool
7711 aarch64_needs_frame_chain (void)
7713 if (frame_pointer_needed)
7714 return true;
7716 /* A leaf function cannot have calls or write LR. */
7717 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7719 /* Don't use a frame chain in leaf functions if leaf frame pointers
7720 are disabled. */
7721 if (flag_omit_leaf_frame_pointer && is_leaf)
7722 return false;
7724 return aarch64_use_frame_pointer;
7727 /* Return true if the current function should save registers above
7728 the locals area, rather than below it. */
7730 static bool
7731 aarch64_save_regs_above_locals_p ()
7733 /* When using stack smashing protection, make sure that the canary slot
7734 comes between the locals and the saved registers. Otherwise,
7735 it would be possible for a carefully sized smash attack to change
7736 the saved registers (particularly LR and FP) without reaching the
7737 canary. */
7738 return crtl->stack_protect_guard;
7741 /* Return true if the current function needs to record the incoming
7742 value of PSTATE.SM. */
7743 static bool
7744 aarch64_need_old_pstate_sm ()
7746 /* Exit early if the incoming value of PSTATE.SM is known at
7747 compile time. */
7748 if (aarch64_cfun_incoming_pstate_sm () != 0)
7749 return false;
7751 if (aarch64_cfun_enables_pstate_sm ())
7752 return true;
7754 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
7755 but the function needs to return with PSTATE.SM unchanged. */
7756 if (nonlocal_goto_handler_labels)
7757 return true;
7759 /* Likewise for exception handlers. */
7760 eh_landing_pad lp;
7761 for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
7762 if (lp && lp->post_landing_pad)
7763 return true;
7765 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
7766 streaming-compatible functions without SME being available, so PSTATE.SM
7767 should only be changed if it is currently set to one. */
7768 if (crtl->has_nonlocal_goto)
7769 return true;
7771 if (cfun->machine->call_switches_pstate_sm)
7772 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
7773 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
7774 if (!SIBLING_CALL_P (call))
7776 /* Return true if there is a call to a non-streaming-compatible
7777 function. */
7778 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
7779 if (aarch64_call_switches_pstate_sm (callee_isa_mode))
7780 return true;
7782 return false;
7785 /* Mark the registers that need to be saved by the callee and calculate
7786 the size of the callee-saved registers area and frame record (both FP
7787 and LR may be omitted). */
7788 static void
7789 aarch64_layout_frame (void)
7791 unsigned regno, last_fp_reg = INVALID_REGNUM;
7792 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7793 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7794 bool frame_related_fp_reg_p = false;
7795 aarch64_frame &frame = cfun->machine->frame;
7796 poly_int64 top_of_locals = -1;
7797 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
7799 vec_safe_truncate (frame.saved_gprs, 0);
7800 vec_safe_truncate (frame.saved_fprs, 0);
7801 vec_safe_truncate (frame.saved_prs, 0);
7803 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7805 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7806 the mid-end is doing. */
7807 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7809 #define SLOT_NOT_REQUIRED (-2)
7810 #define SLOT_REQUIRED (-1)
7812 frame.wb_push_candidate1 = INVALID_REGNUM;
7813 frame.wb_push_candidate2 = INVALID_REGNUM;
7814 frame.spare_pred_reg = INVALID_REGNUM;
7816 /* First mark all the registers that really need to be saved... */
7817 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7818 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
7819 frame.old_svcr_offset = SLOT_NOT_REQUIRED;
7821 /* ... that includes the eh data registers (if needed)... */
7822 if (crtl->calls_eh_return)
7823 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
7824 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
7826 /* ... and any callee saved register that dataflow says is live. */
7827 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7828 if (df_regs_ever_live_p (regno)
7829 && !fixed_regs[regno]
7830 && (regno == R30_REGNUM
7831 || !crtl->abi->clobbers_full_reg_p (regno)))
7832 frame.reg_offset[regno] = SLOT_REQUIRED;
7834 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7835 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7836 && !fixed_regs[regno]
7837 && !crtl->abi->clobbers_full_reg_p (regno))
7839 frame.reg_offset[regno] = SLOT_REQUIRED;
7840 last_fp_reg = regno;
7841 if (aarch64_emit_cfi_for_reg_p (regno))
7842 frame_related_fp_reg_p = true;
7845 /* Big-endian SVE frames need a spare predicate register in order
7846 to save Z8-Z15. Decide which register they should use. Prefer
7847 an unused argument register if possible, so that we don't force P4
7848 to be saved unnecessarily. */
7849 if (frame_related_fp_reg_p
7850 && crtl->abi->id () == ARM_PCS_SVE
7851 && BYTES_BIG_ENDIAN)
7853 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7854 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7855 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7856 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7857 break;
7858 gcc_assert (regno <= P7_REGNUM);
7859 frame.spare_pred_reg = regno;
7860 df_set_regs_ever_live (regno, true);
7863 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7864 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7865 && !fixed_regs[regno]
7866 && !crtl->abi->clobbers_full_reg_p (regno))
7867 frame.reg_offset[regno] = SLOT_REQUIRED;
7869 bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
7871 poly_int64 offset = crtl->outgoing_args_size;
7872 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
7873 if (regs_at_top_p)
7875 offset += get_frame_size ();
7876 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7877 top_of_locals = offset;
7879 frame.bytes_below_saved_regs = offset;
7880 frame.sve_save_and_probe = INVALID_REGNUM;
7882 /* Now assign stack slots for the registers. Start with the predicate
7883 registers, since predicate LDR and STR have a relatively small
7884 offset range. These saves happen below the hard frame pointer. */
7885 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7886 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7888 vec_safe_push (frame.saved_prs, regno);
7889 if (frame.sve_save_and_probe == INVALID_REGNUM)
7890 frame.sve_save_and_probe = regno;
7891 frame.reg_offset[regno] = offset;
7892 offset += BYTES_PER_SVE_PRED;
7895 poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
7896 if (maybe_ne (saved_prs_size, 0))
7898 /* If we have any vector registers to save above the predicate registers,
7899 the offset of the vector register save slots needs to be a multiple
7900 of the vector size. This lets us use the immediate forms of LDR/STR
7901 (or LD1/ST1 for big-endian).
7903 A vector register is 8 times the size of a predicate register,
7904 and we need to save a maximum of 12 predicate registers, so the
7905 first vector register will be at either #1, MUL VL or #2, MUL VL.
7907 If we don't have any vector registers to save, and we know how
7908 big the predicate save area is, we can just round it up to the
7909 next 16-byte boundary. */
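/* A worked example: a predicate save slot is one eighth of a vector in
   size, so saving all 12 callee-saved predicate registers gives
   saved_prs_size == 1.5 * vector_save_size.  That is more than one but
   no more than two vector lengths, so the first vector save slot lands
   at #2, MUL VL from the bottom of the save area.  */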
7910 if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
7911 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7912 else
7914 if (known_le (saved_prs_size, vector_save_size))
7915 offset = frame.bytes_below_saved_regs + vector_save_size;
7916 else if (known_le (saved_prs_size, vector_save_size * 2))
7917 offset = frame.bytes_below_saved_regs + vector_save_size * 2;
7918 else
7919 gcc_unreachable ();
7923 /* If we need to save any SVE vector registers, add them next. */
7924 if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7925 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7926 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7928 vec_safe_push (frame.saved_fprs, regno);
7929 if (frame.sve_save_and_probe == INVALID_REGNUM)
7930 frame.sve_save_and_probe = regno;
7931 frame.reg_offset[regno] = offset;
7932 offset += vector_save_size;
7935 /* OFFSET is now the offset of the hard frame pointer from the bottom
7936 of the callee save area. */
7937 auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
7938 bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
7939 gcc_assert (!saves_below_hard_fp_p
7940 || (frame.sve_save_and_probe != INVALID_REGNUM
7941 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
7942 frame.bytes_below_saved_regs)));
7944 frame.bytes_below_hard_fp = offset;
7945 frame.hard_fp_save_and_probe = INVALID_REGNUM;
7947 auto allocate_gpr_slot = [&](unsigned int regno)
7949 vec_safe_push (frame.saved_gprs, regno);
7950 frame.reg_offset[regno] = offset;
7951 offset += UNITS_PER_WORD;
7954 if (frame.emit_frame_chain)
7956 /* FP and LR are placed in the linkage record. */
7957 allocate_gpr_slot (R29_REGNUM);
7958 allocate_gpr_slot (R30_REGNUM);
7960 else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
7961 && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
7962 /* Put the LR save slot first, since it makes a good choice of probe
7963 for stack clash purposes. The idea is that the link register usually
7964 has to be saved before a call anyway, and so we lose little by
7965 stopping it from being individually shrink-wrapped. */
7966 allocate_gpr_slot (R30_REGNUM);
7968 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7969 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7970 allocate_gpr_slot (regno);
7972 if (aarch64_need_old_pstate_sm ())
7974 frame.old_svcr_offset = offset;
7975 offset += UNITS_PER_WORD;
7978 /* If the current function changes the SVE vector length, ensure that the
7979 old value of the DWARF VG register is saved and available in the CFI,
7980 so that outer frames with VL-sized offsets can be processed correctly. */
7981 if (cfun->machine->call_switches_pstate_sm
7982 || aarch64_cfun_enables_pstate_sm ())
7984 frame.reg_offset[VG_REGNUM] = offset;
7985 offset += UNITS_PER_WORD;
7988 poly_int64 max_int_offset = offset;
7989 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7990 bool has_align_gap = maybe_ne (offset, max_int_offset);
7992 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7993 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7995 vec_safe_push (frame.saved_fprs, regno);
7996 /* If there is an alignment gap between integer and fp callee-saves,
7997 allocate the last fp register to it if possible. */
7998 if (regno == last_fp_reg
7999 && has_align_gap
8000 && known_eq (vector_save_size, 8)
8001 && multiple_p (offset, 16))
8003 frame.reg_offset[regno] = max_int_offset;
8004 break;
8007 frame.reg_offset[regno] = offset;
8008 offset += vector_save_size;
8011 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8012 auto saved_regs_size = offset - frame.bytes_below_saved_regs;
8014 array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
8015 ? frame.saved_gprs
8016 : frame.saved_fprs);
8017 if (!push_regs.empty ()
8018 && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
8020 frame.hard_fp_save_and_probe = push_regs[0];
8021 frame.wb_push_candidate1 = push_regs[0];
8022 if (push_regs.size () > 1)
8023 frame.wb_push_candidate2 = push_regs[1];
8026 /* With stack-clash, a register must be saved in non-leaf functions.
8027 The saving of the bottommost register counts as an implicit probe,
8028 which allows us to maintain the invariant described in the comment
8029 at expand_prologue. */
8030 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
8032 if (!regs_at_top_p)
8034 offset += get_frame_size ();
8035 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8036 top_of_locals = offset;
8038 offset += frame.saved_varargs_size;
8039 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8040 frame.frame_size = offset;
8042 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
8043 gcc_assert (known_ge (top_of_locals, 0));
8044 frame.bytes_above_locals = frame.frame_size - top_of_locals;
8046 frame.initial_adjust = 0;
8047 frame.final_adjust = 0;
8048 frame.callee_adjust = 0;
8049 frame.sve_callee_adjust = 0;
8051 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8052 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8054 /* The shadow call stack is only used for functions that push LR onto
8055 the stack and that do not specify the "no_sanitize" attribute with
8056 the argument "shadow-call-stack". */
8057 frame.is_scs_enabled
8058 = (!crtl->calls_eh_return
8059 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8060 && known_ge (frame.reg_offset[LR_REGNUM], 0));
8062 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8063 restore x30, and we don't need to pop x30 again in the traditional
8064 way. Pop candidates record the registers that need to be popped
8065 eventually. */
8066 if (frame.is_scs_enabled)
8068 if (frame.wb_pop_candidate2 == R30_REGNUM)
8069 frame.wb_pop_candidate2 = INVALID_REGNUM;
8070 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8071 frame.wb_pop_candidate1 = INVALID_REGNUM;
8074 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8075 256 to ensure that the offset meets the requirements of emit_move_insn.
8076 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8077 max_push_offset to 0, because no registers are popped at this time,
8078 so callee_adjust cannot be adjusted. */
8079 HOST_WIDE_INT max_push_offset = 0;
8080 if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8082 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8083 max_push_offset = 512;
8084 else
8085 max_push_offset = 256;
8088 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
8089 HOST_WIDE_INT const_saved_regs_size;
8090 if (known_eq (saved_regs_size, 0))
8091 frame.initial_adjust = frame.frame_size;
8092 else if (frame.frame_size.is_constant (&const_size)
8093 && const_size < max_push_offset
8094 && known_eq (frame.bytes_above_hard_fp, const_size))
8096 /* Simple, small frame with no data below the saved registers.
8098 stp reg1, reg2, [sp, -frame_size]!
8099 stp reg3, reg4, [sp, 16] */
8100 frame.callee_adjust = const_size;
8102 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
8103 && saved_regs_size.is_constant (&const_saved_regs_size)
8104 && const_below_saved_regs + const_saved_regs_size < 512
8105 /* We could handle this case even with data below the saved
8106 registers, provided that that data left us with valid offsets
8107 for all predicate and vector save slots. It's such a rare
8108 case that it hardly seems worth the effort though. */
8109 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8110 && !(cfun->calls_alloca
8111 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8112 && const_above_fp < max_push_offset))
8114 /* Frame with small area below the saved registers:
8116 sub sp, sp, frame_size
8117 stp reg1, reg2, [sp, bytes_below_saved_regs]
8118 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8119 frame.initial_adjust = frame.frame_size;
8121 else if (saves_below_hard_fp_p
8122 && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8124 /* Frame in which all saves are SVE saves:
8126 sub sp, sp, frame_size - bytes_below_saved_regs
8127 save SVE registers relative to SP
8128 sub sp, sp, bytes_below_saved_regs */
8129 frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8130 frame.final_adjust = frame.bytes_below_saved_regs;
8132 else if (frame.wb_push_candidate1 != INVALID_REGNUM
8133 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8134 && const_above_fp < max_push_offset)
8136 /* Frame with large area below the saved registers, or with SVE saves,
8137 but with a small area above:
8139 stp reg1, reg2, [sp, -hard_fp_offset]!
8140 stp reg3, reg4, [sp, 16]
8141 [sub sp, sp, below_hard_fp_saved_regs_size]
8142 [save SVE registers relative to SP]
8143 sub sp, sp, bytes_below_saved_regs */
8144 frame.callee_adjust = const_above_fp;
8145 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8146 frame.final_adjust = frame.bytes_below_saved_regs;
8148 else
8150 /* General case:
8152 sub sp, sp, hard_fp_offset
8153 stp x29, x30, [sp, 0]
8154 add x29, sp, 0
8155 stp reg3, reg4, [sp, 16]
8156 [sub sp, sp, below_hard_fp_saved_regs_size]
8157 [save SVE registers relative to SP]
8158 sub sp, sp, bytes_below_saved_regs */
8159 frame.initial_adjust = frame.bytes_above_hard_fp;
8160 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8161 frame.final_adjust = frame.bytes_below_saved_regs;
8164 /* The frame is allocated in pieces, with each non-final piece
8165 including a register save at offset 0 that acts as a probe for
8166 the following piece. In addition, the save of the bottommost register
8167 acts as a probe for callees and allocas. Roll back any probes that
8168 aren't needed.
8170 A probe isn't needed if it is associated with the final allocation
8171 (including callees and allocas) that happens before the epilogue is
8172 executed. */
8173 if (crtl->is_leaf
8174 && !cfun->calls_alloca
8175 && known_eq (frame.final_adjust, 0))
8177 if (maybe_ne (frame.sve_callee_adjust, 0))
8178 frame.sve_save_and_probe = INVALID_REGNUM;
8179 else
8180 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8183 /* Make sure the individual adjustments add up to the full frame size. */
8184 gcc_assert (known_eq (frame.initial_adjust
8185 + frame.callee_adjust
8186 + frame.sve_callee_adjust
8187 + frame.final_adjust, frame.frame_size));
8189 if (frame.callee_adjust == 0)
8191 /* We've decided not to do a "real" push and pop. However,
8192 setting up the frame chain is treated as being essentially
8193 a multi-instruction push. */
8194 frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
8195 if (!frame.emit_frame_chain)
8196 frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
8199 frame.laid_out = true;
8202 /* Return true if the register REGNO is saved on entry to
8203 the current function. */
8205 static bool
8206 aarch64_register_saved_on_entry (int regno)
8208 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8211 /* Push the register number REGNO of mode MODE to the stack with write-back
8212 adjusting the stack by ADJUSTMENT. */
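/* For example, pushing x30 with an ADJUSTMENT of 16 corresponds to the
   single pre-indexed store  str x30, [sp, -16]!  (illustrative).  */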
8214 static void
8215 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8216 HOST_WIDE_INT adjustment)
8218 rtx base_rtx = stack_pointer_rtx;
8219 rtx insn, reg, mem;
8221 reg = gen_rtx_REG (mode, regno);
8222 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8223 plus_constant (Pmode, base_rtx, -adjustment));
8224 mem = gen_frame_mem (mode, mem);
8226 insn = emit_move_insn (mem, reg);
8227 RTX_FRAME_RELATED_P (insn) = 1;
8230 /* Generate and return an instruction to store the pair of registers
8231 REG and REG2 of mode MODE to location BASE with write-back adjusting
8232 the stack location BASE by ADJUSTMENT. */
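/* The parallel built here corresponds to a pre-indexed store pair such
   as  stp x29, x30, [sp, -ADJUSTMENT]!  (registers illustrative): the
   first SET updates BASE and the other two store REG and REG2.  */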
8234 static rtx
8235 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8236 HOST_WIDE_INT adjustment)
8238 rtx new_base = plus_constant (Pmode, base, -adjustment);
8239 rtx mem = gen_frame_mem (mode, new_base);
8240 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8242 return gen_rtx_PARALLEL (VOIDmode,
8243 gen_rtvec (3,
8244 gen_rtx_SET (base, new_base),
8245 gen_rtx_SET (mem, reg),
8246 gen_rtx_SET (mem2, reg2)));
8249 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8250 stack pointer by ADJUSTMENT. */
8252 static void
8253 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8255 rtx_insn *insn;
8256 machine_mode mode = aarch64_reg_save_mode (regno1);
8258 if (regno2 == INVALID_REGNUM)
8259 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8261 rtx reg1 = gen_rtx_REG (mode, regno1);
8262 rtx reg2 = gen_rtx_REG (mode, regno2);
8264 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8265 reg2, adjustment));
8266 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8267 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8268 RTX_FRAME_RELATED_P (insn) = 1;
8271 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8272 adjusting it by ADJUSTMENT afterwards. */
8274 static rtx
8275 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8276 HOST_WIDE_INT adjustment)
8278 rtx mem = gen_frame_mem (mode, base);
8279 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8280 rtx new_base = plus_constant (Pmode, base, adjustment);
8282 return gen_rtx_PARALLEL (VOIDmode,
8283 gen_rtvec (3,
8284 gen_rtx_SET (base, new_base),
8285 gen_rtx_SET (reg, mem),
8286 gen_rtx_SET (reg2, mem2)));
8289 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8290 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8291 into CFI_OPS. */
8293 static void
8294 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8295 rtx *cfi_ops)
8297 machine_mode mode = aarch64_reg_save_mode (regno1);
8298 rtx reg1 = gen_rtx_REG (mode, regno1);
8300 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8302 if (regno2 == INVALID_REGNUM)
8304 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8305 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8306 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8308 else
8310 rtx reg2 = gen_rtx_REG (mode, regno2);
8311 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8312 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8313 reg2, adjustment));
8317 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8318 for a mem rtx representing the entire pair. */
8320 static machine_mode
8321 aarch64_pair_mode_for_mode (machine_mode mode)
8323 if (known_eq (GET_MODE_SIZE (mode), 4))
8324 return V2x4QImode;
8325 else if (known_eq (GET_MODE_SIZE (mode), 8))
8326 return V2x8QImode;
8327 else if (known_eq (GET_MODE_SIZE (mode), 16))
8328 return V2x16QImode;
8329 else
8330 gcc_unreachable ();
8333 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8334 operand, return an rtx like MEM which instead represents the entire pair. */
8336 static rtx
8337 aarch64_pair_mem_from_base (rtx mem)
8339 auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
8340 mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
8341 gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
8342 return mem;
8345 /* Generate and return a store pair instruction to store REG1 and REG2
8346 into memory starting at BASE_MEM. All three rtxes should have modes of the
8347 same size. */
8349 rtx
8350 aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
8352 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8354 return gen_rtx_SET (pair_mem,
8355 gen_rtx_UNSPEC (GET_MODE (pair_mem),
8356 gen_rtvec (2, reg1, reg2),
8357 UNSPEC_STP));
8360 /* Generate and return a load pair instruction to load a pair of
8361 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8362 UNKNOWN, all three rtxes should have modes of the same size.
8363 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8364 and REG{1,2} should be in DImode. */
8366 rtx
8367 aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
8369 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8371 const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
8372 if (any_extend_p)
8373 gcc_checking_assert (GET_MODE (base_mem) == SImode
8374 && GET_MODE (reg1) == DImode
8375 && GET_MODE (reg2) == DImode);
8376 else
8377 gcc_assert (code == UNKNOWN);
8379 rtx unspecs[2] = {
8380 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
8381 gen_rtvec (1, pair_mem),
8382 UNSPEC_LDP_FST),
8383 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
8384 gen_rtvec (1, copy_rtx (pair_mem)),
8385 UNSPEC_LDP_SND)
8388 if (any_extend_p)
8389 for (int i = 0; i < 2; i++)
8390 unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
8392 return gen_rtx_PARALLEL (VOIDmode,
8393 gen_rtvec (2,
8394 gen_rtx_SET (reg1, unspecs[0]),
8395 gen_rtx_SET (reg2, unspecs[1])));
8398 /* Return TRUE if return address signing should be enabled for the current
8399 function, otherwise return FALSE. */
8401 bool
8402 aarch64_return_address_signing_enabled (void)
8404 /* This function should only be called after the frame is laid out. */
8405 gcc_assert (cfun->machine->frame.laid_out);
8407 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8408 if its LR is pushed onto the stack. */
8409 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8410 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8411 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8414 /* Only used by the arm backend. */
8415 void aarch_bti_arch_check (void)
8418 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8419 bool
8420 aarch_bti_enabled (void)
8422 return (aarch_enable_bti == 1);
8425 /* Check if INSN is a BTI J insn. */
8426 bool
8427 aarch_bti_j_insn_p (rtx_insn *insn)
8429 if (!insn || !INSN_P (insn))
8430 return false;
8432 rtx pat = PATTERN (insn);
8433 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8436 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8437 bool
8438 aarch_pac_insn_p (rtx x)
8440 if (!INSN_P (x))
8441 return false;
8443 subrtx_var_iterator::array_type array;
8444 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8446 rtx sub = *iter;
8447 if (sub && GET_CODE (sub) == UNSPEC)
8449 int unspec_val = XINT (sub, 1);
8450 switch (unspec_val)
8452 case UNSPEC_PACIASP:
8453 case UNSPEC_PACIBSP:
8454 return true;
8456 default:
8457 return false;
8459 iter.skip_subrtxes ();
8462 return false;
8465 rtx aarch_gen_bti_c (void)
8467 return gen_bti_c ();
8470 rtx aarch_gen_bti_j (void)
8472 return gen_bti_j ();
8475 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8476 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8477 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8479 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8480 or LD1D address
8482 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
8483 if the variable isn't already nonnull
8485 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8486 Handle this case using a temporary base register that is suitable for
8487 all offsets in that range. Use ANCHOR_REG as this base register if it
8488 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8490 static inline void
8491 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8492 rtx &anchor_reg, poly_int64 &offset,
8493 rtx &ptrue)
8495 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8497 /* This is the maximum valid offset of the anchor from the base.
8498 Lower values would be valid too. */
8499 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8500 if (!anchor_reg)
8502 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8503 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8504 gen_int_mode (anchor_offset, Pmode)));
8506 base_rtx = anchor_reg;
8507 offset -= anchor_offset;
8509 if (!ptrue)
8511 int pred_reg = cfun->machine->frame.spare_pred_reg;
8512 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8513 CONSTM1_RTX (VNx16BImode));
8514 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
8518 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8519 is saved at BASE + OFFSET. */
8521 static void
8522 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8523 rtx base, poly_int64 offset)
8525 rtx mem = gen_frame_mem (GET_MODE (reg),
8526 plus_constant (Pmode, base, offset));
8527 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8530 /* Emit code to save the callee-saved registers in REGS. Skip any
8531 write-back candidates if SKIP_WB is true, otherwise consider only
8532 write-back candidates.
8534 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8535 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8536 has been set up. */
8538 static void
8539 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8540 array_slice<unsigned int> regs, bool skip_wb,
8541 bool hard_fp_valid_p)
8543 aarch64_frame &frame = cfun->machine->frame;
8544 rtx_insn *insn;
8545 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8547 auto skip_save_p = [&](unsigned int regno)
8549 if (cfun->machine->reg_is_wrapped_separately[regno])
8550 return true;
8552 if (skip_wb == (regno == frame.wb_push_candidate1
8553 || regno == frame.wb_push_candidate2))
8554 return true;
8556 return false;
8559 for (unsigned int i = 0; i < regs.size (); ++i)
8561 unsigned int regno = regs[i];
8562 poly_int64 offset;
8563 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8565 if (skip_save_p (regno))
8566 continue;
8568 machine_mode mode = aarch64_reg_save_mode (regno);
8569 rtx reg = gen_rtx_REG (mode, regno);
8570 rtx move_src = reg;
8571 offset = frame.reg_offset[regno] - bytes_below_sp;
8572 if (regno == VG_REGNUM)
8574 move_src = gen_rtx_REG (DImode, IP0_REGNUM);
8575 emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
8577 rtx base_rtx = stack_pointer_rtx;
8578 poly_int64 sp_offset = offset;
8580 HOST_WIDE_INT const_offset;
8581 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8582 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8583 offset, ptrue);
8584 else if (GP_REGNUM_P (REGNO (reg))
8585 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8587 poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8588 if (hard_fp_valid_p)
8589 base_rtx = hard_frame_pointer_rtx;
8590 else
8592 if (!anchor_reg)
8594 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8595 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8596 gen_int_mode (fp_offset, Pmode)));
8598 base_rtx = anchor_reg;
8600 offset -= fp_offset;
8602 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8603 rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
8604 stack_pointer_rtx,
8605 sp_offset));
8606 rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
8607 bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);
8609 unsigned int regno2;
8610 if (!aarch64_sve_mode_p (mode)
8611 && reg == move_src
8612 && i + 1 < regs.size ()
8613 && (regno2 = regs[i + 1], !skip_save_p (regno2))
8614 && known_eq (GET_MODE_SIZE (mode),
8615 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8617 rtx reg2 = gen_rtx_REG (mode, regno2);
8619 offset += GET_MODE_SIZE (mode);
8620 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8622 rtx cfi_mem2
8623 = gen_frame_mem (mode,
8624 plus_constant (Pmode,
8625 stack_pointer_rtx,
8626 sp_offset + GET_MODE_SIZE (mode)));
8627 rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);
8629 /* The first part of a frame-related parallel insn is always
8630 assumed to be relevant to the frame calculations;
8631 subsequent parts are only frame-related if
8632 explicitly marked. */
8633 if (aarch64_emit_cfi_for_reg_p (regno2))
8634 RTX_FRAME_RELATED_P (cfi_set2) = 1;
8636 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8637 representation of stp cannot be understood directly by
8638 dwarf2cfi. */
8639 rtx par = gen_rtx_PARALLEL (VOIDmode,
8640 gen_rtvec (2, cfi_set, cfi_set2));
8641 add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);
8643 regno = regno2;
8644 ++i;
8646 else
8648 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8650 insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
8651 ptrue, move_src));
8652 need_cfi_note_p = true;
8654 else if (aarch64_sve_mode_p (mode))
8655 insn = emit_insn (gen_rtx_SET (mem, move_src));
8656 else
8657 insn = emit_move_insn (mem, move_src);
8659 if (frame_related_p && (need_cfi_note_p || move_src != reg))
8660 add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
8663 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8665 /* Emit a fake instruction to indicate that the VG save slot has
8666 been initialized. */
8667 if (regno == VG_REGNUM)
8668 emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
8672 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8673 and any other registers that are handled separately. Write the appropriate
8674 REG_CFA_RESTORE notes into CFI_OPS.
8676 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8677 of the static frame. */
8679 static void
8680 aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
8681 array_slice<unsigned int> regs, rtx *cfi_ops)
8683 aarch64_frame &frame = cfun->machine->frame;
8684 poly_int64 offset;
8685 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8687 auto skip_restore_p = [&](unsigned int regno)
8689 if (cfun->machine->reg_is_wrapped_separately[regno])
8690 return true;
8692 if (regno == frame.wb_pop_candidate1
8693 || regno == frame.wb_pop_candidate2)
8694 return true;
8696 /* The shadow call stack code restores LR separately. */
8697 if (frame.is_scs_enabled && regno == LR_REGNUM)
8698 return true;
8700 return false;
8703 for (unsigned int i = 0; i < regs.size (); ++i)
8705 unsigned int regno = regs[i];
8706 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8707 if (skip_restore_p (regno))
8708 continue;
8710 machine_mode mode = aarch64_reg_save_mode (regno);
8711 rtx reg = gen_rtx_REG (mode, regno);
8712 offset = frame.reg_offset[regno] - bytes_below_sp;
8713 rtx base_rtx = stack_pointer_rtx;
8714 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8715 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8716 offset, ptrue);
8717 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8719 unsigned int regno2;
8720 if (!aarch64_sve_mode_p (mode)
8721 && i + 1 < regs.size ()
8722 && (regno2 = regs[i + 1], !skip_restore_p (regno2))
8723 && known_eq (GET_MODE_SIZE (mode),
8724 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8726 rtx reg2 = gen_rtx_REG (mode, regno2);
8728 offset += GET_MODE_SIZE (mode);
8729 emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
8731 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8732 regno = regno2;
8733 ++i;
8735 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8736 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8737 else if (aarch64_sve_mode_p (mode))
8738 emit_insn (gen_rtx_SET (reg, mem));
8739 else
8740 emit_move_insn (reg, mem);
8741 if (frame_related_p)
8742 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8746 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8747 of MODE. */
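/* For example, for MODE == DImode (8 bytes) this accepts any multiple
   of 8 in the range [-64, 56].  */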
8749 static inline bool
8750 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8752 HOST_WIDE_INT multiple;
8753 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8754 && IN_RANGE (multiple, -8, 7));
8757 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8758 of MODE. */
8760 static inline bool
8761 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8763 HOST_WIDE_INT multiple;
8764 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8765 && IN_RANGE (multiple, -32, 31));
8768 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8769 of MODE. */
8771 static inline bool
8772 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8774 HOST_WIDE_INT multiple;
8775 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8776 && IN_RANGE (multiple, 0, 63));
8779 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8780 of MODE. */
8782 bool
8783 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8785 HOST_WIDE_INT multiple;
8786 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8787 && IN_RANGE (multiple, -64, 63));
8790 /* Return true if OFFSET is a signed 9-bit value. */
8792 bool
8793 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8794 poly_int64 offset)
8796 HOST_WIDE_INT const_offset;
8797 return (offset.is_constant (&const_offset)
8798 && IN_RANGE (const_offset, -256, 255));
8801 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8802 of MODE. */
8804 static inline bool
8805 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8807 HOST_WIDE_INT multiple;
8808 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8809 && IN_RANGE (multiple, -256, 255));
8812 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8813 of MODE. */
8815 static inline bool
8816 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8818 HOST_WIDE_INT multiple;
8819 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8820 && IN_RANGE (multiple, 0, 4095));
8823 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8825 static sbitmap
8826 aarch64_get_separate_components (void)
8828 aarch64_frame &frame = cfun->machine->frame;
8829 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8830 bitmap_clear (components);
8832 /* The registers we need saved to the frame. */
8833 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
8834 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8835 if (aarch64_register_saved_on_entry (regno))
8837 /* Disallow shrink wrapping for registers that will be clobbered
8838 by an SMSTART SM in the prologue. */
8839 if (enables_pstate_sm
8840 && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
8841 continue;
8843 /* Punt on saves and restores that use ST1D and LD1D. We could
8844 try to be smarter, but it would involve making sure that the
8845 spare predicate register itself is safe to use at the save
8846 and restore points. Also, when a frame pointer is being used,
8847 the slots are often out of reach of ST1D and LD1D anyway. */
8848 machine_mode mode = aarch64_reg_save_mode (regno);
8849 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8850 continue;
8852 poly_int64 offset = frame.reg_offset[regno];
8854 /* Get the offset relative to the register we'll use. */
8855 if (frame_pointer_needed)
8856 offset -= frame.bytes_below_hard_fp;
8858 /* Check that we can access the stack slot of the register with one
8859 direct load with no adjustments needed. */
8860 if (aarch64_sve_mode_p (mode)
8861 ? offset_9bit_signed_scaled_p (mode, offset)
8862 : offset_12bit_unsigned_scaled_p (mode, offset))
8863 bitmap_set_bit (components, regno);
8866 /* Don't mess with the hard frame pointer. */
8867 if (frame_pointer_needed)
8868 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8870 /* If the spare predicate register used by big-endian SVE code
8871 is call-preserved, it must be saved in the main prologue
8872 before any saves that use it. */
8873 if (frame.spare_pred_reg != INVALID_REGNUM)
8874 bitmap_clear_bit (components, frame.spare_pred_reg);
8876 unsigned reg1 = frame.wb_push_candidate1;
8877 unsigned reg2 = frame.wb_push_candidate2;
8878 /* If registers have been chosen to be stored/restored with
8879 writeback, don't interfere with them, to avoid having to output explicit
8880 stack adjustment instructions. */
8881 if (reg2 != INVALID_REGNUM)
8882 bitmap_clear_bit (components, reg2);
8883 if (reg1 != INVALID_REGNUM)
8884 bitmap_clear_bit (components, reg1);
8886 bitmap_clear_bit (components, LR_REGNUM);
8887 bitmap_clear_bit (components, SP_REGNUM);
8888 if (flag_stack_clash_protection)
8890 if (frame.sve_save_and_probe != INVALID_REGNUM)
8891 bitmap_clear_bit (components, frame.sve_save_and_probe);
8892 if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
8893 bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
8896 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8897 to find one. */
8898 bitmap_clear_bit (components, VG_REGNUM);
8900 return components;
8903 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8905 static sbitmap
8906 aarch64_components_for_bb (basic_block bb)
8908 bitmap in = DF_LIVE_IN (bb);
8909 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8910 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8912 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8913 bitmap_clear (components);
8915 /* Clobbered registers don't generate values in any meaningful sense,
8916 since nothing after the clobber can rely on their value. And we can't
8917 say that partially-clobbered registers are unconditionally killed,
8918 because whether they're killed or not depends on the mode of the
8919 value they're holding. Thus partially call-clobbered registers
8920 appear in neither the kill set nor the gen set.
8922 Check manually for any calls that clobber more of a register than the
8923 current function can. */
8924 function_abi_aggregator callee_abis;
8925 rtx_insn *insn;
8926 FOR_BB_INSNS (bb, insn)
8927 if (CALL_P (insn))
8928 callee_abis.note_callee_abi (insn_callee_abi (insn));
8929 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8931 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8932 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8933 if (!fixed_regs[regno]
8934 && !crtl->abi->clobbers_full_reg_p (regno)
8935 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8936 || bitmap_bit_p (in, regno)
8937 || bitmap_bit_p (gen, regno)
8938 || bitmap_bit_p (kill, regno)))
8940 bitmap_set_bit (components, regno);
8942 /* If there is a callee-save at an adjacent offset, add it as well
8943 to increase the use of LDP/STP. */
8944 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8945 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
8947 if (regno2 <= LAST_SAVED_REGNUM)
8949 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8950 if (regno < regno2
8951 ? known_eq (offset + 8, offset2)
8952 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
8953 bitmap_set_bit (components, regno2);
8957 return components;
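/* For example: if x24 is live in the block and its save slot is at an
   offset that is a multiple of 16, and x25 is saved 8 bytes above it,
   then x25 is added to the set as well, so that the component
   prologue/epilogue code below can save and restore the pair with a
   single STP/LDP.  */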
8960 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8961 Nothing to do for aarch64. */
8963 static void
8964 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8968 /* Return the next set bit in BMP from START onwards. Return the total number
8969 of bits in BMP if no set bit is found at or after START. */
8971 static unsigned int
8972 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8974 unsigned int nbits = SBITMAP_SIZE (bmp);
8975 if (start == nbits)
8976 return start;
8978 gcc_assert (start < nbits);
8979 for (unsigned int i = start; i < nbits; i++)
8980 if (bitmap_bit_p (bmp, i))
8981 return i;
8983 return nbits;
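/* A sketch of the intended usage pattern (this is how
   aarch64_process_components below walks its bitmap):

     for (unsigned int i = aarch64_get_next_set_bit (bmp, 0);
	  i != SBITMAP_SIZE (bmp);
	  i = aarch64_get_next_set_bit (bmp, i + 1))
       ...handle component I...
*/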
8986 /* Do the work for aarch64_emit_prologue_components and
8987 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8988 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
8989 for these components or the epilogue sequence. That is, it determines
8990 whether we should emit stores or loads and what kind of CFA notes to attach
8991 to the insns. Otherwise the logic for the two sequences is very
8992 similar. */
8994 static void
8995 aarch64_process_components (sbitmap components, bool prologue_p)
8997 aarch64_frame &frame = cfun->machine->frame;
8998 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8999 ? HARD_FRAME_POINTER_REGNUM
9000 : STACK_POINTER_REGNUM);
9002 unsigned last_regno = SBITMAP_SIZE (components);
9003 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9004 rtx_insn *insn = NULL;
9006 while (regno != last_regno)
9008 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9009 machine_mode mode = aarch64_reg_save_mode (regno);
9011 rtx reg = gen_rtx_REG (mode, regno);
9012 poly_int64 offset = frame.reg_offset[regno];
9013 if (frame_pointer_needed)
9014 offset -= frame.bytes_below_hard_fp;
9016 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9017 rtx mem = gen_frame_mem (mode, addr);
9019 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9020 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9021 /* No more registers to handle after REGNO.
9022 Emit a single save/restore and exit. */
9023 if (regno2 == last_regno)
9025 insn = emit_insn (set);
9026 if (frame_related_p)
9028 RTX_FRAME_RELATED_P (insn) = 1;
9029 if (prologue_p)
9030 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9031 else
9032 add_reg_note (insn, REG_CFA_RESTORE, reg);
9034 break;
9037 poly_int64 offset2 = frame.reg_offset[regno2];
9038 /* The next register is not of the same class or its offset is not
9039 mergeable with the current one into a pair. */
9040 if (aarch64_sve_mode_p (mode)
9041 || !satisfies_constraint_Ump (mem)
9042 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9043 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9044 || maybe_ne ((offset2 - frame.reg_offset[regno]),
9045 GET_MODE_SIZE (mode)))
9047 insn = emit_insn (set);
9048 if (frame_related_p)
9050 RTX_FRAME_RELATED_P (insn) = 1;
9051 if (prologue_p)
9052 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9053 else
9054 add_reg_note (insn, REG_CFA_RESTORE, reg);
9057 regno = regno2;
9058 continue;
9061 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9063 /* REGNO2 can be saved/restored in a pair with REGNO. */
9064 rtx reg2 = gen_rtx_REG (mode, regno2);
9065 if (frame_pointer_needed)
9066 offset2 -= frame.bytes_below_hard_fp;
9067 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9068 rtx mem2 = gen_frame_mem (mode, addr2);
9069 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9070 : gen_rtx_SET (reg2, mem2);
9072 if (prologue_p)
9073 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
9074 else
9075 insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
9077 if (frame_related_p || frame_related2_p)
9079 RTX_FRAME_RELATED_P (insn) = 1;
9080 if (prologue_p)
9082 if (frame_related_p)
9083 add_reg_note (insn, REG_CFA_OFFSET, set);
9084 if (frame_related2_p)
9085 add_reg_note (insn, REG_CFA_OFFSET, set2);
9087 else
9089 if (frame_related_p)
9090 add_reg_note (insn, REG_CFA_RESTORE, reg);
9091 if (frame_related2_p)
9092 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9096 regno = aarch64_get_next_set_bit (components, regno2 + 1);
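/* As an example of the code emitted above: if COMPONENTS contains x23 and
   x24, their slots are adjacent (say at sp + 32 and sp + 40) and no frame
   pointer is in use, the prologue path emits roughly

     stp x23, x24, [sp, #32]

   with a REG_CFA_OFFSET note for each saved register, and the epilogue
   path emits the matching

     ldp x23, x24, [sp, #32]

   with REG_CFA_RESTORE notes.  */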
9100 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9102 static void
9103 aarch64_emit_prologue_components (sbitmap components)
9105 aarch64_process_components (components, true);
9108 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9110 static void
9111 aarch64_emit_epilogue_components (sbitmap components)
9113 aarch64_process_components (components, false);
9116 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9118 static void
9119 aarch64_set_handled_components (sbitmap components)
9121 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9122 if (bitmap_bit_p (components, regno))
9123 cfun->machine->reg_is_wrapped_separately[regno] = true;
9126 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9127 determine the probe offset for alloca. */
9129 static HOST_WIDE_INT
9130 aarch64_stack_clash_protection_alloca_probe_range (void)
9132 return STACK_CLASH_CALLER_GUARD;
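/* STACK_CLASH_CALLER_GUARD is 1024 on AArch64, i.e. the 1KB ABI-defined
   buffer described above.  */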
9135 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9136 subsequent memory accesses and that requires the stack pointer and REG
9137 to have their current values. REG can be stack_pointer_rtx if no
9138 other register's value needs to be fixed. */
9140 static void
9141 aarch64_emit_stack_tie (rtx reg)
9143 emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
9146 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9147 registers. If POLY_SIZE is not large enough to require a probe this function
9148 will only adjust the stack. When allocating the stack space
9149 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9150 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9151 the saved registers. If we are then we ensure that any allocation
9152 larger than the ABI defined buffer needs a probe so that the
9153 invariant of having a 1KB buffer is maintained.
9155 We emit barriers after each stack adjustment to prevent optimizations from
9156 breaking the invariant that we never drop the stack more than a page. This
9157 invariant is needed to make it easier to correctly handle asynchronous
9158 events, e.g. if we were to allow the stack to be dropped by more than a page
9159 and then have multiple probes up, and we take a signal somewhere in between,
9160 then the signal handler doesn't know the state of the stack and can make no
9161 assumptions about which pages have been probed.
9163 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
9164 is measured relative to the SME vector length instead of the current
9165 prevailing vector length. It is 0 otherwise. */
9167 static void
9168 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9169 poly_int64 poly_size,
9170 aarch64_feature_flags force_isa_mode,
9171 bool frame_related_p,
9172 bool final_adjustment_p)
9174 aarch64_frame &frame = cfun->machine->frame;
9175 HOST_WIDE_INT guard_size
9176 = 1 << param_stack_clash_protection_guard_size;
9177 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9178 HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9179 gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9180 HOST_WIDE_INT min_probe_threshold
9181 = (final_adjustment_p
9182 ? guard_used_by_caller + byte_sp_alignment
9183 : guard_size - guard_used_by_caller);
9184 poly_int64 frame_size = frame.frame_size;
9186 /* We should always have a positive probe threshold. */
9187 gcc_assert (min_probe_threshold > 0);
9189 if (flag_stack_clash_protection && !final_adjustment_p)
9191 poly_int64 initial_adjust = frame.initial_adjust;
9192 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9193 poly_int64 final_adjust = frame.final_adjust;
9195 if (known_eq (frame_size, 0))
9197 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9199 else if (known_lt (initial_adjust + sve_callee_adjust,
9200 guard_size - guard_used_by_caller)
9201 && known_lt (final_adjust, guard_used_by_caller))
9203 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9207 /* If SIZE is not large enough to require probing, just adjust the stack and
9208 exit. */
9209 if (known_lt (poly_size, min_probe_threshold)
9210 || !flag_stack_clash_protection)
9212 aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
9213 frame_related_p);
9214 return;
9217 HOST_WIDE_INT size;
9218 /* Handle the SVE non-constant case first. */
9219 if (!poly_size.is_constant (&size))
9221 if (dump_file)
9223 fprintf (dump_file, "Stack clash SVE prologue: ");
9224 print_dec (poly_size, dump_file);
9225 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9228 /* First calculate the amount of bytes we're actually spilling. */
9229 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9230 poly_size, temp1, temp2, force_isa_mode,
9231 false, true);
9233 rtx_insn *insn = get_last_insn ();
9235 if (frame_related_p)
9237 /* This is done to provide unwinding information for the stack
9238 adjustments we're about to do; however, to prevent the optimizers
9239 from removing the R11 move and leaving the CFA note (which would be
9240 very wrong) we tie the old and new stack pointer together.
9241 The tie will expand to nothing but the optimizers will not touch
9242 the instruction. */
9243 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9244 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9245 aarch64_emit_stack_tie (stack_ptr_copy);
9247 /* We want the CFA independent of the stack pointer for the
9248 duration of the loop. */
9249 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9250 RTX_FRAME_RELATED_P (insn) = 1;
9253 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9254 rtx guard_const = gen_int_mode (guard_size, Pmode);
9256 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9257 stack_pointer_rtx, temp1,
9258 probe_const, guard_const));
9260 /* Now reset the CFA register if needed. */
9261 if (frame_related_p)
9263 add_reg_note (insn, REG_CFA_DEF_CFA,
9264 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9265 gen_int_mode (poly_size, Pmode)));
9266 RTX_FRAME_RELATED_P (insn) = 1;
9269 return;
9272 if (dump_file)
9273 fprintf (dump_file,
9274 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9275 " bytes, probing will be required.\n", size);
9277 /* Round size to the nearest multiple of guard_size, and calculate the
9278 residual as the difference between the original size and the rounded
9279 size. */
9280 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9281 HOST_WIDE_INT residual = size - rounded_size;
9283 /* We can handle a small number of allocations/probes inline. Otherwise
9284 punt to a loop. */
9285 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9287 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9289 aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
9290 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9291 guard_used_by_caller));
9292 emit_insn (gen_blockage ());
9294 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9296 else
9298 /* Compute the ending address. */
9299 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9300 temp1, NULL, force_isa_mode, false, true);
9301 rtx_insn *insn = get_last_insn ();
9303 /* For the initial allocation, we don't have a frame pointer
9304 set up, so we always need CFI notes. If we're doing the
9305 final allocation, then we may have a frame pointer, in which
9306 case it is the CFA, otherwise we need CFI notes.
9308 We can determine which allocation we are doing by looking at
9309 the value of FRAME_RELATED_P since the final allocations are not
9310 frame related. */
9311 if (frame_related_p)
9313 /* We want the CFA independent of the stack pointer for the
9314 duration of the loop. */
9315 add_reg_note (insn, REG_CFA_DEF_CFA,
9316 plus_constant (Pmode, temp1, rounded_size));
9317 RTX_FRAME_RELATED_P (insn) = 1;
9320 /* This allocates and probes the stack. Note that this re-uses some of
9321 the existing Ada stack protection code. However we are guaranteed not
9322 to enter the non-loop or residual branches of that code.
9324 The non-loop part won't be entered because if our allocation amount
9325 doesn't require a loop, the case above would handle it.
9327 The residual amount won't be entered because TEMP1 is a multiple of
9328 the allocation size. The residual will always be 0. As such, the only
9329 part we are actually using from that code is the loop setup. The
9330 actual probing is done in aarch64_output_probe_stack_range. */
9331 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9332 stack_pointer_rtx, temp1));
9334 /* Now reset the CFA register if needed. */
9335 if (frame_related_p)
9337 add_reg_note (insn, REG_CFA_DEF_CFA,
9338 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9339 RTX_FRAME_RELATED_P (insn) = 1;
9342 emit_insn (gen_blockage ());
9343 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9346 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9347 be probed. This maintains the requirement that each page is probed at
9348 least once. For initial probing we probe only if the allocation is
9349 more than GUARD_SIZE - buffer, and below the saved registers we probe
9350 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9351 GUARD_SIZE. This ensures that for any allocation that is large enough to
9352 trigger a probe here, we'll have at least one, and if an allocation is not
9353 large enough for this code to emit anything for it, the page would have been
9354 probed by the saving of FP/LR, either by this function or by any callees. If
9355 we don't have any callees then we won't have more stack adjustments and so
9356 are still safe. */
9357 if (residual)
9359 gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9361 /* If we're doing final adjustments, and we've done any full page
9362 allocations then any residual needs to be probed. */
9363 if (final_adjustment_p && rounded_size != 0)
9364 min_probe_threshold = 0;
9366 aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
9367 if (residual >= min_probe_threshold)
9369 if (dump_file)
9370 fprintf (dump_file,
9371 "Stack clash AArch64 prologue residuals: "
9372 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9373 "\n", residual);
9375 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9376 guard_used_by_caller));
9377 emit_insn (gen_blockage ());
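/* A worked example of the constant-size path, assuming the default 64KB
   guard: for an initial allocation of 132112 bytes, rounded_size is 131072
   (two pages) and the residual is 1040.  The unrolled form above emits
   roughly

     sub sp, sp, #65536
     str xzr, [sp, #1024]
     sub sp, sp, #65536
     str xzr, [sp, #1024]
     sub sp, sp, #1040

   with no probe for the residual, since 1040 is below the initial-probe
   threshold of guard_size - guard_used_by_caller (64512 bytes here).  */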
9382 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9384 void
9385 aarch64_extra_live_on_entry (bitmap regs)
9387 if (TARGET_ZA)
9389 bitmap_set_bit (regs, LOWERING_REGNUM);
9390 bitmap_set_bit (regs, SME_STATE_REGNUM);
9391 bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
9392 bitmap_set_bit (regs, ZA_FREE_REGNUM);
9393 bitmap_set_bit (regs, ZA_SAVED_REGNUM);
9395 /* The only time ZA can't have live contents on entry is when
9396 the function explicitly treats it as a pure output. */
9397 auto za_flags = aarch64_cfun_shared_flags ("za");
9398 if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9399 bitmap_set_bit (regs, ZA_REGNUM);
9401 /* Since ZT0 is call-clobbered, it is only live on input if
9402 it is explicitly shared, and is not a pure output. */
9403 auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
9404 if (zt0_flags != 0
9405 && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9406 bitmap_set_bit (regs, ZT0_REGNUM);
9410 /* Return 1 if the register is used by the epilogue. We need to say the
9411 return register is used, but only after epilogue generation is complete.
9412 Note that in the case of sibcalls, the values "used by the epilogue" are
9413 considered live at the start of the called function. */
9415 int
9416 aarch64_epilogue_uses (int regno)
9418 if (epilogue_completed)
9420 if (regno == LR_REGNUM)
9421 return 1;
9423 if (regno == LOWERING_REGNUM && TARGET_ZA)
9424 return 1;
9425 if (regno == SME_STATE_REGNUM && TARGET_ZA)
9426 return 1;
9427 if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
9428 return 1;
9429 /* If the function shares SME state with its caller, ensure that that
9430 data is not in the lazy save buffer on exit. */
9431 if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
9432 return 1;
9433 if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
9434 return 1;
9435 if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
9436 return 1;
9437 return 0;
9440 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9442 static bool
9443 aarch64_use_late_prologue_epilogue ()
9445 return aarch64_cfun_enables_pstate_sm ();
9448 /* The current function's frame has a save slot for the incoming state
9449 of SVCR. Return a legitimate memory for the slot, based on the hard
9450 frame pointer. */
9452 static rtx
9453 aarch64_old_svcr_mem ()
9455 gcc_assert (frame_pointer_needed
9456 && known_ge (cfun->machine->frame.old_svcr_offset, 0));
9457 rtx base = hard_frame_pointer_rtx;
9458 poly_int64 offset = (0
9459 /* hard fp -> bottom of frame. */
9460 - cfun->machine->frame.bytes_below_hard_fp
9461 /* bottom of frame -> save slot. */
9462 + cfun->machine->frame.old_svcr_offset);
9463 return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
9466 /* The current function's frame has a save slot for the incoming state
9467 of SVCR. Load the slot into register REGNO and return the register. */
9469 static rtx
9470 aarch64_read_old_svcr (unsigned int regno)
9472 rtx svcr = gen_rtx_REG (DImode, regno);
9473 emit_move_insn (svcr, aarch64_old_svcr_mem ());
9474 return svcr;
9477 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9478 load the incoming value of SVCR from its save slot into temporary
9479 register REGNO. */
9481 static rtx_insn *
9482 aarch64_guard_switch_pstate_sm (unsigned int regno,
9483 aarch64_feature_flags local_mode)
9485 rtx old_svcr = aarch64_read_old_svcr (regno);
9486 return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
9489 /* AArch64 stack frames generated by this compiler look like:
9491 +-------------------------------+
9493 | incoming stack arguments |
9495 +-------------------------------+
9496 | | <-- incoming stack pointer (aligned)
9497 | callee-allocated save area |
9498 | for register varargs |
9500 +-------------------------------+
9501 | local variables (1) | <-- frame_pointer_rtx
9503 +-------------------------------+
9504 | padding (1) |
9505 +-------------------------------+
9506 | callee-saved registers |
9507 +-------------------------------+
9508 | LR' |
9509 +-------------------------------+
9510 | FP' |
9511 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9512 | SVE vector registers |
9513 +-------------------------------+
9514 | SVE predicate registers |
9515 +-------------------------------+
9516 | local variables (2) |
9517 +-------------------------------+
9518 | padding (2) |
9519 +-------------------------------+
9520 | dynamic allocation |
9521 +-------------------------------+
9522 | padding |
9523 +-------------------------------+
9524 | outgoing stack arguments | <-- arg_pointer
9526 +-------------------------------+
9527 | | <-- stack_pointer_rtx (aligned)
9529 The regions marked (1) and (2) are mutually exclusive. (2) is used
9530 when aarch64_save_regs_above_locals_p is true.
9532 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9533 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9534 unchanged.
9536 By default for stack-clash we assume the guard is at least 64KB, but this
9537 value is configurable to either 4KB or 64KB. We also force the guard size to
9538 be the same as the probing interval and both values are kept in sync.
9540 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9541 on the guard size) of stack space without probing.
9543 When probing is needed, we emit a probe at the start of the prologue
9544 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9546 We can also use register saves as probes. These are stored in
9547 sve_save_and_probe and hard_fp_save_and_probe.
9549 For outgoing arguments we probe if the size is larger than 1KB, such that
9550 the ABI specified buffer is maintained for the next callee.
9552 The following registers are reserved during frame layout and should not be
9553 used for any other purpose:
9555 - r11: Used by stack clash protection when SVE is enabled, and also
9556 as an anchor register when saving and restoring registers
9557 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9558 - r14 and r15: Used for speculation tracking.
9559 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9560 - r30(LR), r29(FP): Used by standard frame layout.
9562 These registers must be avoided in frame layout related code unless the
9563 explicit intention is to interact with one of the features listed above. */
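/* As a concrete illustration, a small non-SVE frame in which all saves fit
   into the initial STP writeback typically begins and ends with something
   like

     stp x29, x30, [sp, #-32]!   // callee_adjust: push FP/LR, allocate frame
     mov x29, sp                 // establish the frame chain
     ...
     ldp x29, x30, [sp], #32     // pop FP/LR, deallocate frame
     ret

   Larger or variable-sized frames add separate initial/final adjustments
   and, where needed, the stack-clash probes described above.  */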
9565 /* Generate the prologue instructions for entry into a function.
9566 Establish the stack frame by decreasing the stack pointer with a
9567 properly calculated size and, if necessary, create a frame record
9568 filled with the values of LR and previous frame pointer. The
9569 current FP is also set up if it is in use. */
9571 void
9572 aarch64_expand_prologue (void)
9574 aarch64_frame &frame = cfun->machine->frame;
9575 poly_int64 frame_size = frame.frame_size;
9576 poly_int64 initial_adjust = frame.initial_adjust;
9577 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9578 poly_int64 final_adjust = frame.final_adjust;
9579 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9580 unsigned reg1 = frame.wb_push_candidate1;
9581 unsigned reg2 = frame.wb_push_candidate2;
9582 bool emit_frame_chain = frame.emit_frame_chain;
9583 rtx_insn *insn;
9584 aarch64_feature_flags force_isa_mode = 0;
9585 if (aarch64_cfun_enables_pstate_sm ())
9586 force_isa_mode = AARCH64_FL_SM_ON;
9588 if (flag_stack_clash_protection
9589 && known_eq (callee_adjust, 0)
9590 && known_lt (frame.reg_offset[VG_REGNUM], 0))
9592 /* Fold the SVE allocation into the initial allocation.
9593 We don't do this in aarch64_layout_frame to avoid pessimizing
9594 the epilogue code. */
9595 initial_adjust += sve_callee_adjust;
9596 sve_callee_adjust = 0;
9599 /* Sign return address for functions. */
9600 if (aarch64_return_address_signing_enabled ())
9602 switch (aarch64_ra_sign_key)
9604 case AARCH64_KEY_A:
9605 insn = emit_insn (gen_paciasp ());
9606 break;
9607 case AARCH64_KEY_B:
9608 insn = emit_insn (gen_pacibsp ());
9609 break;
9610 default:
9611 gcc_unreachable ();
9613 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9614 RTX_FRAME_RELATED_P (insn) = 1;
9617 /* Push return address to shadow call stack. */
9618 if (frame.is_scs_enabled)
9619 emit_insn (gen_scs_push ());
9621 if (flag_stack_usage_info)
9622 current_function_static_stack_size = constant_lower_bound (frame_size);
9624 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9626 if (crtl->is_leaf && !cfun->calls_alloca)
9628 if (maybe_gt (frame_size, PROBE_INTERVAL)
9629 && maybe_gt (frame_size, get_stack_check_protect ()))
9630 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9631 (frame_size
9632 - get_stack_check_protect ()));
9634 else if (maybe_gt (frame_size, 0))
9635 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9638 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9639 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9641 /* In theory we should never have both an initial adjustment
9642 and a callee save adjustment. Verify that is the case since the
9643 code below does not handle it for -fstack-clash-protection. */
9644 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9646 /* Will only probe if the initial adjustment is larger than the guard
9647 less the amount of the guard reserved for use by the caller's
9648 outgoing args. */
9649 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9650 force_isa_mode, true, false);
9652 if (callee_adjust != 0)
9653 aarch64_push_regs (reg1, reg2, callee_adjust);
9655 /* The offset of the current SP from the bottom of the static frame. */
9656 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9658 if (emit_frame_chain)
9660 /* The offset of the frame chain record (if any) from the current SP. */
9661 poly_int64 chain_offset = (initial_adjust + callee_adjust
9662 - frame.bytes_above_hard_fp);
9663 gcc_assert (known_ge (chain_offset, 0));
9665 gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
9666 if (callee_adjust == 0)
9667 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
9668 false, false);
9669 else
9670 gcc_assert (known_eq (chain_offset, 0));
9671 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9672 stack_pointer_rtx, chain_offset,
9673 tmp1_rtx, tmp0_rtx, force_isa_mode,
9674 frame_pointer_needed);
9675 if (frame_pointer_needed && !frame_size.is_constant ())
9677 /* Variable-sized frames need to describe the save slot
9678 address using DW_CFA_expression rather than DW_CFA_offset.
9679 This means that, without taking further action, the
9680 locations of the registers that we've already saved would
9681 remain based on the stack pointer even after we redefine
9682 the CFA based on the frame pointer. We therefore need new
9683 DW_CFA_expressions to re-express the save slots with addresses
9684 based on the frame pointer. */
9685 rtx_insn *insn = get_last_insn ();
9686 gcc_assert (RTX_FRAME_RELATED_P (insn));
9688 /* Add an explicit CFA definition if this was previously
9689 implicit. */
9690 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9692 rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9693 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9694 gen_rtx_SET (hard_frame_pointer_rtx, src));
9697 /* Change the save slot expressions for the registers that
9698 we've already saved. */
9699 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9700 hard_frame_pointer_rtx, UNITS_PER_WORD);
9701 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9702 hard_frame_pointer_rtx, 0);
9704 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9707 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
9708 emit_frame_chain);
9709 if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
9711 unsigned int saved_regs[] = { VG_REGNUM };
9712 aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
9713 emit_frame_chain);
9715 if (maybe_ne (sve_callee_adjust, 0))
9717 gcc_assert (!flag_stack_clash_protection
9718 || known_eq (initial_adjust, 0)
9719 /* The VG save isn't shrink-wrapped and so serves as
9720 a probe of the initial allocation. */
9721 || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
9722 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9723 sve_callee_adjust,
9724 force_isa_mode,
9725 !frame_pointer_needed, false);
9726 bytes_below_sp -= sve_callee_adjust;
9728 aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
9729 emit_frame_chain);
9730 aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
9731 emit_frame_chain);
9733 /* We may need to probe the final adjustment if it is larger than the guard
9734 that is assumed by the callee. */
9735 gcc_assert (known_eq (bytes_below_sp, final_adjust));
9736 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9737 force_isa_mode,
9738 !frame_pointer_needed, true);
9739 if (emit_frame_chain && maybe_ne (final_adjust, 0))
9740 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9742 /* Save the incoming value of PSTATE.SM, if required. Code further
9743 down does this for locally-streaming functions. */
9744 if (known_ge (frame.old_svcr_offset, 0)
9745 && !aarch64_cfun_enables_pstate_sm ())
9747 rtx mem = aarch64_old_svcr_mem ();
9748 MEM_VOLATILE_P (mem) = 1;
9749 if (TARGET_SME)
9751 rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
9752 emit_insn (gen_aarch64_read_svcr (reg));
9753 emit_move_insn (mem, reg);
9755 else
9757 rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
9758 auto &args = crtl->args.info;
9759 if (args.aapcs_ncrn > 0)
9761 old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
9762 emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
9764 if (args.aapcs_ncrn > 1)
9766 old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
9767 emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
9769 emit_insn (gen_aarch64_get_sme_state ());
9770 emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
9771 if (old_r0)
9772 emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
9773 if (old_r1)
9774 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
9778 /* Enable PSTATE.SM, if required. */
9779 if (aarch64_cfun_enables_pstate_sm ())
9781 rtx_insn *guard_label = nullptr;
9782 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9784 /* The current function is streaming-compatible. Save the
9785 original state of PSTATE.SM. */
9786 rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
9787 emit_insn (gen_aarch64_read_svcr (svcr));
9788 emit_move_insn (aarch64_old_svcr_mem (), svcr);
9789 guard_label = aarch64_guard_switch_pstate_sm (svcr,
9790 aarch64_isa_flags);
9792 aarch64_sme_mode_switch_regs args_switch;
9793 auto &args = crtl->args.info;
9794 for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
9796 rtx x = args.sme_mode_switch_args[i];
9797 args_switch.add_reg (GET_MODE (x), REGNO (x));
9799 args_switch.emit_prologue ();
9800 emit_insn (gen_aarch64_smstart_sm ());
9801 args_switch.emit_epilogue ();
9802 if (guard_label)
9803 emit_label (guard_label);
9807 /* Return TRUE if we can use a simple_return insn.
9809 This function checks whether the callee-saved stack is empty, which
9810 means no restore actions are needed. The pro_and_epilogue pass will use
9811 this to check whether the shrink-wrapping optimization is feasible.
9813 bool
9814 aarch64_use_return_insn_p (void)
9816 if (!reload_completed)
9817 return false;
9819 if (crtl->profile)
9820 return false;
9822 return known_eq (cfun->machine->frame.frame_size, 0);
9825 /* Generate the epilogue instructions for returning from a function.
9826 This is almost exactly the reverse of the prolog sequence, except
9827 that we need to insert barriers to avoid scheduling loads that read
9828 from a deallocated stack, and we optimize the unwind records by
9829 emitting them all together if possible. */
9830 void
9831 aarch64_expand_epilogue (rtx_call_insn *sibcall)
9833 aarch64_frame &frame = cfun->machine->frame;
9834 poly_int64 initial_adjust = frame.initial_adjust;
9835 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9836 poly_int64 final_adjust = frame.final_adjust;
9837 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9838 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
9839 unsigned reg1 = frame.wb_pop_candidate1;
9840 unsigned reg2 = frame.wb_pop_candidate2;
9841 rtx cfi_ops = NULL;
9842 rtx_insn *insn;
9843 /* A stack clash protection prologue may not have left EP0_REGNUM or
9844 EP1_REGNUM in a usable state. The same is true for allocations
9845 with an SVE component, since we then need both temporary registers
9846 for each allocation. For stack clash we are in a usable state if
9847 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9848 HOST_WIDE_INT guard_size
9849 = 1 << param_stack_clash_protection_guard_size;
9850 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9851 aarch64_feature_flags force_isa_mode = 0;
9852 if (aarch64_cfun_enables_pstate_sm ())
9853 force_isa_mode = AARCH64_FL_SM_ON;
9855 /* We can re-use the registers when:
9857 (a) the deallocation amount is the same as the corresponding
9858 allocation amount (which is false if we combine the initial
9859 and SVE callee save allocations in the prologue); and
9861 (b) the allocation amount doesn't need a probe (which is false
9862 if the amount is guard_size - guard_used_by_caller or greater).
9864 In such situations the register should remain live with the correct
9865 value. */
9866 bool can_inherit_p = (initial_adjust.is_constant ()
9867 && final_adjust.is_constant ()
9868 && (!flag_stack_clash_protection
9869 || (known_lt (initial_adjust,
9870 guard_size - guard_used_by_caller)
9871 && known_eq (sve_callee_adjust, 0))));
9873 /* We need to add memory barrier to prevent read from deallocated stack. */
9874 bool need_barrier_p
9875 = maybe_ne (get_frame_size ()
9876 + frame.saved_varargs_size, 0);
9878 /* Reset PSTATE.SM, if required. */
9879 if (aarch64_cfun_enables_pstate_sm ())
9881 rtx_insn *guard_label = nullptr;
9882 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9883 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
9884 aarch64_isa_flags);
9885 aarch64_sme_mode_switch_regs return_switch;
9886 if (sibcall)
9887 return_switch.add_call_args (sibcall);
9888 else if (crtl->return_rtx && REG_P (crtl->return_rtx))
9889 return_switch.add_reg (GET_MODE (crtl->return_rtx),
9890 REGNO (crtl->return_rtx));
9891 return_switch.emit_prologue ();
9892 emit_insn (gen_aarch64_smstop_sm ());
9893 return_switch.emit_epilogue ();
9894 if (guard_label)
9895 emit_label (guard_label);
9898 /* Emit a barrier to prevent loads from a deallocated stack. */
9899 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9900 || cfun->calls_alloca
9901 || crtl->calls_eh_return)
9903 aarch64_emit_stack_tie (stack_pointer_rtx);
9904 need_barrier_p = false;
9907 /* Restore the stack pointer from the frame pointer if it may not
9908 be the same as the stack pointer. */
9909 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9910 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9911 if (frame_pointer_needed
9912 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9913 /* If writeback is used when restoring callee-saves, the CFA
9914 is restored on the instruction doing the writeback. */
9915 aarch64_add_offset (Pmode, stack_pointer_rtx,
9916 hard_frame_pointer_rtx,
9917 -bytes_below_hard_fp + final_adjust,
9918 tmp1_rtx, tmp0_rtx, force_isa_mode,
9919 callee_adjust == 0);
9920 else
9921 /* The case where we need to re-use the register here is very rare, so
9922 avoid the complicated condition and just always emit a move if the
9923 immediate doesn't fit. */
9924 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
9926 /* Restore the vector registers before the predicate registers,
9927 so that we can use P4 as a temporary for big-endian SVE frames. */
9928 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
9929 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
9930 if (maybe_ne (sve_callee_adjust, 0))
9931 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
9932 force_isa_mode, true);
9934 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9935 restore x30, so we don't need to restore x30 again in the traditional
9936 way. */
9937 aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
9938 frame.saved_gprs, &cfi_ops);
9940 if (need_barrier_p)
9941 aarch64_emit_stack_tie (stack_pointer_rtx);
9943 if (callee_adjust != 0)
9944 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9946 /* If we have no register restore information, the CFA must have been
9947 defined in terms of the stack pointer since the end of the prologue. */
9948 gcc_assert (cfi_ops || !frame_pointer_needed);
9950 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9952 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9953 insn = get_last_insn ();
9954 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9955 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9956 RTX_FRAME_RELATED_P (insn) = 1;
9957 cfi_ops = NULL;
9960 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
9961 restrict the emit_move optimization to leaf functions. */
9962 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
9963 (!can_inherit_p || !crtl->is_leaf
9964 || df_regs_ever_live_p (EP0_REGNUM)));
9966 if (cfi_ops)
9968 /* Emit delayed restores and reset the CFA to be SP. */
9969 insn = get_last_insn ();
9970 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9971 REG_NOTES (insn) = cfi_ops;
9972 RTX_FRAME_RELATED_P (insn) = 1;
9975 /* Pop return address from shadow call stack. */
9976 if (frame.is_scs_enabled)
9978 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
9979 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
9981 insn = emit_insn (gen_scs_pop ());
9982 add_reg_note (insn, REG_CFA_RESTORE, reg);
9983 RTX_FRAME_RELATED_P (insn) = 1;
9986 /* Stack adjustment for exception handler. */
9987 if (crtl->calls_eh_return && !sibcall)
9989 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
9990 to unwind the stack and jump to the handler, otherwise
9991 skip this eh_return logic and continue with normal
9992 return after the label. We have already reset the CFA
9993 to be SP; letting the CFA move during this adjustment
9994 is just as correct as retaining the CFA from the body
9995 of the function. Therefore, do nothing special. */
9996 rtx_code_label *label = gen_label_rtx ();
9997 rtx x = aarch64_gen_compare_zero_and_branch (EQ, EH_RETURN_TAKEN_RTX,
9998 label);
9999 rtx jump = emit_jump_insn (x);
10000 JUMP_LABEL (jump) = label;
10001 LABEL_NUSES (label)++;
10002 emit_insn (gen_add2_insn (stack_pointer_rtx,
10003 EH_RETURN_STACKADJ_RTX));
10004 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
10005 emit_barrier ();
10006 emit_label (label);
10009 /* We prefer to emit the combined return/authenticate instruction RETAA,
10010 however there are two cases in which we must instead emit an explicit
10011 authentication instruction.
10013 1) Sibcalls don't return in a normal way, so if we're about to call one
10014 we must authenticate.
10016 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10017 generating code for !TARGET_ARMV8_3 we can't use it and must
10018 explicitly authenticate.
10020 if (aarch64_return_address_signing_enabled ()
10021 && (sibcall || !TARGET_ARMV8_3))
10023 switch (aarch64_ra_sign_key)
10025 case AARCH64_KEY_A:
10026 insn = emit_insn (gen_autiasp ());
10027 break;
10028 case AARCH64_KEY_B:
10029 insn = emit_insn (gen_autibsp ());
10030 break;
10031 default:
10032 gcc_unreachable ();
10034 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10035 RTX_FRAME_RELATED_P (insn) = 1;
10038 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10039 if (!sibcall)
10040 emit_jump_insn (ret_rtx);
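/* For example, with -mbranch-protection=pac-ret the normal return path on
   an Armv8.3-A target ends in a single

     retaa

   whereas for earlier architectures, or before a sibcall, the code above
   emits an explicit

     autiasp
     ret                          // or the tail-call branch

   (the B-key variants being retab/autibsp).  */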
10043 /* Output code to add DELTA to the first argument, and then jump
10044 to FUNCTION. Used for C++ multiple inheritance. */
10045 static void
10046 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10047 HOST_WIDE_INT delta,
10048 HOST_WIDE_INT vcall_offset,
10049 tree function)
10051 /* The this pointer is always in x0. Note that this differs from
10052 Arm where the this pointer may be bumped to r1 if r0 is required
10053 to return a pointer to an aggregate. On AArch64 a result value
10054 pointer will be in x8. */
10055 int this_regno = R0_REGNUM;
10056 rtx this_rtx, temp0, temp1, addr, funexp;
10057 rtx_insn *insn;
10058 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10060 if (aarch_bti_enabled ())
10061 emit_insn (gen_bti_c());
10063 reload_completed = 1;
10064 emit_note (NOTE_INSN_PROLOGUE_END);
10066 this_rtx = gen_rtx_REG (Pmode, this_regno);
10067 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10068 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10070 if (vcall_offset == 0)
10071 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
10072 0, false);
10073 else
10075 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10077 addr = this_rtx;
10078 if (delta != 0)
10080 if (delta >= -256 && delta < 256)
10081 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10082 plus_constant (Pmode, this_rtx, delta));
10083 else
10084 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10085 temp1, temp0, 0, false);
10088 if (Pmode == ptr_mode)
10089 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10090 else
10091 aarch64_emit_move (temp0,
10092 gen_rtx_ZERO_EXTEND (Pmode,
10093 gen_rtx_MEM (ptr_mode, addr)));
10095 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10096 addr = plus_constant (Pmode, temp0, vcall_offset);
10097 else
10099 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10100 Pmode);
10101 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10104 if (Pmode == ptr_mode)
10105 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10106 else
10107 aarch64_emit_move (temp1,
10108 gen_rtx_SIGN_EXTEND (Pmode,
10109 gen_rtx_MEM (ptr_mode, addr)));
10111 emit_insn (gen_add2_insn (this_rtx, temp1));
10114 /* Generate a tail call to the target function. */
10115 if (!TREE_USED (function))
10117 assemble_external (function);
10118 TREE_USED (function) = 1;
10120 funexp = XEXP (DECL_RTL (function), 0);
10121 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10122 auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
10123 auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
10124 rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant);
10125 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10126 SIBLING_CALL_P (insn) = 1;
10128 insn = get_insns ();
10129 shorten_branches (insn);
10131 assemble_start_function (thunk, fnname);
10132 final_start_function (insn, file, 1);
10133 final (insn, file, 1);
10134 final_end_function ();
10135 assemble_end_function (thunk, fnname);
10137 /* Stop pretending to be a post-reload pass. */
10138 reload_completed = 0;
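/* As an illustration, a thunk with DELTA == 8 and VCALL_OFFSET == 0 is
   emitted as just

     add x0, x0, 8
     b   <function>

   When VCALL_OFFSET is nonzero, the code above additionally loads the
   vtable pointer and the adjustment through the EP0/EP1 temporaries
   before the tail call.  */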
10141 static bool
10142 aarch64_tls_referenced_p (rtx x)
10144 if (!TARGET_HAVE_TLS)
10145 return false;
10146 subrtx_iterator::array_type array;
10147 FOR_EACH_SUBRTX (iter, array, x, ALL)
10149 const_rtx x = *iter;
10150 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10151 return true;
10152 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10153 TLS offsets, not real symbol references. */
10154 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10155 iter.skip_subrtxes ();
10157 return false;
10161 static bool
10162 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10164 if (GET_CODE (x) == HIGH)
10165 return true;
10167 /* There's no way to calculate VL-based values using relocations. */
10168 subrtx_iterator::array_type array;
10169 HOST_WIDE_INT factor;
10170 FOR_EACH_SUBRTX (iter, array, x, ALL)
10171 if (GET_CODE (*iter) == CONST_POLY_INT
10172 || aarch64_sme_vq_unspec_p (x, &factor))
10173 return true;
10175 poly_int64 offset;
10176 rtx base = strip_offset_and_salt (x, &offset);
10177 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10179 /* We checked for POLY_INT_CST offsets above. */
10180 if (aarch64_classify_symbol (base, offset.to_constant ())
10181 != SYMBOL_FORCE_TO_MEM)
10182 return true;
10183 else
10184 /* Avoid generating a 64-bit relocation in ILP32; leave
10185 to aarch64_expand_mov_immediate to handle it properly. */
10186 return mode != ptr_mode;
10189 return aarch64_tls_referenced_p (x);
10192 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10193 The expansion for a table switch is quite expensive due to the number
10194 of instructions, the table lookup and the hard-to-predict indirect jump.
10195 When optimizing for speed with -O3 enabled, use the per-core tuning if
10196 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10197 performance. When optimizing for size, use 8 for smallest codesize. */
10199 static unsigned int
10200 aarch64_case_values_threshold (void)
10202 /* Use the specified limit for the number of cases before using jump
10203 tables at higher optimization levels. */
10204 if (optimize > 2
10205 && aarch64_tune_params.max_case_values != 0)
10206 return aarch64_tune_params.max_case_values;
10207 else
10208 return optimize_size ? 8 : 11;
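/* For example, with the default tuning a dense switch with 11 or more case
   labels is a candidate for a jump table at -O2/-O3, while at -Os the
   threshold drops to 8.  */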
10211 /* Return true if register REGNO is a valid index register.
10212 STRICT_P is true if REG_OK_STRICT is in effect. */
10214 bool
10215 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10217 if (!HARD_REGISTER_NUM_P (regno))
10219 if (!strict_p)
10220 return true;
10222 if (!reg_renumber)
10223 return false;
10225 regno = reg_renumber[regno];
10227 return GP_REGNUM_P (regno);
10230 /* Return true if register REGNO is a valid base register for mode MODE.
10231 STRICT_P is true if REG_OK_STRICT is in effect. */
10233 bool
10234 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10236 if (!HARD_REGISTER_NUM_P (regno))
10238 if (!strict_p)
10239 return true;
10241 if (!reg_renumber)
10242 return false;
10244 regno = reg_renumber[regno];
10247 /* The fake registers will be eliminated to either the stack or
10248 hard frame pointer, both of which are usually valid base registers.
10249 Reload deals with the cases where the eliminated form isn't valid. */
10250 return (GP_REGNUM_P (regno)
10251 || regno == SP_REGNUM
10252 || regno == FRAME_POINTER_REGNUM
10253 || regno == ARG_POINTER_REGNUM);
10256 /* Return true if X is a valid base register for mode MODE.
10257 STRICT_P is true if REG_OK_STRICT is in effect. */
10259 static bool
10260 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10262 if (!strict_p
10263 && SUBREG_P (x)
10264 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10265 x = SUBREG_REG (x);
10267 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10270 /* Return true if address offset is a valid index. If it is, fill in INFO
10271 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10273 static bool
10274 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10275 machine_mode mode, bool strict_p)
10277 enum aarch64_address_type type;
10278 rtx index;
10279 int shift;
10281 /* (reg:P) */
10282 if ((REG_P (x) || SUBREG_P (x))
10283 && GET_MODE (x) == Pmode)
10285 type = ADDRESS_REG_REG;
10286 index = x;
10287 shift = 0;
10289 /* (sign_extend:DI (reg:SI)) */
10290 else if ((GET_CODE (x) == SIGN_EXTEND
10291 || GET_CODE (x) == ZERO_EXTEND)
10292 && GET_MODE (x) == DImode
10293 && GET_MODE (XEXP (x, 0)) == SImode)
10295 type = (GET_CODE (x) == SIGN_EXTEND)
10296 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10297 index = XEXP (x, 0);
10298 shift = 0;
10300 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10301 else if (GET_CODE (x) == MULT
10302 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10303 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10304 && GET_MODE (XEXP (x, 0)) == DImode
10305 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10306 && CONST_INT_P (XEXP (x, 1)))
10308 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10309 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10310 index = XEXP (XEXP (x, 0), 0);
10311 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10313 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10314 else if (GET_CODE (x) == ASHIFT
10315 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10316 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10317 && GET_MODE (XEXP (x, 0)) == DImode
10318 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10319 && CONST_INT_P (XEXP (x, 1)))
10321 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10322 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10323 index = XEXP (XEXP (x, 0), 0);
10324 shift = INTVAL (XEXP (x, 1));
10326 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10327 (const_int 0xffffffff<<shift)) */
10328 else if (GET_CODE (x) == AND
10329 && GET_MODE (x) == DImode
10330 && GET_CODE (XEXP (x, 0)) == MULT
10331 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10332 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10333 && CONST_INT_P (XEXP (x, 1)))
10335 type = ADDRESS_REG_UXTW;
10336 index = XEXP (XEXP (x, 0), 0);
10337 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10338 /* Avoid undefined code dealing with shift being -1. */
10339 if (shift != -1
10340 && INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10341 shift = -1;
10343 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10344 (const_int 0xffffffff<<shift)) */
10345 else if (GET_CODE (x) == AND
10346 && GET_MODE (x) == DImode
10347 && GET_CODE (XEXP (x, 0)) == ASHIFT
10348 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10349 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10350 && CONST_INT_P (XEXP (x, 1)))
10352 type = ADDRESS_REG_UXTW;
10353 index = XEXP (XEXP (x, 0), 0);
10354 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10355 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10356 shift = -1;
10358 /* (mult:P (reg:P) (const_int scale)) */
10359 else if (GET_CODE (x) == MULT
10360 && GET_MODE (x) == Pmode
10361 && GET_MODE (XEXP (x, 0)) == Pmode
10362 && CONST_INT_P (XEXP (x, 1)))
10364 type = ADDRESS_REG_REG;
10365 index = XEXP (x, 0);
10366 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10368 /* (ashift:P (reg:P) (const_int shift)) */
10369 else if (GET_CODE (x) == ASHIFT
10370 && GET_MODE (x) == Pmode
10371 && GET_MODE (XEXP (x, 0)) == Pmode
10372 && CONST_INT_P (XEXP (x, 1)))
10374 type = ADDRESS_REG_REG;
10375 index = XEXP (x, 0);
10376 shift = INTVAL (XEXP (x, 1));
10378 else
10379 return false;
10381 if (!strict_p
10382 && SUBREG_P (index)
10383 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10384 index = SUBREG_REG (index);
10386 if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
10388 if (type != ADDRESS_REG_REG
10389 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10390 return false;
10392 else
10394 if (shift != 0
10395 && !(IN_RANGE (shift, 1, 3)
10396 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10397 return false;
10400 if (REG_P (index)
10401 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10403 info->type = type;
10404 info->offset = index;
10405 info->shift = shift;
10406 return true;
10409 return false;
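/* Some examples of index forms accepted above, shown with the resulting
   addressing mode for a load whose base register is x0:

     (reg:DI x1)                                  [x0, x1]
     (ashift:DI (reg:DI x1) (const_int 3))        [x0, x1, lsl #3]   (8-byte modes)
     (sign_extend:DI (reg:SI w1))                 [x0, w1, sxtw]
     (mult:DI (sign_extend:DI (reg:SI w1))
	      (const_int 4))                      [x0, w1, sxtw #2]  (4-byte modes)
*/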
10412 /* Return true if MODE is one of the modes for which we
10413 support LDP/STP operations. */
10415 static bool
10416 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10418 return mode == SImode || mode == DImode
10419 || mode == SFmode || mode == DFmode
10420 || mode == SDmode || mode == DDmode
10421 || (aarch64_vector_mode_supported_p (mode)
10422 && (known_eq (GET_MODE_SIZE (mode), 8)
10423 || known_eq (GET_MODE_SIZE (mode), 16)));
10426 /* Return true if REGNO is a virtual pointer register, or an eliminable
10427 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10428 include stack_pointer or hard_frame_pointer. */
10429 static bool
10430 virt_or_elim_regno_p (unsigned regno)
10432 return ((regno >= FIRST_VIRTUAL_REGISTER
10433 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10434 || regno == FRAME_POINTER_REGNUM
10435 || regno == ARG_POINTER_REGNUM);
10438 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10439 If it is, fill in INFO appropriately. STRICT_P is true if
10440 REG_OK_STRICT is in effect. */
10442 bool
10443 aarch64_classify_address (struct aarch64_address_info *info,
10444 rtx x, machine_mode mode, bool strict_p,
10445 aarch64_addr_query_type type)
10447 enum rtx_code code = GET_CODE (x);
10448 rtx op0, op1;
10449 poly_int64 offset;
10451 HOST_WIDE_INT const_size;
10453 /* Whether a vector mode is partial doesn't affect address legitimacy.
10454 Partial vectors like VNx8QImode allow the same indexed addressing
10455 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10456 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10457 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10458 vec_flags &= ~VEC_PARTIAL;
10460 /* On BE, we use load/store pair for all large int mode load/stores.
10461 TI/TF/TDmode may also use a load/store pair. */
10462 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10463 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10464 || type == ADDR_QUERY_LDP_STP_N
10465 || mode == TImode
10466 || mode == TFmode
10467 || mode == TDmode
10468 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10469 && advsimd_struct_p));
10470 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10471 corresponds to the actual size of the memory being loaded/stored and the
10472 mode used to check the address is half of that. */
10473 if (type == ADDR_QUERY_LDP_STP_N)
10475 if (known_eq (GET_MODE_SIZE (mode), 32))
10476 mode = V16QImode;
10477 else if (known_eq (GET_MODE_SIZE (mode), 16))
10478 mode = DFmode;
10479 else if (known_eq (GET_MODE_SIZE (mode), 8))
10480 mode = SFmode;
10481 else
10482 return false;
10484 /* This isn't really an Advanced SIMD struct mode, but a mode
10485 used to represent the complete mem in a load/store pair. */
10486 advsimd_struct_p = false;
10489 bool allow_reg_index_p = (!load_store_pair_p
10490 && ((vec_flags == 0
10491 && known_lt (GET_MODE_SIZE (mode), 16))
10492 || vec_flags == VEC_ADVSIMD
10493 || vec_flags & VEC_SVE_DATA
10494 || mode == VNx1TImode));
10496 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10497 The latter is not valid for SVE predicates, and that's rejected through
10498 allow_reg_index_p above. */
10499 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10500 && (code != REG && code != PLUS))
10501 return false;
10503 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10504 REG addressing. */
10505 if (advsimd_struct_p
10506 && TARGET_SIMD
10507 && !BYTES_BIG_ENDIAN
10508 && (code != POST_INC && code != REG))
10509 return false;
10511 gcc_checking_assert (GET_MODE (x) == VOIDmode
10512 || SCALAR_INT_MODE_P (GET_MODE (x)));
10514 switch (code)
10516 case REG:
10517 case SUBREG:
10518 info->type = ADDRESS_REG_IMM;
10519 info->base = x;
10520 info->offset = const0_rtx;
10521 info->const_offset = 0;
10522 return aarch64_base_register_rtx_p (x, strict_p);
10524 case PLUS:
10525 op0 = XEXP (x, 0);
10526 op1 = XEXP (x, 1);
10528 if (! strict_p
10529 && REG_P (op0)
10530 && virt_or_elim_regno_p (REGNO (op0))
10531 && poly_int_rtx_p (op1, &offset))
10533 info->type = ADDRESS_REG_IMM;
10534 info->base = op0;
10535 info->offset = op1;
10536 info->const_offset = offset;
10538 return true;
10541 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10542 && aarch64_base_register_rtx_p (op0, strict_p)
10543 && poly_int_rtx_p (op1, &offset))
10545 info->type = ADDRESS_REG_IMM;
10546 info->base = op0;
10547 info->offset = op1;
10548 info->const_offset = offset;
10550 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10551 registers and individual Q registers. The available
10552 address modes are:
10553 X,X: 7-bit signed scaled offset
10554 Q: 9-bit signed offset
10555 We conservatively require an offset representable in either mode.
10556 When performing the check for pairs of X registers, i.e. LDP/STP,
10557 pass down DImode since that is the natural size of the LDP/STP
10558 instruction memory accesses. */
10559 if (mode == TImode || mode == TFmode || mode == TDmode)
10560 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10561 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10562 || offset_12bit_unsigned_scaled_p (mode, offset)));
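/* A worked example of the conservative check above, assuming the helper
   ranges (scaled 7-bit: multiples of 8 in [-512, 504]; signed 9-bit:
   [-256, 255]; unsigned scaled 12-bit for TImode: multiples of 16 in
   [0, 65520]): a TImode offset of 256 is accepted by both the LDP/STP
   check and the single-Q check, whereas -264 passes only the scaled
   7-bit check and is therefore rejected.  */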
10564 if (mode == V8DImode)
10565 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10566 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10568 /* A 7-bit offset check because OImode will emit an ldp/stp
10569 instruction (only !TARGET_SIMD or big endian will get here).
10570 For ldp/stp instructions, the offset is scaled for the size of a
10571 single element of the pair. */
10572 if (aarch64_advsimd_partial_struct_mode_p (mode)
10573 && known_eq (GET_MODE_SIZE (mode), 16))
10574 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10575 if (aarch64_advsimd_full_struct_mode_p (mode)
10576 && known_eq (GET_MODE_SIZE (mode), 32))
10577 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10579 /* Three 9/12-bit offset checks because CImode will emit three
10580 ldr/str instructions (only !TARGET_SIMD or big endian will
10581 get here). */
10582 if (aarch64_advsimd_partial_struct_mode_p (mode)
10583 && known_eq (GET_MODE_SIZE (mode), 24))
10584 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10585 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10586 offset + 16)
10587 || offset_12bit_unsigned_scaled_p (DImode,
10588 offset + 16)));
10589 if (aarch64_advsimd_full_struct_mode_p (mode)
10590 && known_eq (GET_MODE_SIZE (mode), 48))
10591 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10592 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10593 offset + 32)
10594 || offset_12bit_unsigned_scaled_p (TImode,
10595 offset + 32)));
10597 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10598 instructions (only big endian will get here). */
10599 if (aarch64_advsimd_partial_struct_mode_p (mode)
10600 && known_eq (GET_MODE_SIZE (mode), 32))
10601 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10602 && aarch64_offset_7bit_signed_scaled_p (DImode,
10603 offset + 16));
10604 if (aarch64_advsimd_full_struct_mode_p (mode)
10605 && known_eq (GET_MODE_SIZE (mode), 64))
10606 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10607 && aarch64_offset_7bit_signed_scaled_p (TImode,
10608 offset + 32));
10610 /* Make "m" use the LD1 offset range for SVE data modes, so
10611 that pre-RTL optimizers like ivopts will work to that range
10612 instead of the wider LDR/STR range. */
10613 if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
10614 return (type == ADDR_QUERY_M
10615 ? offset_4bit_signed_scaled_p (mode, offset)
10616 : offset_9bit_signed_scaled_p (mode, offset));
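/* A sketch of the effect above, assuming the usual SVE immediate ranges:
   ADDR_QUERY_M limits "m" constraints to the LD1/ST1 form
   [Xn, #imm, MUL VL] with imm in [-8, 7], while other queries accept the
   LDR/STR (vector) form with imm in [-256, 255]; both immediates count
   multiples of the vector length.  */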
10618 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10620 poly_int64 end_offset = (offset
10621 + GET_MODE_SIZE (mode)
10622 - BYTES_PER_SVE_VECTOR);
10623 return (type == ADDR_QUERY_M
10624 ? offset_4bit_signed_scaled_p (mode, offset)
10625 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10626 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10627 end_offset)));
10630 if (vec_flags == VEC_SVE_PRED)
10631 return offset_9bit_signed_scaled_p (mode, offset);
10633 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
10635 poly_int64 end_offset = (offset
10636 + GET_MODE_SIZE (mode)
10637 - BYTES_PER_SVE_PRED);
10638 return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
10639 && offset_9bit_signed_scaled_p (VNx16BImode, offset));
10642 if (load_store_pair_p)
10643 return ((known_eq (GET_MODE_SIZE (mode), 4)
10644 || known_eq (GET_MODE_SIZE (mode), 8)
10645 || known_eq (GET_MODE_SIZE (mode), 16))
10646 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10647 else
10648 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10649 || offset_12bit_unsigned_scaled_p (mode, offset));
10652 if (allow_reg_index_p)
10654 /* Look for base + (scaled/extended) index register. */
10655 if (aarch64_base_register_rtx_p (op0, strict_p)
10656 && aarch64_classify_index (info, op1, mode, strict_p))
10658 info->base = op0;
10659 return true;
10661 if (aarch64_base_register_rtx_p (op1, strict_p)
10662 && aarch64_classify_index (info, op0, mode, strict_p))
10664 info->base = op1;
10665 return true;
10669 return false;
10671 case POST_INC:
10672 case POST_DEC:
10673 case PRE_INC:
10674 case PRE_DEC:
10675 info->type = ADDRESS_REG_WB;
10676 info->base = XEXP (x, 0);
10677 info->offset = NULL_RTX;
10678 return aarch64_base_register_rtx_p (info->base, strict_p);
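/* For example, (mem:DI (post_inc:DI (reg:DI Xn))) corresponds to the
   post-indexed form "ldr Xt, [Xn], #8", with the writeback amount implied
   by the mode size; POST_MODIFY/PRE_MODIFY below carry an explicit
   offset instead.  */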
10680 case POST_MODIFY:
10681 case PRE_MODIFY:
10682 info->type = ADDRESS_REG_WB;
10683 info->base = XEXP (x, 0);
10684 if (GET_CODE (XEXP (x, 1)) == PLUS
10685 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10686 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10687 && aarch64_base_register_rtx_p (info->base, strict_p))
10689 info->offset = XEXP (XEXP (x, 1), 1);
10690 info->const_offset = offset;
10692 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10693 registers and individual Q registers. The available
10694 address modes are:
10695 X,X: 7-bit signed scaled offset
10696 Q: 9-bit signed offset
10697 We conservatively require an offset representable in either mode.
10699 if (mode == TImode || mode == TFmode || mode == TDmode)
10700 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10701 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10703 if (load_store_pair_p)
10704 return ((known_eq (GET_MODE_SIZE (mode), 4)
10705 || known_eq (GET_MODE_SIZE (mode), 8)
10706 || known_eq (GET_MODE_SIZE (mode), 16))
10707 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10708 else
10709 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10711 return false;
10713 case CONST:
10714 case SYMBOL_REF:
10715 case LABEL_REF:
10716 /* load literal: pc-relative constant pool entry. Only supported
10717 for SI mode or larger. */
10718 info->type = ADDRESS_SYMBOLIC;
10720 if (!load_store_pair_p
10721 && GET_MODE_SIZE (mode).is_constant (&const_size)
10722 && const_size >= 4)
10724 poly_int64 offset;
10725 rtx sym = strip_offset_and_salt (x, &offset);
10726 return ((LABEL_REF_P (sym)
10727 || (SYMBOL_REF_P (sym)
10728 && CONSTANT_POOL_ADDRESS_P (sym)
10729 && aarch64_pcrelative_literal_loads)));
10731 return false;
10733 case LO_SUM:
10734 info->type = ADDRESS_LO_SUM;
10735 info->base = XEXP (x, 0);
10736 info->offset = XEXP (x, 1);
10737 if (allow_reg_index_p
10738 && aarch64_base_register_rtx_p (info->base, strict_p))
10740 poly_int64 offset;
10741 HOST_WIDE_INT const_offset;
10742 rtx sym = strip_offset_and_salt (info->offset, &offset);
10743 if (SYMBOL_REF_P (sym)
10744 && offset.is_constant (&const_offset)
10745 && (aarch64_classify_symbol (sym, const_offset)
10746 == SYMBOL_SMALL_ABSOLUTE))
10748 /* The symbol and offset must be aligned to the access size. */
10749 unsigned int align;
10751 if (CONSTANT_POOL_ADDRESS_P (sym))
10752 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10753 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10755 tree exp = SYMBOL_REF_DECL (sym);
10756 align = TYPE_ALIGN (TREE_TYPE (exp));
10757 align = aarch64_constant_alignment (exp, align);
10759 else if (SYMBOL_REF_DECL (sym))
10760 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10761 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10762 && SYMBOL_REF_BLOCK (sym) != NULL)
10763 align = SYMBOL_REF_BLOCK (sym)->alignment;
10764 else
10765 align = BITS_PER_UNIT;
10767 poly_int64 ref_size = GET_MODE_SIZE (mode);
10768 if (known_eq (ref_size, 0))
10769 ref_size = GET_MODE_SIZE (DImode);
10771 return (multiple_p (const_offset, ref_size)
10772 && multiple_p (align / BITS_PER_UNIT, ref_size));
10775 return false;
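/* The LO_SUM form handled above is the low-part half of an ADRP-based
   access under the small absolute model, e.g.
       adrp x1, sym
       ldr  x0, [x1, #:lo12:sym]
   The alignment check is needed because the :lo12: relocation used by
   LDR/STR is scaled by the access size.  */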
10777 default:
10778 return false;
10782 /* Return true if the address X is valid for a PRFM instruction.
10783 STRICT_P is true if we should do strict checking with
10784 aarch64_classify_address. */
10786 bool
10787 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10789 struct aarch64_address_info addr;
10791 /* PRFM accepts the same addresses as DImode... */
10792 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10793 if (!res)
10794 return false;
10796 /* ... except writeback forms. */
10797 return addr.type != ADDRESS_REG_WB;
10800 bool
10801 aarch64_symbolic_address_p (rtx x)
10803 poly_int64 offset;
10804 x = strip_offset_and_salt (x, &offset);
10805 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10808 /* Classify the base of symbolic expression X. */
10810 enum aarch64_symbol_type
10811 aarch64_classify_symbolic_expression (rtx x)
10813 rtx offset;
10815 split_const (x, &x, &offset);
10816 return aarch64_classify_symbol (x, INTVAL (offset));
10820 /* Return TRUE if X is a legitimate address for accessing memory in
10821 mode MODE. */
10822 static bool
10823 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
10824 code_helper = ERROR_MARK)
10826 struct aarch64_address_info addr;
10828 return aarch64_classify_address (&addr, x, mode, strict_p);
10831 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10832 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10833 bool
10834 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10835 aarch64_addr_query_type type)
10837 struct aarch64_address_info addr;
10839 return aarch64_classify_address (&addr, x, mode, strict_p, type);
10842 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10844 static bool
10845 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10846 poly_int64 orig_offset,
10847 machine_mode mode)
10849 HOST_WIDE_INT size;
10850 if (GET_MODE_SIZE (mode).is_constant (&size))
10852 HOST_WIDE_INT const_offset, second_offset;
10854 /* A general SVE offset is A * VQ + B. Remove the A component from
10855 coefficient 0 in order to get the constant B. */
10856 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10858 /* Split an out-of-range address displacement into a base and
10859 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10860 range otherwise to increase opportunities for sharing the base
10861 address of different sizes. Unaligned accesses use the signed
10862 9-bit range, TImode/TFmode/TDmode use the intersection of signed
10863 scaled 7-bit and signed 9-bit offset. */
10864 if (mode == TImode || mode == TFmode || mode == TDmode)
10865 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10866 else if ((const_offset & (size - 1)) != 0)
10867 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10868 else
10869 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10871 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10872 return false;
10874 /* Split the offset into second_offset and the rest. */
10875 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10876 *offset2 = gen_int_mode (second_offset, Pmode);
10877 return true;
10879 else
10881 /* Get the mode we should use as the basis of the range. For structure
10882 modes this is the mode of one vector. */
10883 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10884 machine_mode step_mode
10885 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10887 /* Get the "mul vl" multiplier we'd like to use. */
10888 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10889 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10890 if (vec_flags & VEC_SVE_DATA)
10891 /* LDR supports a 9-bit range, but the move patterns for
10892 structure modes require all vectors to be in range of the
10893 same base. The simplest way of accommodating that while still
10894 promoting reuse of anchor points between different modes is
10895 to use an 8-bit range unconditionally. */
10896 vnum = ((vnum + 128) & 255) - 128;
10897 else
10898 /* Predicates are only handled singly, so we might as well use
10899 the full range. */
10900 vnum = ((vnum + 256) & 511) - 256;
10901 if (vnum == 0)
10902 return false;
10904 /* Convert the "mul vl" multiplier into a byte offset. */
10905 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10906 if (known_eq (second_offset, orig_offset))
10907 return false;
10909 /* Split the offset into second_offset and the rest. */
10910 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10911 *offset2 = gen_int_mode (second_offset, Pmode);
10912 return true;
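/* Worked example for the constant-size path above: a DImode reference at
   offset 0x12340 is aligned, so second_offset = 0x12340 & 0x3ffc = 0x2340,
   giving *offset1 = 0x10000 and *offset2 = 0x2340; other nearby accesses
   can then share the base + 0x10000 anchor while keeping their own
   in-range scaled offsets.  */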
10916 /* Return the binary representation of floating point constant VALUE in INTVAL.
10917 If the value cannot be converted, return false without setting INTVAL.
10918 The conversion is done in the mode of VALUE. */
10919 bool
10920 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10923 /* We make a general exception for 0. */
10924 if (aarch64_float_const_zero_rtx_p (value))
10926 *intval = 0;
10927 return true;
10930 scalar_float_mode mode;
10931 if (!CONST_DOUBLE_P (value)
10932 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
10933 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10934 /* Only support up to DF mode. */
10935 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10936 return false;
10938 unsigned HOST_WIDE_INT ival = 0;
10940 long res[2];
10941 real_to_target (res,
10942 CONST_DOUBLE_REAL_VALUE (value),
10943 REAL_MODE_FORMAT (mode));
10945 if (mode == DFmode || mode == DDmode)
10947 int order = BYTES_BIG_ENDIAN ? 1 : 0;
10948 ival = zext_hwi (res[order], 32);
10949 ival |= (zext_hwi (res[1 - order], 32) << 32);
10951 else
10952 ival = zext_hwi (res[0], 32);
10954 *intval = ival;
10955 return true;
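/* For example, the DFmode constant 1.0 has the IEEE double encoding
   0x3ff0000000000000, so *intval is set to that value; for SFmode 1.0f
   the result is 0x3f800000.  */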
10958 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10959 single MOV(+MOVK) followed by an FMOV. */
10960 bool
10961 aarch64_float_const_rtx_p (rtx x)
10963 machine_mode mode = GET_MODE (x);
10964 if (mode == VOIDmode)
10965 return false;
10967 /* Determine whether it's cheaper to write float constants as
10968 mov/movk pairs over ldr/adrp pairs. */
10969 unsigned HOST_WIDE_INT ival;
10971 if (CONST_DOUBLE_P (x)
10972 && SCALAR_FLOAT_MODE_P (mode)
10973 && aarch64_reinterpret_float_as_int (x, &ival))
10975 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
10976 int num_instr = aarch64_internal_mov_immediate
10977 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10978 return num_instr < 3;
10981 return false;
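/* Continuing the example above, 1.0 reinterpreted as 0x3ff0000000000000
   is 0x3ff0 << 48 and therefore needs only a single MOV, so MOV+FMOV
   (num_instr == 1) is preferred over an ADRP/LDR literal load.  */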
10984 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
10985 Floating Point). */
10986 bool
10987 aarch64_float_const_zero_rtx_p (rtx x)
10989 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
10990 zr as our callers expect, so no need to check the actual
10991 value if X is of Decimal Floating Point type. */
10992 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
10993 return false;
10995 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
10996 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
10997 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11000 /* Return true if X is any kind of constant zero rtx. */
11002 bool
11003 aarch64_const_zero_rtx_p (rtx x)
11005 return (x == CONST0_RTX (GET_MODE (x))
11006 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
11009 /* Return TRUE if rtx X is an immediate constant that fits in a single
11010 MOVI immediate operation. */
11011 bool
11012 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11014 if (!TARGET_SIMD)
11015 return false;
11017 machine_mode vmode;
11018 scalar_int_mode imode;
11019 unsigned HOST_WIDE_INT ival;
11021 if (CONST_DOUBLE_P (x)
11022 && SCALAR_FLOAT_MODE_P (mode))
11024 if (!aarch64_reinterpret_float_as_int (x, &ival))
11025 return false;
11027 /* We make a general exception for 0. */
11028 if (aarch64_float_const_zero_rtx_p (x))
11029 return true;
11031 imode = int_mode_for_mode (mode).require ();
11033 else if (CONST_INT_P (x)
11034 && is_a <scalar_int_mode> (mode, &imode))
11035 ival = INTVAL (x);
11036 else
11037 return false;
11039 /* Use a 64-bit mode for everything except DI/DF/DD mode, where we use
11040 a 128-bit vector mode. */
11041 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11043 vmode = aarch64_simd_container_mode (imode, width);
11044 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11046 return aarch64_simd_valid_immediate (v_op, NULL);
11050 /* Return the fixed registers used for condition codes. */
11052 static bool
11053 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11055 *p1 = CC_REGNUM;
11056 *p2 = INVALID_REGNUM;
11057 return true;
11060 /* Return a fresh memory reference to the current function's TPIDR2 block,
11061 creating a block if necessary. */
11063 static rtx
11064 aarch64_get_tpidr2_block ()
11066 if (!cfun->machine->tpidr2_block)
11067 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
11068 boundary. */
11069 cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
11070 return copy_rtx (cfun->machine->tpidr2_block);
11073 /* Return a fresh register that points to the current function's
11074 TPIDR2 block, creating a block if necessary. */
11076 static rtx
11077 aarch64_get_tpidr2_ptr ()
11079 rtx block = aarch64_get_tpidr2_block ();
11080 return force_reg (Pmode, XEXP (block, 0));
11083 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
11084 current function's TPIDR2 block. */
11086 static void
11087 aarch64_init_tpidr2_block ()
11089 rtx block = aarch64_get_tpidr2_block ();
11091 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
11092 rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
11093 rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
11094 rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
11095 svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
11096 rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
11097 BITS_PER_UNIT, -1, true);
11098 za_save_buffer = force_reg (Pmode, za_save_buffer);
11099 cfun->machine->za_save_buffer = za_save_buffer;
11101 /* The first word of the block points to the save buffer and the second
11102 word is the number of ZA slices to save. */
11103 rtx block_0 = adjust_address (block, DImode, 0);
11104 emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
11106 if (!memory_operand (block, V16QImode))
11107 block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
11108 emit_insn (gen_aarch64_setup_local_tpidr2 (block));
11111 /* Restore the contents of ZA from the lazy save buffer, given that
11112 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
11113 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11115 void
11116 aarch64_restore_za (rtx tpidr2_block)
11118 emit_insn (gen_aarch64_smstart_za ());
11119 if (REGNO (tpidr2_block) != R0_REGNUM)
11120 emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
11121 emit_insn (gen_aarch64_tpidr2_restore ());
11124 /* Return the ZT0 save buffer, creating one if necessary. */
11126 static rtx
11127 aarch64_get_zt0_save_buffer ()
11129 if (!cfun->machine->zt0_save_buffer)
11130 cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
11131 return cfun->machine->zt0_save_buffer;
11134 /* Save ZT0 to the current function's save buffer. */
11136 static void
11137 aarch64_save_zt0 ()
11139 rtx mem = aarch64_get_zt0_save_buffer ();
11140 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11141 emit_insn (gen_aarch64_sme_str_zt0 (mem));
11144 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11145 is true if the load is happening after a call to a private-ZA function,
11146 false if it can be treated as a normal load. */
11148 static void
11149 aarch64_restore_zt0 (bool from_lazy_save_p)
11151 rtx mem = aarch64_get_zt0_save_buffer ();
11152 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11153 emit_insn (from_lazy_save_p
11154 ? gen_aarch64_restore_zt0 (mem)
11155 : gen_aarch64_sme_ldr_zt0 (mem));
11158 /* Implement TARGET_START_CALL_ARGS. */
11160 static void
11161 aarch64_start_call_args (cumulative_args_t ca_v)
11163 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11165 if (!TARGET_SME && (ca->isa_mode & AARCH64_FL_SM_ON))
11167 error ("calling a streaming function requires the ISA extension %qs",
11168 "sme");
11169 inform (input_location, "you can enable %qs using the command-line"
11170 " option %<-march%>, or by using the %<target%>"
11171 " attribute or pragma", "sme");
11174 if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11175 && !aarch64_cfun_has_state ("za"))
11176 error ("call to a function that shares %qs state from a function"
11177 " that has no %qs state", "za", "za");
11178 else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11179 && !aarch64_cfun_has_state ("zt0"))
11180 error ("call to a function that shares %qs state from a function"
11181 " that has no %qs state", "zt0", "zt0");
11182 else if (!TARGET_ZA && (ca->isa_mode & AARCH64_FL_ZA_ON))
11183 error ("call to a function that shares SME state from a function"
11184 " that has no SME state");
11186 /* If this is a call to a private ZA function, emit a marker to
11187 indicate where any necessary set-up code could be inserted.
11188 The code itself is inserted by the mode-switching pass. */
11189 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11190 emit_insn (gen_aarch64_start_private_za_call ());
11192 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11193 save and restore ZT0 around the call. */
11194 if (aarch64_cfun_has_state ("zt0")
11195 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11196 && ca->shared_zt0_flags == 0)
11197 aarch64_save_zt0 ();
11200 /* This function is used by the call expanders of the machine description.
11201 RESULT is the register in which the result is returned. It's NULL for
11202 "call" and "sibcall".
11203 MEM is the location of the function call.
11204 COOKIE is either:
11205 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11206 - a PARALLEL that contains such a const_int as its first element.
11207 The second element is a PARALLEL that lists all the argument
11208 registers that need to be saved and restored around a change
11209 in PSTATE.SM, or const0_rtx if no such switch is needed.
11210 The third and fourth elements are const_ints that contain the
11211 sharing flags for ZA and ZT0 respectively.
11212 SIBCALL indicates whether this function call is a normal call or a
11213 sibling call; a different pattern is generated accordingly. */
11215 void
11216 aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
11218 rtx call, callee, tmp;
11219 rtvec vec;
11220 machine_mode mode;
11222 rtx callee_abi = cookie;
11223 rtx sme_mode_switch_args = const0_rtx;
11224 unsigned int shared_za_flags = 0;
11225 unsigned int shared_zt0_flags = 0;
11226 if (GET_CODE (cookie) == PARALLEL)
11228 callee_abi = XVECEXP (cookie, 0, 0);
11229 sme_mode_switch_args = XVECEXP (cookie, 0, 1);
11230 shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
11231 shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
11234 gcc_assert (CONST_INT_P (callee_abi));
11235 auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
11237 if (aarch64_cfun_has_state ("za")
11238 && (callee_isa_mode & AARCH64_FL_ZA_ON)
11239 && !shared_za_flags)
11241 sorry ("call to a function that shares state other than %qs"
11242 " from a function that has %qs state", "za", "za");
11243 inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
11244 " callee preserves ZA");
11247 gcc_assert (MEM_P (mem));
11248 callee = XEXP (mem, 0);
11250 #if TARGET_PECOFF
11251 tmp = legitimize_pe_coff_symbol (callee, false);
11252 if (tmp)
11253 callee = tmp;
11254 #endif
11256 mode = GET_MODE (callee);
11257 gcc_assert (mode == Pmode);
11259 /* Decide if we should generate indirect calls by loading the
11260 address of the callee into a register before performing
11261 the branch-and-link. */
11262 if (SYMBOL_REF_P (callee)
11263 ? (aarch64_is_long_call_p (callee)
11264 || aarch64_is_noplt_call_p (callee))
11265 : !REG_P (callee))
11266 XEXP (mem, 0) = force_reg (mode, callee);
11268 /* Accumulate the return values, including state that is shared via
11269 attributes. */
11270 auto_vec<rtx, 8> return_values;
11271 if (result)
11273 if (GET_CODE (result) == PARALLEL)
11274 for (int i = 0; i < XVECLEN (result, 0); ++i)
11275 return_values.safe_push (XVECEXP (result, 0, i));
11276 else
11277 return_values.safe_push (result);
11279 unsigned int orig_num_return_values = return_values.length ();
11280 if (shared_za_flags & AARCH64_STATE_OUT)
11281 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11282 /* When calling private-ZA functions from functions with ZA state,
11283 we want to know whether the call committed a lazy save. */
11284 if (TARGET_ZA && !shared_za_flags)
11285 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11286 if (shared_zt0_flags & AARCH64_STATE_OUT)
11287 return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
11289 /* Create the new return value, if necessary. */
11290 if (orig_num_return_values != return_values.length ())
11292 if (return_values.length () == 1)
11293 result = return_values[0];
11294 else
11296 for (rtx &x : return_values)
11297 if (GET_CODE (x) != EXPR_LIST)
11298 x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
11299 rtvec v = gen_rtvec_v (return_values.length (),
11300 return_values.address ());
11301 result = gen_rtx_PARALLEL (VOIDmode, v);
11305 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11307 if (result != NULL_RTX)
11308 call = gen_rtx_SET (result, call);
11310 if (sibcall)
11311 tmp = ret_rtx;
11312 else
11313 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11315 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11316 UNSPEC_CALLEE_ABI);
11318 vec = gen_rtvec (3, call, callee_abi, tmp);
11319 call = gen_rtx_PARALLEL (VOIDmode, vec);
11321 auto call_insn = aarch64_emit_call_insn (call);
11323 /* Check whether the call requires a change to PSTATE.SM. We can't
11324 emit the instructions to change PSTATE.SM yet, since they involve
11325 a change in vector length and a change in instruction set, which
11326 cannot be represented in RTL.
11328 For now, just record which registers will be clobbered and used
11329 by the changes to PSTATE.SM. */
11330 if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
11332 aarch64_sme_mode_switch_regs args_switch;
11333 if (sme_mode_switch_args != const0_rtx)
11335 unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
11336 for (unsigned int i = 0; i < num_args; ++i)
11338 rtx x = XVECEXP (sme_mode_switch_args, 0, i);
11339 args_switch.add_reg (GET_MODE (x), REGNO (x));
11343 aarch64_sme_mode_switch_regs result_switch;
11344 if (result)
11345 result_switch.add_call_result (call_insn);
11347 unsigned int num_gprs = MAX (args_switch.num_gprs (),
11348 result_switch.num_gprs ());
11349 for (unsigned int i = 0; i < num_gprs; ++i)
11350 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11351 gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
11353 for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
11354 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11355 gen_rtx_REG (V4x16QImode, regno));
11357 for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
11358 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11359 gen_rtx_REG (VNx16BImode, regno));
11361 /* Ensure that the VG save slot has been initialized. Also emit
11362 an instruction to model the effect of the temporary clobber
11363 of VG, so that the prologue/epilogue pass sees the need to
11364 save the old value. */
11365 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11366 gen_rtx_REG (DImode, VG_REGNUM));
11367 emit_insn_before (gen_aarch64_update_vg (), call_insn);
11369 cfun->machine->call_switches_pstate_sm = true;
11372 /* Add any ZA-related information.
11374 ZA_REGNUM represents the current function's ZA state, rather than
11375 the contents of the ZA register itself. We ensure that the function's
11376 ZA state is preserved by private-ZA call sequences, so the call itself
11377 does not use or clobber ZA_REGNUM. The same thing applies to
11378 ZT0_REGNUM. */
11379 if (TARGET_ZA)
11381 /* The callee requires ZA to be active if the callee is shared-ZA,
11382 otherwise it requires ZA to be dormant or off. The state of ZA is
11383 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11384 and ZA_SAVED_REGNUM. */
11385 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11386 gen_rtx_REG (DImode, SME_STATE_REGNUM));
11387 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11388 gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
11389 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11390 gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11392 /* Keep the aarch64_start/end_private_za_call markers live. */
11393 if (!(callee_isa_mode & AARCH64_FL_ZA_ON))
11394 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11395 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
11397 /* If the callee is a shared-ZA function, record whether it uses the
11398 current value of ZA and ZT0. */
11399 if (shared_za_flags & AARCH64_STATE_IN)
11400 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11401 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11403 if (shared_zt0_flags & AARCH64_STATE_IN)
11404 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11405 gen_rtx_REG (V8DImode, ZT0_REGNUM));
11409 /* Implement TARGET_END_CALL_ARGS. */
11411 static void
11412 aarch64_end_call_args (cumulative_args_t ca_v)
11414 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11416 /* If this is a call to a private ZA function, emit a marker to
11417 indicate where any necessary restoration code could be inserted.
11418 The code itself is inserted by the mode-switching pass. */
11419 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11420 emit_insn (gen_aarch64_end_private_za_call ());
11422 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11423 save and restore ZT0 around the call. */
11424 if (aarch64_cfun_has_state ("zt0")
11425 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11426 && ca->shared_zt0_flags == 0)
11427 aarch64_restore_zt0 (false);
11430 /* Emit call insn with PAT and do aarch64-specific handling. */
11432 rtx_call_insn *
11433 aarch64_emit_call_insn (rtx pat)
11435 auto insn = emit_call_insn (pat);
11437 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11438 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11439 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11440 return as_a<rtx_call_insn *> (insn);
11443 machine_mode
11444 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11446 machine_mode mode_x = GET_MODE (x);
11447 rtx_code code_x = GET_CODE (x);
11449 /* All floating point compares return CCFP if it is an equality
11450 comparison, and CCFPE otherwise. */
11451 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11453 switch (code)
11455 case EQ:
11456 case NE:
11457 case UNORDERED:
11458 case ORDERED:
11459 case UNLT:
11460 case UNLE:
11461 case UNGT:
11462 case UNGE:
11463 case UNEQ:
11464 return CCFPmode;
11466 case LT:
11467 case LE:
11468 case GT:
11469 case GE:
11470 case LTGT:
11471 return CCFPEmode;
11473 default:
11474 gcc_unreachable ();
11478 /* Equality comparisons of short modes against zero can be performed
11479 using the TST instruction with the appropriate bitmask. */
11480 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11481 && (code == EQ || code == NE)
11482 && (mode_x == HImode || mode_x == QImode))
11483 return CC_Zmode;
11485 /* Similarly, comparisons of zero_extends from shorter modes can
11486 be performed using an ANDS with an immediate mask. */
11487 if (y == const0_rtx && code_x == ZERO_EXTEND
11488 && (mode_x == SImode || mode_x == DImode)
11489 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11490 && (code == EQ || code == NE))
11491 return CC_Zmode;
11493 /* Zero extracts support equality comparisons. */
11494 if ((mode_x == SImode || mode_x == DImode)
11495 && y == const0_rtx
11496 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11497 && CONST_INT_P (XEXP (x, 2)))
11498 && (code == EQ || code == NE))
11499 return CC_Zmode;
11501 /* ANDS/BICS/TST support equality and all signed comparisons. */
11502 if ((mode_x == SImode || mode_x == DImode)
11503 && y == const0_rtx
11504 && (code_x == AND)
11505 && (code == EQ || code == NE || code == LT || code == GE
11506 || code == GT || code == LE))
11507 return CC_NZVmode;
11509 /* ADDS/SUBS correctly set N and Z flags. */
11510 if ((mode_x == SImode || mode_x == DImode)
11511 && y == const0_rtx
11512 && (code == EQ || code == NE || code == LT || code == GE)
11513 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11514 return CC_NZmode;
11516 /* A compare with a shifted operand. Because of canonicalization,
11517 the comparison will have to be swapped when we emit the assembly
11518 code. */
11519 if ((mode_x == SImode || mode_x == DImode)
11520 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11521 && (code_x == ASHIFT || code_x == ASHIFTRT
11522 || code_x == LSHIFTRT
11523 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11524 return CC_SWPmode;
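/* For example, (compare (ashift:DI x 3) y) is output as the swapped
   compare "cmp y, x, lsl #3", which computes y - (x << 3), so
   aarch64_get_condition_code_1 maps GT to LT, GE to LE, and so on for
   CC_SWPmode.  */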
11526 /* Similarly for a negated operand, but we can only do this for
11527 equalities. */
11528 if ((mode_x == SImode || mode_x == DImode)
11529 && (REG_P (y) || SUBREG_P (y))
11530 && (code == EQ || code == NE)
11531 && code_x == NEG)
11532 return CC_Zmode;
11534 /* A test for unsigned overflow from an addition. */
11535 if ((mode_x == DImode || mode_x == TImode)
11536 && (code == LTU || code == GEU)
11537 && code_x == PLUS
11538 && rtx_equal_p (XEXP (x, 0), y))
11539 return CC_Cmode;
11541 /* A test for unsigned overflow from an add with carry. */
11542 if ((mode_x == DImode || mode_x == TImode)
11543 && (code == LTU || code == GEU)
11544 && code_x == PLUS
11545 && CONST_SCALAR_INT_P (y)
11546 && (rtx_mode_t (y, mode_x)
11547 == (wi::shwi (1, mode_x)
11548 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11549 return CC_ADCmode;
11551 /* A test for signed overflow. */
11552 if ((mode_x == DImode || mode_x == TImode)
11553 && code == NE
11554 && code_x == PLUS
11555 && GET_CODE (y) == SIGN_EXTEND)
11556 return CC_Vmode;
11558 /* For everything else, return CCmode. */
11559 return CCmode;
11562 static int
11563 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11565 int
11566 aarch64_get_condition_code (rtx x)
11568 machine_mode mode = GET_MODE (XEXP (x, 0));
11569 enum rtx_code comp_code = GET_CODE (x);
11571 if (GET_MODE_CLASS (mode) != MODE_CC)
11572 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11573 return aarch64_get_condition_code_1 (mode, comp_code);
11576 static int
11577 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11579 switch (mode)
11581 case E_CCFPmode:
11582 case E_CCFPEmode:
11583 switch (comp_code)
11585 case GE: return AARCH64_GE;
11586 case GT: return AARCH64_GT;
11587 case LE: return AARCH64_LS;
11588 case LT: return AARCH64_MI;
11589 case NE: return AARCH64_NE;
11590 case EQ: return AARCH64_EQ;
11591 case ORDERED: return AARCH64_VC;
11592 case UNORDERED: return AARCH64_VS;
11593 case UNLT: return AARCH64_LT;
11594 case UNLE: return AARCH64_LE;
11595 case UNGT: return AARCH64_HI;
11596 case UNGE: return AARCH64_PL;
11597 default: return -1;
11599 break;
11601 case E_CCmode:
11602 switch (comp_code)
11604 case NE: return AARCH64_NE;
11605 case EQ: return AARCH64_EQ;
11606 case GE: return AARCH64_GE;
11607 case GT: return AARCH64_GT;
11608 case LE: return AARCH64_LE;
11609 case LT: return AARCH64_LT;
11610 case GEU: return AARCH64_CS;
11611 case GTU: return AARCH64_HI;
11612 case LEU: return AARCH64_LS;
11613 case LTU: return AARCH64_CC;
11614 default: return -1;
11616 break;
11618 case E_CC_SWPmode:
11619 switch (comp_code)
11621 case NE: return AARCH64_NE;
11622 case EQ: return AARCH64_EQ;
11623 case GE: return AARCH64_LE;
11624 case GT: return AARCH64_LT;
11625 case LE: return AARCH64_GE;
11626 case LT: return AARCH64_GT;
11627 case GEU: return AARCH64_LS;
11628 case GTU: return AARCH64_CC;
11629 case LEU: return AARCH64_CS;
11630 case LTU: return AARCH64_HI;
11631 default: return -1;
11633 break;
11635 case E_CC_NZCmode:
11636 switch (comp_code)
11638 case NE: return AARCH64_NE; /* = any */
11639 case EQ: return AARCH64_EQ; /* = none */
11640 case GE: return AARCH64_PL; /* = nfrst */
11641 case LT: return AARCH64_MI; /* = first */
11642 case GEU: return AARCH64_CS; /* = nlast */
11643 case GTU: return AARCH64_HI; /* = pmore */
11644 case LEU: return AARCH64_LS; /* = plast */
11645 case LTU: return AARCH64_CC; /* = last */
11646 default: return -1;
11648 break;
11650 case E_CC_NZVmode:
11651 switch (comp_code)
11653 case NE: return AARCH64_NE;
11654 case EQ: return AARCH64_EQ;
11655 case GE: return AARCH64_PL;
11656 case LT: return AARCH64_MI;
11657 case GT: return AARCH64_GT;
11658 case LE: return AARCH64_LE;
11659 default: return -1;
11661 break;
11663 case E_CC_NZmode:
11664 switch (comp_code)
11666 case NE: return AARCH64_NE;
11667 case EQ: return AARCH64_EQ;
11668 case GE: return AARCH64_PL;
11669 case LT: return AARCH64_MI;
11670 default: return -1;
11672 break;
11674 case E_CC_Zmode:
11675 switch (comp_code)
11677 case NE: return AARCH64_NE;
11678 case EQ: return AARCH64_EQ;
11679 default: return -1;
11681 break;
11683 case E_CC_Cmode:
11684 switch (comp_code)
11686 case LTU: return AARCH64_CS;
11687 case GEU: return AARCH64_CC;
11688 default: return -1;
11690 break;
11692 case E_CC_ADCmode:
11693 switch (comp_code)
11695 case GEU: return AARCH64_CS;
11696 case LTU: return AARCH64_CC;
11697 default: return -1;
11699 break;
11701 case E_CC_Vmode:
11702 switch (comp_code)
11704 case NE: return AARCH64_VS;
11705 case EQ: return AARCH64_VC;
11706 default: return -1;
11708 break;
11710 default:
11711 return -1;
11714 return -1;
11717 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
11718 duplicate of such constants. If so, store in RET_WI the wide_int
11719 representation of the constant paired with the inner mode of the vector mode
11720 or MODE for scalar X constants. If MODE is not provided then TImode is
11721 used. */
11723 static bool
11724 aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
11725 scalar_mode mode = TImode)
11727 rtx elt = unwrap_const_vec_duplicate (x);
11728 if (!CONST_SCALAR_INT_P (elt))
11729 return false;
11730 scalar_mode smode
11731 = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
11732 *ret_wi = rtx_mode_t (elt, smode);
11733 return true;
11736 /* Return true if X is a scalar or a constant vector of integer
11737 immediates that represent the rounding constant used in the fixed-point
11738 arithmetic instructions.
11739 The accepted form of the constant is (1 << (C - 1)) where C is in the range
11740 [1, MODE_WIDTH/2]. */
11742 bool
11743 aarch64_rnd_imm_p (rtx x)
11745 wide_int rnd_cst;
11746 if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
11747 return false;
11748 int log2 = wi::exact_log2 (rnd_cst);
11749 if (log2 < 0)
11750 return false;
11751 return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
11754 /* Return true if RND is a constant vector of integer rounding constants
11755 corresponding to a constant vector of shifts, SHIFT.
11756 The relationship should be RND == (1 << (SHIFT - 1)). */
11758 bool
11759 aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
11761 wide_int rnd_cst, shft_cst;
11762 if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
11763 || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
11764 return false;
11766 return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
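/* For example, a rounding shift right by SHIFT == 8 pairs with
   RND == 1 << 7 == 0x80: adding half of the weight of the discarded bits
   before shifting implements round-to-nearest.  */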
11769 bool
11770 aarch64_const_vec_all_same_in_range_p (rtx x,
11771 HOST_WIDE_INT minval,
11772 HOST_WIDE_INT maxval)
11774 rtx elt;
11775 return (const_vec_duplicate_p (x, &elt)
11776 && CONST_INT_P (elt)
11777 && IN_RANGE (INTVAL (elt), minval, maxval));
11780 /* Some constants can't be made using normal mov instructions in Advanced SIMD
11781 but we can still create them in various ways. If the constant in VAL can be
11782 created using alternate methods, return true and, if TARGET is not NULL,
11783 additionally emit a sequence that materializes the constant into TARGET.
11784 Otherwise return false. */
11786 bool
11787 aarch64_maybe_generate_simd_constant (rtx target, rtx val, machine_mode mode)
11789 wide_int wval;
11790 auto smode = GET_MODE_INNER (mode);
11791 if (!aarch64_extract_vec_duplicate_wide_int (val, &wval, smode))
11792 return false;
11794 /* For Advanced SIMD we can create an integer with only the top bit set
11795 using fneg (0.0f). */
11796 if (TARGET_SIMD
11797 && !TARGET_SVE
11798 && smode == DImode
11799 && wi::only_sign_bit_p (wval))
11801 if (!target)
11802 return true;
11804 /* Use the same base type as aarch64_gen_shareable_zero. */
11805 rtx zero = CONST0_RTX (V4SImode);
11806 emit_move_insn (lowpart_subreg (V4SImode, target, mode), zero);
11807 rtx neg = lowpart_subreg (V2DFmode, target, mode);
11808 emit_insn (gen_negv2df2 (neg, copy_rtx (neg)));
11809 return true;
11812 return false;
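/* For example, the DImode value 0x8000000000000000 (sign bit only) can be
   built without a literal load as
       movi v0.4s, #0
       fneg v0.2d, v0.2d
   since FNEG of +0.0 merely sets the sign bit of each 64-bit lane; that is
   the sequence emitted above when TARGET is, say, V0.  */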
11815 /* Check if the value in VAL with mode MODE can be created using special
11816 instruction sequences. */
11818 bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
11820 return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
11823 bool
11824 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11826 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11829 /* Return true if VEC is a constant in which every element is in the range
11830 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11832 static bool
11833 aarch64_const_vec_all_in_range_p (rtx vec,
11834 HOST_WIDE_INT minval,
11835 HOST_WIDE_INT maxval)
11837 if (!CONST_VECTOR_P (vec)
11838 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11839 return false;
11841 int nunits;
11842 if (!CONST_VECTOR_STEPPED_P (vec))
11843 nunits = const_vector_encoded_nelts (vec);
11844 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11845 return false;
11847 for (int i = 0; i < nunits; i++)
11849 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11850 if (!CONST_INT_P (vec_elem)
11851 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11852 return false;
11854 return true;
11857 /* N Z C V. */
11858 #define AARCH64_CC_V 1
11859 #define AARCH64_CC_C (1 << 1)
11860 #define AARCH64_CC_Z (1 << 2)
11861 #define AARCH64_CC_N (1 << 3)
11863 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11864 static const int aarch64_nzcv_codes[] =
11866 0, /* EQ, Z == 1. */
11867 AARCH64_CC_Z, /* NE, Z == 0. */
11868 0, /* CS, C == 1. */
11869 AARCH64_CC_C, /* CC, C == 0. */
11870 0, /* MI, N == 1. */
11871 AARCH64_CC_N, /* PL, N == 0. */
11872 0, /* VS, V == 1. */
11873 AARCH64_CC_V, /* VC, V == 0. */
11874 0, /* HI, C == 1 && Z == 0. */
11875 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11876 AARCH64_CC_V, /* GE, N == V. */
11877 0, /* LT, N != V. */
11878 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11879 0, /* LE, !(Z == 0 && N == V). */
11880 0, /* AL, Any. */
11881 0 /* NV, Any. */
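/* Note that each entry above is a flag state under which the indexed
   condition is false: setting Z falsifies NE, setting C (with Z clear)
   falsifies LS, N == V == 0 falsifies LT, and so on.  The value is printed
   by the 'k' operand code as the #nzcv immediate of a CCMP, which supplies
   the flags when the conditional compare's condition does not hold.  */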
11884 /* Print floating-point vector immediate operand X to F, negating it
11885 first if NEGATE is true. Return true on success, false if it isn't
11886 a constant we can handle. */
11888 static bool
11889 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11891 rtx elt;
11893 if (!const_vec_duplicate_p (x, &elt))
11894 return false;
11896 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11897 if (negate)
11898 r = real_value_negate (&r);
11900 /* Handle the SVE single-bit immediates specially, since they have a
11901 fixed form in the assembly syntax. */
11902 if (real_equal (&r, &dconst0))
11903 asm_fprintf (f, "0.0");
11904 else if (real_equal (&r, &dconst2))
11905 asm_fprintf (f, "2.0");
11906 else if (real_equal (&r, &dconst1))
11907 asm_fprintf (f, "1.0");
11908 else if (real_equal (&r, &dconsthalf))
11909 asm_fprintf (f, "0.5");
11910 else
11912 const int buf_size = 20;
11913 char float_buf[buf_size] = {'\0'};
11914 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11915 1, GET_MODE (elt));
11916 asm_fprintf (f, "%s", float_buf);
11919 return true;
11922 /* Return the equivalent letter for size. */
11923 static char
11924 sizetochar (int size)
11926 switch (size)
11928 case 64: return 'd';
11929 case 32: return 's';
11930 case 16: return 'h';
11931 case 8 : return 'b';
11932 default: gcc_unreachable ();
11936 /* Print operand X to file F in a target specific manner according to CODE.
11937 The acceptable formatting commands given by CODE are:
11938 'c': An integer or symbol address without a preceding #
11939 sign.
11940 'C': Take the duplicated element in a vector constant
11941 and print it in hex.
11942 'D': Take the duplicated element in a vector constant
11943 and print it as an unsigned integer, in decimal.
11944 'e': Print the sign/zero-extend size as a character 8->b,
11945 16->h, 32->w. Can also be used for masks:
11946 0xff->b, 0xffff->h, 0xffffffff->w.
11947 'I': If the operand is a duplicated vector constant,
11948 replace it with the duplicated scalar. If the
11949 operand is then a floating-point constant, replace
11950 it with the integer bit representation. Print the
11951 transformed constant as a signed decimal number.
11952 'p': Prints N such that 2^N == X (X must be power of 2 and
11953 const int).
11954 'P': Print the number of non-zero bits in X (a const_int).
11955 'H': Print the higher numbered register of a pair (TImode)
11956 of regs.
11957 'm': Print a condition (eq, ne, etc).
11958 'M': Same as 'm', but invert condition.
11959 'N': Take the duplicated element in a vector constant
11960 and print the negative of it in decimal.
11961 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11962 'Z': Same for SVE registers. ('z' was already taken.)
11963 Note that it is not necessary to use %Z for operands
11964 that have SVE modes. The convention is to use %Z
11965 only for non-SVE (or potentially non-SVE) modes.
11966 'S/T/U/V': Print a FP/SIMD register name for a register list.
11967 The register printed is the FP/SIMD register name
11968 of X + 0/1/2/3 for S/T/U/V.
11969 'R': Print a scalar Integer/FP/SIMD register name + 1.
11970 'X': Print bottom 16 bits of integer constant in hex.
11971 'w/x': Print a general register name or the zero register
11972 (32-bit or 64-bit).
11973 '0': Print a normal operand, if it's a general register,
11974 then we assume DImode.
11975 'k': Print NZCV for conditional compare instructions.
11976 'K': Print a predicate register as pn<N> rather than p<N>
11977 'A': Output address constant representing the first
11978 argument of X, specifying a relocation offset
11979 if appropriate.
11980 'L': Output constant address specified by X
11981 with a relocation offset if appropriate.
11982 'G': Prints address of X, specifying a PC relative
11983 relocation mode if appropriate.
11984 'y': Output address of LDP or STP - this is used for
11985 some LDP/STPs which don't use a PARALLEL in their
11986 pattern (so the mode needs to be adjusted).
11987 'z': Output address of a typical LDP or STP. */
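/* As a hypothetical illustration of some of the codes above, with operand 0
   being (reg:DI x0) and operand 1 being (const_int 0x12345):
     %w0 prints "w0", %x0 prints "x0",
     %H0 prints "x1" (the high half of an x0/x1 pair),
     %X1 prints "0x2345" (bottom 16 bits in hex),
     %P1 prints "7" (number of set bits).  */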
11989 static void
11990 aarch64_print_operand (FILE *f, rtx x, int code)
11992 rtx elt;
11993 switch (code)
11995 case 'c':
11996 if (CONST_INT_P (x))
11997 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11998 else
12000 poly_int64 offset;
12001 rtx base = strip_offset_and_salt (x, &offset);
12002 if (SYMBOL_REF_P (base))
12003 output_addr_const (f, x);
12004 else
12005 output_operand_lossage ("unsupported operand for code '%c'", code);
12007 break;
12009 case 'e':
12011 x = unwrap_const_vec_duplicate (x);
12012 if (!CONST_INT_P (x))
12014 output_operand_lossage ("invalid operand for '%%%c'", code);
12015 return;
12018 HOST_WIDE_INT val = INTVAL (x);
12019 if ((val & ~7) == 8 || val == 0xff)
12020 fputc ('b', f);
12021 else if ((val & ~7) == 16 || val == 0xffff)
12022 fputc ('h', f);
12023 else if ((val & ~7) == 32 || val == 0xffffffff)
12024 fputc ('w', f);
12025 else
12027 output_operand_lossage ("invalid operand for '%%%c'", code);
12028 return;
12031 break;
12033 case 'p':
12035 int n;
12037 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
12039 output_operand_lossage ("invalid operand for '%%%c'", code);
12040 return;
12043 asm_fprintf (f, "%d", n);
12045 break;
12047 case 'P':
12048 if (!CONST_INT_P (x))
12050 output_operand_lossage ("invalid operand for '%%%c'", code);
12051 return;
12054 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
12055 break;
12057 case 'H':
12058 if (x == const0_rtx)
12060 asm_fprintf (f, "xzr");
12061 break;
12064 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
12066 output_operand_lossage ("invalid operand for '%%%c'", code);
12067 return;
12070 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
12071 break;
12073 case 'I':
12075 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
12076 if (CONST_INT_P (x))
12077 asm_fprintf (f, "%wd", INTVAL (x));
12078 else
12080 output_operand_lossage ("invalid operand for '%%%c'", code);
12081 return;
12083 break;
12086 case 'M':
12087 case 'm':
12089 int cond_code;
12090 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12091 if (x == const_true_rtx)
12093 if (code == 'M')
12094 fputs ("nv", f);
12095 return;
12098 if (!COMPARISON_P (x))
12100 output_operand_lossage ("invalid operand for '%%%c'", code);
12101 return;
12104 cond_code = aarch64_get_condition_code (x);
12105 gcc_assert (cond_code >= 0);
12106 if (code == 'M')
12107 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
12108 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
12109 fputs (aarch64_sve_condition_codes[cond_code], f);
12110 else
12111 fputs (aarch64_condition_codes[cond_code], f);
12113 break;
12115 case 'N':
12116 if (!const_vec_duplicate_p (x, &elt))
12118 output_operand_lossage ("invalid vector constant");
12119 return;
12122 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12123 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12124 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12125 && aarch64_print_vector_float_operand (f, x, true))
12127 else
12129 output_operand_lossage ("invalid vector constant");
12130 return;
12132 break;
12134 case 'b':
12135 case 'h':
12136 case 's':
12137 case 'd':
12138 case 'q':
12139 case 'Z':
12140 code = TOLOWER (code);
12141 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12143 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12144 return;
12146 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12147 break;
12149 case 'S':
12150 case 'T':
12151 case 'U':
12152 case 'V':
12153 if (!REG_P (x) || (!FP_REGNUM_P (REGNO (x)) && !PR_REGNUM_P (REGNO (x))))
12155 output_operand_lossage ("incompatible operand for '%%%c'", code);
12156 return;
12158 if (PR_REGNUM_P (REGNO (x)))
12159 asm_fprintf (f, "p%d", REGNO (x) - P0_REGNUM + (code - 'S'));
12160 else
12161 asm_fprintf (f, "%c%d",
12162 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12163 REGNO (x) - V0_REGNUM + (code - 'S'));
12164 break;
12166 case 'R':
12167 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12168 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12169 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12170 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12171 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12172 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12173 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12174 else
12175 output_operand_lossage ("incompatible register operand for '%%%c'",
12176 code);
12177 break;
12179 case 'X':
12180 if (!CONST_INT_P (x))
12182 output_operand_lossage ("invalid operand for '%%%c'", code);
12183 return;
12185 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12186 break;
12188 case 'C':
12190 /* Print a replicated constant in hex. */
12191 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12193 output_operand_lossage ("invalid operand for '%%%c'", code);
12194 return;
12196 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12197 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12199 break;
12201 case 'D':
12203 /* Print a replicated constant in decimal, treating it as
12204 unsigned. */
12205 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12207 output_operand_lossage ("invalid operand for '%%%c'", code);
12208 return;
12210 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12211 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12213 break;
12215 case 'w':
12216 case 'x':
12217 if (aarch64_const_zero_rtx_p (x))
12219 asm_fprintf (f, "%czr", code);
12220 break;
12223 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12225 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12226 break;
12229 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12231 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12232 break;
12235 /* Fall through */
12237 case 0:
12238 if (x == NULL)
12240 output_operand_lossage ("missing operand");
12241 return;
12244 switch (GET_CODE (x))
12246 case CONST_STRING:
12248 asm_fprintf (f, "%s", XSTR (x, 0));
12249 break;
12251 case REG:
12252 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12254 if (REG_NREGS (x) == 1)
12255 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12256 else
12258 char suffix
12259 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12260 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12261 REGNO (x) - V0_REGNUM, suffix,
12262 END_REGNO (x) - V0_REGNUM - 1, suffix);
12265 else
12266 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12267 break;
12269 case MEM:
12270 output_address (GET_MODE (x), XEXP (x, 0));
12271 break;
12273 case LABEL_REF:
12274 case SYMBOL_REF:
12275 output_addr_const (asm_out_file, x);
12276 break;
12278 case CONST_INT:
12279 asm_fprintf (f, "%wd", INTVAL (x));
12280 break;
12282 case CONST:
12283 if (!VECTOR_MODE_P (GET_MODE (x)))
12285 output_addr_const (asm_out_file, x);
12286 break;
12288 /* fall through */
12290 case CONST_VECTOR:
12291 if (!const_vec_duplicate_p (x, &elt))
12293 output_operand_lossage ("invalid vector constant");
12294 return;
12297 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12298 asm_fprintf (f, "%wd", INTVAL (elt));
12299 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12300 && aarch64_print_vector_float_operand (f, x, false))
12302 else
12304 output_operand_lossage ("invalid vector constant");
12305 return;
12307 break;
12309 case CONST_DOUBLE:
12310 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12311 be getting CONST_DOUBLEs holding integers. */
12312 gcc_assert (GET_MODE (x) != VOIDmode);
12313 if (aarch64_float_const_zero_rtx_p (x))
12315 fputc ('0', f);
12316 break;
12318 else if (aarch64_float_const_representable_p (x))
12320 #define buf_size 20
12321 char float_buf[buf_size] = {'\0'};
12322 real_to_decimal_for_mode (float_buf,
12323 CONST_DOUBLE_REAL_VALUE (x),
12324 buf_size, buf_size,
12325 1, GET_MODE (x));
12326 asm_fprintf (asm_out_file, "%s", float_buf);
12327 break;
12328 #undef buf_size
12330 output_operand_lossage ("invalid constant");
12331 return;
12332 default:
12333 output_operand_lossage ("invalid operand");
12334 return;
12336 break;
12338 case 'A':
12339 if (GET_CODE (x) == HIGH)
12340 x = XEXP (x, 0);
12342 switch (aarch64_classify_symbolic_expression (x))
12344 case SYMBOL_SMALL_GOT_4G:
12345 asm_fprintf (asm_out_file, ":got:");
12346 break;
12348 case SYMBOL_SMALL_TLSGD:
12349 asm_fprintf (asm_out_file, ":tlsgd:");
12350 break;
12352 case SYMBOL_SMALL_TLSDESC:
12353 asm_fprintf (asm_out_file, ":tlsdesc:");
12354 break;
12356 case SYMBOL_SMALL_TLSIE:
12357 asm_fprintf (asm_out_file, ":gottprel:");
12358 break;
12360 case SYMBOL_TLSLE24:
12361 asm_fprintf (asm_out_file, ":tprel:");
12362 break;
12364 case SYMBOL_TINY_GOT:
12365 gcc_unreachable ();
12366 break;
12368 default:
12369 break;
12371 output_addr_const (asm_out_file, x);
12372 break;
12374 case 'L':
12375 switch (aarch64_classify_symbolic_expression (x))
12377 case SYMBOL_SMALL_GOT_4G:
12378 asm_fprintf (asm_out_file, ":got_lo12:");
12379 break;
12381 case SYMBOL_SMALL_TLSGD:
12382 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12383 break;
12385 case SYMBOL_SMALL_TLSDESC:
12386 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12387 break;
12389 case SYMBOL_SMALL_TLSIE:
12390 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12391 break;
12393 case SYMBOL_TLSLE12:
12394 asm_fprintf (asm_out_file, ":tprel_lo12:");
12395 break;
12397 case SYMBOL_TLSLE24:
12398 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12399 break;
12401 case SYMBOL_TINY_GOT:
12402 asm_fprintf (asm_out_file, ":got:");
12403 break;
12405 case SYMBOL_TINY_TLSIE:
12406 asm_fprintf (asm_out_file, ":gottprel:");
12407 break;
12409 default:
12410 break;
12412 output_addr_const (asm_out_file, x);
12413 break;
12415 case 'G':
12416 switch (aarch64_classify_symbolic_expression (x))
12418 case SYMBOL_TLSLE24:
12419 asm_fprintf (asm_out_file, ":tprel_hi12:");
12420 break;
12421 default:
12422 break;
12424 output_addr_const (asm_out_file, x);
12425 break;
12427 case 'k':
12429 HOST_WIDE_INT cond_code;
12431 if (!CONST_INT_P (x))
12433 output_operand_lossage ("invalid operand for '%%%c'", code);
12434 return;
12437 cond_code = INTVAL (x);
12438 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12439 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12441 break;
12443 case 'K':
12444 if (!REG_P (x) || !PR_REGNUM_P (REGNO (x)))
12446 output_operand_lossage ("invalid operand for '%%%c'", code);
12447 return;
12449 asm_fprintf (f, "pn%d", REGNO (x) - P0_REGNUM);
12450 break;
12452 case 'y':
12453 case 'z':
12455 machine_mode mode = GET_MODE (x);
12457 if (!MEM_P (x)
12458 || (code == 'y'
12459 && maybe_ne (GET_MODE_SIZE (mode), 8)
12460 && maybe_ne (GET_MODE_SIZE (mode), 16)
12461 && maybe_ne (GET_MODE_SIZE (mode), 32)))
12463 output_operand_lossage ("invalid operand for '%%%c'", code);
12464 return;
12467 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12468 code == 'y'
12469 ? ADDR_QUERY_LDP_STP_N
12470 : ADDR_QUERY_LDP_STP))
12471 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12473 break;
12475 default:
12476 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12477 return;
12481 /* Print address 'x' of a memory access with mode 'mode'.
12482 'type' is the aarch64_addr_query_type context required by
12483 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
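/* For illustration (register numbers and symbol hypothetical), the forms
   emitted below include "[x0]" and "[x0, 16]" for ADDRESS_REG_IMM,
   "[x0, #2, mul vl]" for SVE offsets, "[x0, x1, lsl 3]" for ADDRESS_REG_REG,
   "[x0, w1, sxtw 2]" for extended offsets, "[x0, 16]!" and "[x0], 16" for
   writeback, and "[x0, #:lo12:sym]" for ADDRESS_LO_SUM. */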
12484 static bool
12485 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12486 aarch64_addr_query_type type)
12488 struct aarch64_address_info addr;
12489 unsigned int size, vec_flags;
12491 /* Check all addresses are Pmode - including ILP32. */
12492 if (GET_MODE (x) != Pmode
12493 && (!CONST_INT_P (x)
12494 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12496 output_operand_lossage ("invalid address mode");
12497 return false;
12500 const bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
12501 || type == ADDR_QUERY_LDP_STP_N);
12503 if (aarch64_classify_address (&addr, x, mode, true, type))
12504 switch (addr.type)
12506 case ADDRESS_REG_IMM:
12507 if (known_eq (addr.const_offset, 0))
12509 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12510 return true;
12513 vec_flags = aarch64_classify_vector_mode (mode);
12514 if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
12516 HOST_WIDE_INT vnum
12517 = exact_div (addr.const_offset,
12518 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12519 asm_fprintf (f, "[%s, #%wd, mul vl]",
12520 reg_names[REGNO (addr.base)], vnum);
12521 return true;
12524 if (!CONST_INT_P (addr.offset))
12525 return false;
12527 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12528 INTVAL (addr.offset));
12529 return true;
12531 case ADDRESS_REG_REG:
12532 if (addr.shift == 0)
12533 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12534 reg_names [REGNO (addr.offset)]);
12535 else
12536 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12537 reg_names [REGNO (addr.offset)], addr.shift);
12538 return true;
12540 case ADDRESS_REG_UXTW:
12541 if (addr.shift == 0)
12542 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12543 REGNO (addr.offset) - R0_REGNUM);
12544 else
12545 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12546 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12547 return true;
12549 case ADDRESS_REG_SXTW:
12550 if (addr.shift == 0)
12551 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12552 REGNO (addr.offset) - R0_REGNUM);
12553 else
12554 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12555 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12556 return true;
12558 case ADDRESS_REG_WB:
12559 /* Writeback is only supported for fixed-width modes. */
12560 size = GET_MODE_SIZE (mode).to_constant ();
12561 switch (GET_CODE (x))
12563 case PRE_INC:
12564 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12565 return true;
12566 case POST_INC:
12567 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12568 return true;
12569 case PRE_DEC:
12570 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12571 return true;
12572 case POST_DEC:
12573 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12574 return true;
12575 case PRE_MODIFY:
12576 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12577 INTVAL (addr.offset));
12578 return true;
12579 case POST_MODIFY:
12580 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12581 INTVAL (addr.offset));
12582 return true;
12583 default:
12584 break;
12586 break;
12588 case ADDRESS_LO_SUM:
12589 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12590 output_addr_const (f, addr.offset);
12591 asm_fprintf (f, "]");
12592 return true;
12594 case ADDRESS_SYMBOLIC:
12595 output_addr_const (f, x);
12596 return true;
12599 return false;
12602 /* Print address 'x' of a memory access with mode 'mode'. */
12603 static void
12604 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12606 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12607 output_addr_const (f, x);
12610 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12612 static bool
12613 aarch64_output_addr_const_extra (FILE *file, rtx x)
12615 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12617 output_addr_const (file, XVECEXP (x, 0, 0));
12618 return true;
12620 return false;
12623 bool
12624 aarch64_label_mentioned_p (rtx x)
12626 const char *fmt;
12627 int i;
12629 if (LABEL_REF_P (x))
12630 return true;
12632 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12633 referencing instruction, but they are constant offsets, not
12634 symbols. */
12635 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12636 return false;
12638 fmt = GET_RTX_FORMAT (GET_CODE (x));
12639 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12641 if (fmt[i] == 'E')
12643 int j;
12645 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12646 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12647 return 1;
12649 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12650 return 1;
12653 return 0;
12656 /* Implement REGNO_REG_CLASS. */
12658 enum reg_class
12659 aarch64_regno_regclass (unsigned regno)
12661 if (W8_W11_REGNUM_P (regno))
12662 return W8_W11_REGS;
12664 if (W12_W15_REGNUM_P (regno))
12665 return W12_W15_REGS;
12667 if (STUB_REGNUM_P (regno))
12668 return STUB_REGS;
12670 if (GP_REGNUM_P (regno))
12671 return GENERAL_REGS;
12673 if (regno == SP_REGNUM)
12674 return STACK_REG;
12676 if (regno == FRAME_POINTER_REGNUM
12677 || regno == ARG_POINTER_REGNUM)
12678 return POINTER_REGS;
12680 if (FP_REGNUM_P (regno))
12681 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12682 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12684 if (PR_REGNUM_P (regno))
12685 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12687 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12688 return FFR_REGS;
12690 if (FAKE_REGNUM_P (regno))
12691 return FAKE_REGS;
12693 return NO_REGS;
12696 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12697 If OFFSET is out of range, return an offset of an anchor point
12698 that is in range. Return 0 otherwise. */
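/* For example (illustrative values): a 4-byte access at offset 0x1f3 is not
   a multiple of its size, so the anchor returned is
   (0x1f3 + 0x100) & ~0x1ff = 0x200, leaving a residual offset of -0xd that
   fits the -256...255 range of unscaled addressing. */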
12700 static HOST_WIDE_INT
12701 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12702 machine_mode mode)
12704 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12705 if (size > 16)
12706 return (offset + 0x400) & ~0x7f0;
12708 /* For offsets that aren't a multiple of the access size, the limit is
12709 -256...255. */
12710 if (offset & (size - 1))
12712 /* BLKmode typically uses LDP of X-registers. */
12713 if (mode == BLKmode)
12714 return (offset + 512) & ~0x3ff;
12715 return (offset + 0x100) & ~0x1ff;
12718 /* Small negative offsets are supported. */
12719 if (IN_RANGE (offset, -256, 0))
12720 return 0;
12722 if (mode == TImode || mode == TFmode || mode == TDmode)
12723 return (offset + 0x100) & ~0x1ff;
12725 /* Use a 12-bit offset scaled by the access size. */
12726 return offset & (~0xfff * size);
12729 static rtx
12730 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12732 #if TARGET_PECOFF
12733 rtx tmp = legitimize_pe_coff_symbol (x, true);
12734 if (tmp)
12735 return tmp;
12736 #endif
12738 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12739 where mask is selected by alignment and size of the offset.
12740 We try to pick as large a range for the offset as possible to
12741 maximize the chance of a CSE. However, for aligned addresses
12742 we limit the range to 4k so that structures with different sized
12743 elements are likely to use the same base. We need to be careful
12744 not to split a CONST for some forms of address expression, otherwise
12745 it will generate sub-optimal code. */
12747 /* First split X + CONST (base, offset) into (base + X) + offset. */
12748 if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
12750 poly_int64 offset;
12751 rtx base = strip_offset (XEXP (x, 1), &offset);
12753 base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
12754 NULL_RTX, true, OPTAB_DIRECT);
12755 x = plus_constant (Pmode, base, offset);
12758 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12760 rtx base = XEXP (x, 0);
12761 rtx offset_rtx = XEXP (x, 1);
12762 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12764 if (GET_CODE (base) == PLUS)
12766 rtx op0 = XEXP (base, 0);
12767 rtx op1 = XEXP (base, 1);
12769 /* Force any scaling into a temp for CSE. */
12770 op0 = force_reg (Pmode, op0);
12771 op1 = force_reg (Pmode, op1);
12773 /* Let the pointer register be in op0. */
12774 if (REG_POINTER (op1))
12775 std::swap (op0, op1);
12777 /* If the pointer is virtual or frame related, then we know that
12778 virtual register instantiation or register elimination is going
12779 to apply a second constant. We want the two constants folded
12780 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12781 if (virt_or_elim_regno_p (REGNO (op0)))
12783 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12784 NULL_RTX, true, OPTAB_DIRECT);
12785 return gen_rtx_PLUS (Pmode, base, op1);
12788 /* Otherwise, in order to encourage CSE (and thence loop strength
12789 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12790 base = expand_binop (Pmode, add_optab, op0, op1,
12791 NULL_RTX, true, OPTAB_DIRECT);
12792 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12795 HOST_WIDE_INT size;
12796 if (GET_MODE_SIZE (mode).is_constant (&size))
12798 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12799 mode);
12800 if (base_offset != 0)
12802 base = plus_constant (Pmode, base, base_offset);
12803 base = force_operand (base, NULL_RTX);
12804 return plus_constant (Pmode, base, offset - base_offset);
12809 return x;
12812 static reg_class_t
12813 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12814 reg_class_t rclass,
12815 machine_mode mode,
12816 secondary_reload_info *sri)
12818 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12819 LDR and STR. See the comment at the head of aarch64-sve.md for
12820 more details about the big-endian handling. */
12821 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12822 if (reg_class_subset_p (rclass, FP_REGS)
12823 && !((REG_P (x) && HARD_REGISTER_P (x))
12824 || aarch64_simd_valid_immediate (x, NULL))
12825 && mode != VNx16QImode
12826 && (vec_flags & VEC_SVE_DATA)
12827 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12829 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12830 return NO_REGS;
12833 /* If we have to disable direct literal pool loads and stores because the
12834 function is too big, then we need a scratch register. */
12835 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12836 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12837 || targetm.vector_mode_supported_p (GET_MODE (x)))
12838 && !aarch64_pcrelative_literal_loads)
12840 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12841 return NO_REGS;
12844 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12845 Q register to a Q register directly. We need a scratch. */
12846 if (REG_P (x)
12847 && (mode == TFmode
12848 || mode == TImode
12849 || mode == TDmode
12850 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12851 && mode == GET_MODE (x)
12852 && !TARGET_SIMD
12853 && FP_REGNUM_P (REGNO (x))
12854 && reg_class_subset_p (rclass, FP_REGS))
12856 sri->icode = code_for_aarch64_reload_mov (mode);
12857 return NO_REGS;
12860 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12861 because AArch64 has richer addressing modes for LDR/STR instructions
12862 than LDP/STP instructions. */
12863 if (TARGET_FLOAT && rclass == GENERAL_REGS
12864 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12865 return FP_REGS;
12867 if (rclass == FP_REGS
12868 && (mode == TImode || mode == TFmode || mode == TDmode)
12869 && CONSTANT_P(x))
12870 return GENERAL_REGS;
12872 return NO_REGS;
12875 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12877 static bool
12878 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12879 reg_class_t class2)
12881 if (!TARGET_SIMD
12882 && reg_classes_intersect_p (class1, FP_REGS)
12883 && reg_classes_intersect_p (class2, FP_REGS))
12885 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12886 so we can't easily split a move involving tuples of 128-bit
12887 vectors. Force the copy through memory instead.
12889 (Tuples of 64-bit vectors are fine.) */
12890 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12891 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12892 return true;
12894 return false;
12897 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
12899 static bool
12900 aarch64_frame_pointer_required ()
12902 /* If the function needs to record the incoming value of PSTATE.SM,
12903 make sure that the slot is accessible from the frame pointer. */
12904 return aarch64_need_old_pstate_sm ();
12907 static bool
12908 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12910 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12912 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12913 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12914 if (frame_pointer_needed)
12915 return to == HARD_FRAME_POINTER_REGNUM;
12916 return true;
12919 poly_int64
12920 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12922 aarch64_frame &frame = cfun->machine->frame;
12924 if (to == HARD_FRAME_POINTER_REGNUM)
12926 if (from == ARG_POINTER_REGNUM)
12927 return frame.bytes_above_hard_fp;
12929 if (from == FRAME_POINTER_REGNUM)
12930 return frame.bytes_above_hard_fp - frame.bytes_above_locals;
12933 if (to == STACK_POINTER_REGNUM)
12935 if (from == FRAME_POINTER_REGNUM)
12936 return frame.frame_size - frame.bytes_above_locals;
12939 return frame.frame_size;
12943 /* Get return address without mangling. */
12946 aarch64_return_addr_rtx (void)
12948 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12949 /* Note: aarch64_return_address_signing_enabled only
12950 works after cfun->machine->frame.laid_out is set,
12951 so here we don't know if the return address will
12952 be signed or not. */
12953 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12954 emit_move_insn (lr, val);
12955 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12956 return lr;
12960 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12961 previous frame. */
12964 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12966 if (count != 0)
12967 return const0_rtx;
12968 return aarch64_return_addr_rtx ();
12971 static void
12972 aarch64_asm_trampoline_template (FILE *f)
12974 /* Even if the current function doesn't have branch protection, some
12975 later function might; since this template is only generated once,
12976 we have to add a BTI just in case. */
12977 asm_fprintf (f, "\thint\t34 // bti c\n");
12979 if (TARGET_ILP32)
12981 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12982 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12984 else
12986 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12987 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12989 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12991 /* We always emit a speculation barrier.
12992 This is because the same trampoline template is used for every nested
12993 function. Since nested functions are not particularly common or
12994 performance-critical, we don't worry too much about the extra instructions
12995 to copy around.
12996 This is not yet a problem, since we have not yet implemented function
12997 specific attributes to choose between hardening against straight line
12998 speculation or not, but such function specific attributes are likely to
12999 happen in the future. */
13000 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
13002 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
13003 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
13006 static void
13007 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
13009 rtx fnaddr, mem, a_tramp;
13010 const int tramp_code_sz = 24;
13012 /* Don't need to copy the trailing D-words; we fill those in below. */
13013 /* We create our own memory address in Pmode so that `emit_block_move` can
13014 use parts of the backend which expect Pmode addresses. */
13015 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
13016 emit_block_move (gen_rtx_MEM (BLKmode, temp),
13017 assemble_trampoline_template (),
13018 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
13019 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
13020 fnaddr = XEXP (DECL_RTL (fndecl), 0);
13021 if (GET_MODE (fnaddr) != ptr_mode)
13022 fnaddr = convert_memory_address (ptr_mode, fnaddr);
13023 emit_move_insn (mem, fnaddr);
13025 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
13026 emit_move_insn (mem, chain_value);
13028 /* XXX We should really define a "clear_cache" pattern and use
13029 gen_clear_cache(). */
13030 a_tramp = XEXP (m_tramp, 0);
13031 maybe_emit_call_builtin___clear_cache (a_tramp,
13032 plus_constant (ptr_mode,
13033 a_tramp,
13034 TRAMPOLINE_SIZE));
13037 static unsigned char
13038 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
13040 /* ??? Logically we should only need to provide a value when
13041 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13042 can hold MODE, but at the moment we need to handle all modes.
13043 Just ignore any runtime parts for registers that can't store them. */
13044 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
13045 unsigned int nregs, vec_flags;
13046 switch (regclass)
13048 case W8_W11_REGS:
13049 case W12_W15_REGS:
13050 case STUB_REGS:
13051 case TAILCALL_ADDR_REGS:
13052 case POINTER_REGS:
13053 case GENERAL_REGS:
13054 case ALL_REGS:
13055 case POINTER_AND_FP_REGS:
13056 case FP_REGS:
13057 case FP_LO_REGS:
13058 case FP_LO8_REGS:
13059 vec_flags = aarch64_classify_vector_mode (mode);
13060 if ((vec_flags & VEC_SVE_DATA)
13061 && constant_multiple_p (GET_MODE_SIZE (mode),
13062 aarch64_vl_bytes (mode, vec_flags), &nregs))
13063 return nregs;
13064 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
13065 return GET_MODE_SIZE (mode).to_constant () / 8;
13066 return (vec_flags & VEC_ADVSIMD
13067 ? CEIL (lowest_size, UNITS_PER_VREG)
13068 : CEIL (lowest_size, UNITS_PER_WORD));
13070 case PR_REGS:
13071 case PR_LO_REGS:
13072 case PR_HI_REGS:
13073 return mode == VNx32BImode ? 2 : 1;
13075 case STACK_REG:
13076 case FFR_REGS:
13077 case PR_AND_FFR_REGS:
13078 case FAKE_REGS:
13079 return 1;
13081 case NO_REGS:
13082 return 0;
13084 default:
13085 break;
13087 gcc_unreachable ();
13090 static reg_class_t
13091 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
13093 if (regclass == POINTER_REGS)
13094 return GENERAL_REGS;
13096 if (regclass == STACK_REG)
13098 if (REG_P(x)
13099 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
13100 return regclass;
13102 return NO_REGS;
13105 /* Register elimination can result in a request for
13106 SP+constant->FP_REGS. We cannot support such operations, which
13107 use SP as source and an FP_REG as destination, so reject them
13108 outright. */
13109 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
13111 rtx lhs = XEXP (x, 0);
13113 /* Look through a possible SUBREG introduced by ILP32. */
13114 if (SUBREG_P (lhs))
13115 lhs = SUBREG_REG (lhs);
13117 gcc_assert (REG_P (lhs));
13118 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
13119 POINTER_REGS));
13120 return NO_REGS;
13123 return regclass;
13126 void
13127 aarch64_asm_output_labelref (FILE* f, const char *name)
13129 asm_fprintf (f, "%U%s", name);
13132 static void
13133 aarch64_elf_asm_constructor (rtx symbol, int priority)
13135 if (priority == DEFAULT_INIT_PRIORITY)
13136 default_ctor_section_asm_out_constructor (symbol, priority);
13137 else
13139 section *s;
13140 /* Priority is known to be in the range [0, 65535], so 18 bytes
13141 would be enough, but the compiler might not know that. To avoid
13142 a -Wformat-truncation false positive, use a larger size. */
13143 char buf[23];
13144 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
13145 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13146 switch_to_section (s);
13147 assemble_align (POINTER_SIZE);
13148 assemble_aligned_integer (POINTER_BYTES, symbol);
13152 static void
13153 aarch64_elf_asm_destructor (rtx symbol, int priority)
13155 if (priority == DEFAULT_INIT_PRIORITY)
13156 default_dtor_section_asm_out_destructor (symbol, priority);
13157 else
13159 section *s;
13160 /* Priority is known to be in the range [0, 65535], so 18 bytes
13161 would be enough, but the compiler might not know that. To avoid
13162 a -Wformat-truncation false positive, use a larger size. */
13163 char buf[23];
13164 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
13165 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13166 switch_to_section (s);
13167 assemble_align (POINTER_SIZE);
13168 assemble_aligned_integer (POINTER_BYTES, symbol);
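/* Output the assembly for a casesi jump-table dispatch: load the table entry
   (byte, half or word, chosen from the ADDR_DIFF_VEC element size), form the
   base with ADR, add the scaled entry and branch, followed by an SLS
   speculation barrier when that mitigation is enabled. */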
13172 const char*
13173 aarch64_output_casesi (rtx *operands)
13175 char buf[100];
13176 char label[100];
13177 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
13178 int index;
13179 static const char *const patterns[4][2] =
13182 "ldrb\t%w3, [%0,%w1,uxtw]",
13183 "add\t%3, %4, %w3, sxtb #2"
13186 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13187 "add\t%3, %4, %w3, sxth #2"
13190 "ldr\t%w3, [%0,%w1,uxtw #2]",
13191 "add\t%3, %4, %w3, sxtw #2"
13193 /* We assume that DImode is only generated when not optimizing and
13194 that we don't really need 64-bit address offsets. That would
13195 imply an object file with 8GB of code in a single function! */
13197 "ldr\t%w3, [%0,%w1,uxtw #2]",
13198 "add\t%3, %4, %w3, sxtw #2"
13202 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13204 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13205 index = exact_log2 (GET_MODE_SIZE (mode));
13207 gcc_assert (index >= 0 && index <= 3);
13209 /* Need to implement table size reduction, by changing the code below. */
13210 output_asm_insn (patterns[index][0], operands);
13211 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13212 snprintf (buf, sizeof (buf),
13213 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13214 output_asm_insn (buf, operands);
13215 output_asm_insn (patterns[index][1], operands);
13216 output_asm_insn ("br\t%3", operands);
13217 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13218 operands);
13219 assemble_label (asm_out_file, label);
13220 return "";
13223 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13224 operand is MASK. */
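/* Illustrative mask values, derived from the tile table below: 0 gives
   "zero\t{}", 0xff gives "zero\t{ za }", 0x55 gives "zero\t{ za0.h }" and
   0x99 gives "zero\t{ za0.s, za3.s }". */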
13225 const char *
13226 aarch64_output_sme_zero_za (rtx mask)
13228 auto mask_val = UINTVAL (mask);
13229 if (mask_val == 0)
13230 return "zero\t{}";
13232 if (mask_val == 0xff)
13233 return "zero\t{ za }";
13235 static constexpr struct { unsigned char mask; char letter; } tiles[] = {
13236 { 0xff, 'b' },
13237 { 0x55, 'h' },
13238 { 0x11, 's' },
13239 { 0x01, 'd' }
13241 /* The last entry in the list has the form "za7.d }", but that's the
13242 same length as "za7.d, ". */
13243 static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13244 for (auto &tile : tiles)
13246 unsigned int tile_mask = tile.mask;
13247 unsigned int tile_index = 0;
13248 unsigned int i = snprintf (buffer, sizeof (buffer), "zero\t");
13249 const char *prefix = "{ ";
13250 auto remaining_mask = mask_val;
13251 while (tile_mask < 0x100)
13253 if ((remaining_mask & tile_mask) == tile_mask)
13255 i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
13256 prefix, tile_index, tile.letter);
13257 prefix = ", ";
13258 remaining_mask &= ~tile_mask;
13260 tile_mask <<= 1;
13261 tile_index += 1;
13263 if (remaining_mask == 0)
13265 gcc_assert (i + 3 <= sizeof (buffer));
13266 snprintf (buffer + i, sizeof (buffer) - i, " }");
13267 return buffer;
13270 gcc_unreachable ();
13273 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13274 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13275 operator. */
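/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, since 0x3fc == 0xff << 2:
   the mask keeps exactly the low byte of the pre-shift value, matching a
   UXTB extend combined with a left shift of 2. */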
13278 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13280 if (shift >= 0 && shift <= 4)
13282 int size;
13283 for (size = 8; size <= 32; size *= 2)
13285 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13286 if (mask == bits << shift)
13287 return size;
13290 return 0;
13293 /* Constant pools are per-function only when PC-relative
13294 literal loads are enabled or we are using the large memory
13295 model. */
13297 static inline bool
13298 aarch64_can_use_per_function_literal_pools_p (void)
13300 return (aarch64_pcrelative_literal_loads
13301 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13304 static bool
13305 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13307 /* We can't use blocks for constants when we're using a per-function
13308 constant pool. */
13309 return !aarch64_can_use_per_function_literal_pools_p ();
13312 /* Select appropriate section for constants depending
13313 on where we place literal pools. */
13315 static section *
13316 aarch64_select_rtx_section (machine_mode mode,
13317 rtx x,
13318 unsigned HOST_WIDE_INT align)
13320 if (aarch64_can_use_per_function_literal_pools_p ())
13321 return function_section (current_function_decl);
13323 return default_elf_select_rtx_section (mode, x, align);
13326 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13327 void
13328 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13329 HOST_WIDE_INT offset)
13331 /* When using per-function literal pools, we must ensure that any code
13332 section is aligned to the minimal instruction length, lest we get
13333 errors from the assembler re "unaligned instructions". */
13334 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13335 ASM_OUTPUT_ALIGN (f, 2);
13338 /* Costs. */
13340 /* Helper function for rtx cost calculation. Strip a shift expression
13341 from X. Returns the inner operand if successful, or the original
13342 expression on failure. */
13343 static rtx
13344 aarch64_strip_shift (rtx x)
13346 rtx op = x;
13348 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13349 we can convert both to ROR during final output. */
13350 if ((GET_CODE (op) == ASHIFT
13351 || GET_CODE (op) == ASHIFTRT
13352 || GET_CODE (op) == LSHIFTRT
13353 || GET_CODE (op) == ROTATERT
13354 || GET_CODE (op) == ROTATE)
13355 && CONST_INT_P (XEXP (op, 1)))
13356 return XEXP (op, 0);
13358 if (GET_CODE (op) == MULT
13359 && CONST_INT_P (XEXP (op, 1))
13360 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13361 return XEXP (op, 0);
13363 return x;
13366 /* Helper function for rtx cost calculation. Strip an extend
13367 expression from X. Returns the inner operand if successful, or the
13368 original expression on failure. We deal with a number of possible
13369 canonicalization variations here. If STRIP_SHIFT is true, then
13370 we can strip off a shift also. */
13371 static rtx
13372 aarch64_strip_extend (rtx x, bool strip_shift)
13374 scalar_int_mode mode;
13375 rtx op = x;
13377 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13378 return op;
13380 if (GET_CODE (op) == AND
13381 && GET_CODE (XEXP (op, 0)) == MULT
13382 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13383 && CONST_INT_P (XEXP (op, 1))
13384 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13385 INTVAL (XEXP (op, 1))) != 0)
13386 return XEXP (XEXP (op, 0), 0);
13388 /* Now handle extended register, as this may also have an optional
13389 left shift by 1..4. */
13390 if (strip_shift
13391 && GET_CODE (op) == ASHIFT
13392 && CONST_INT_P (XEXP (op, 1))
13393 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13394 op = XEXP (op, 0);
13396 if (GET_CODE (op) == ZERO_EXTEND
13397 || GET_CODE (op) == SIGN_EXTEND)
13398 op = XEXP (op, 0);
13400 if (op != x)
13401 return op;
13403 return x;
13406 /* Helper function for rtx cost calculation. Strip extension as well as any
13407 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13408 successful, or the original expression on failure. */
13409 static rtx
13410 aarch64_strip_extend_vec_half (rtx x)
13412 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13414 x = XEXP (x, 0);
13415 if (GET_CODE (x) == VEC_SELECT
13416 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13417 XEXP (x, 1)))
13418 x = XEXP (x, 0);
13420 return x;
13423 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13424 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13425 operand if successful, or the original expression on failure. */
13426 static rtx
13427 aarch64_strip_duplicate_vec_elt (rtx x)
13429 if (GET_CODE (x) == VEC_DUPLICATE
13430 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13432 x = XEXP (x, 0);
13433 if (GET_CODE (x) == VEC_SELECT)
13434 x = XEXP (x, 0);
13435 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13436 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13437 x = XEXP (XEXP (x, 0), 0);
13439 return x;
13442 /* Return true iff CODE is a shift supported in combination
13443 with arithmetic instructions. */
13445 static bool
13446 aarch64_shift_p (enum rtx_code code)
13448 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13452 /* Return true iff X is a cheap shift without a sign extend. */
13454 static bool
13455 aarch64_cheap_mult_shift_p (rtx x)
13457 rtx op0, op1;
13459 op0 = XEXP (x, 0);
13460 op1 = XEXP (x, 1);
13462 if (!(aarch64_tune_params.extra_tuning_flags
13463 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13464 return false;
13466 if (GET_CODE (op0) == SIGN_EXTEND)
13467 return false;
13469 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13470 && UINTVAL (op1) <= 4)
13471 return true;
13473 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13474 return false;
13476 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13478 if (l2 > 0 && l2 <= 4)
13479 return true;
13481 return false;
13484 /* Helper function for rtx cost calculation. Calculate the cost of
13485 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13486 Return the calculated cost of the expression, recursing manually in to
13487 operands where needed. */
13489 static int
13490 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13492 rtx op0, op1;
13493 const struct cpu_cost_table *extra_cost
13494 = aarch64_tune_params.insn_extra_cost;
13495 int cost = 0;
13496 bool compound_p = (outer == PLUS || outer == MINUS);
13497 machine_mode mode = GET_MODE (x);
13499 gcc_checking_assert (code == MULT);
13501 op0 = XEXP (x, 0);
13502 op1 = XEXP (x, 1);
13504 if (VECTOR_MODE_P (mode))
13506 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13507 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13509 /* The select-operand-high-half versions of the instruction have the
13510 same cost as the three vector version - don't add the costs of the
13511 extension or selection into the costs of the multiply. */
13512 op0 = aarch64_strip_extend_vec_half (op0);
13513 op1 = aarch64_strip_extend_vec_half (op1);
13514 /* The by-element versions of the instruction have the same costs as
13515 the normal 3-vector version. We make an assumption that the input
13516 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13517 costing of a MUL by element pre RA is a bit optimistic. */
13518 op0 = aarch64_strip_duplicate_vec_elt (op0);
13519 op1 = aarch64_strip_duplicate_vec_elt (op1);
13521 cost += rtx_cost (op0, mode, MULT, 0, speed);
13522 cost += rtx_cost (op1, mode, MULT, 1, speed);
13523 if (speed)
13525 if (GET_CODE (x) == MULT)
13526 cost += extra_cost->vect.mult;
13527 /* This is to catch the SSRA costing currently flowing here. */
13528 else
13529 cost += extra_cost->vect.alu;
13531 return cost;
13534 /* Integer multiply/fma. */
13535 if (GET_MODE_CLASS (mode) == MODE_INT)
13537 /* The multiply will be canonicalized as a shift, cost it as such. */
13538 if (aarch64_shift_p (GET_CODE (x))
13539 || (CONST_INT_P (op1)
13540 && exact_log2 (INTVAL (op1)) > 0))
13542 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13543 || GET_CODE (op0) == SIGN_EXTEND;
13544 if (speed)
13546 if (compound_p)
13548 /* If the shift is considered cheap,
13549 then don't add any cost. */
13550 if (aarch64_cheap_mult_shift_p (x))
13552 else if (REG_P (op1))
13553 /* ARITH + shift-by-register. */
13554 cost += extra_cost->alu.arith_shift_reg;
13555 else if (is_extend)
13556 /* ARITH + extended register. We don't have a cost field
13557 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13558 cost += extra_cost->alu.extend_arith;
13559 else
13560 /* ARITH + shift-by-immediate. */
13561 cost += extra_cost->alu.arith_shift;
13563 else
13564 /* LSL (immediate). */
13565 cost += extra_cost->alu.shift;
13568 /* Strip extends as we will have costed them in the case above. */
13569 if (is_extend)
13570 op0 = aarch64_strip_extend (op0, true);
13572 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13574 return cost;
13577 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13578 compound and let the below cases handle it. After all, MNEG is a
13579 special-case alias of MSUB. */
13580 if (GET_CODE (op0) == NEG)
13582 op0 = XEXP (op0, 0);
13583 compound_p = true;
13586 /* Integer multiplies or FMAs have zero/sign extending variants. */
13587 if ((GET_CODE (op0) == ZERO_EXTEND
13588 && GET_CODE (op1) == ZERO_EXTEND)
13589 || (GET_CODE (op0) == SIGN_EXTEND
13590 && GET_CODE (op1) == SIGN_EXTEND))
13592 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13593 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13595 if (speed)
13597 if (compound_p)
13598 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13599 cost += extra_cost->mult[0].extend_add;
13600 else
13601 /* MUL/SMULL/UMULL. */
13602 cost += extra_cost->mult[0].extend;
13605 return cost;
13608 /* This is either an integer multiply or a MADD. In both cases
13609 we want to recurse and cost the operands. */
13610 cost += rtx_cost (op0, mode, MULT, 0, speed);
13611 cost += rtx_cost (op1, mode, MULT, 1, speed);
13613 if (speed)
13615 if (compound_p)
13616 /* MADD/MSUB. */
13617 cost += extra_cost->mult[mode == DImode].add;
13618 else
13619 /* MUL. */
13620 cost += extra_cost->mult[mode == DImode].simple;
13623 return cost;
13625 else
13627 if (speed)
13629 /* Floating-point FMA/FMUL can also support negations of the
13630 operands, unless the rounding mode is upward or downward in
13631 which case FNMUL is different than FMUL with operand negation. */
13632 bool neg0 = GET_CODE (op0) == NEG;
13633 bool neg1 = GET_CODE (op1) == NEG;
13634 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13636 if (neg0)
13637 op0 = XEXP (op0, 0);
13638 if (neg1)
13639 op1 = XEXP (op1, 0);
13642 if (compound_p)
13643 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13644 cost += extra_cost->fp[mode == DFmode].fma;
13645 else
13646 /* FMUL/FNMUL. */
13647 cost += extra_cost->fp[mode == DFmode].mult;
13650 cost += rtx_cost (op0, mode, MULT, 0, speed);
13651 cost += rtx_cost (op1, mode, MULT, 1, speed);
13652 return cost;
13656 static int
13657 aarch64_address_cost (rtx x,
13658 machine_mode mode,
13659 addr_space_t as ATTRIBUTE_UNUSED,
13660 bool speed)
13662 enum rtx_code c = GET_CODE (x);
13663 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13664 struct aarch64_address_info info;
13665 int cost = 0;
13666 info.shift = 0;
13668 if (!aarch64_classify_address (&info, x, mode, false))
13670 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13672 /* This is a CONST or SYMBOL ref which will be split
13673 in a different way depending on the code model in use.
13674 Cost it through the generic infrastructure. */
13675 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13676 /* Divide through by the cost of one instruction to
13677 bring it to the same units as the address costs. */
13678 cost_symbol_ref /= COSTS_N_INSNS (1);
13679 /* The cost is then the cost of preparing the address,
13680 followed by an immediate (possibly 0) offset. */
13681 return cost_symbol_ref + addr_cost->imm_offset;
13683 else
13685 /* This is most likely a jump table from a case
13686 statement. */
13687 return addr_cost->register_offset;
13691 switch (info.type)
13693 case ADDRESS_LO_SUM:
13694 case ADDRESS_SYMBOLIC:
13695 case ADDRESS_REG_IMM:
13696 cost += addr_cost->imm_offset;
13697 break;
13699 case ADDRESS_REG_WB:
13700 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13701 cost += addr_cost->pre_modify;
13702 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13704 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13705 if (nvectors == 3)
13706 cost += addr_cost->post_modify_ld3_st3;
13707 else if (nvectors == 4)
13708 cost += addr_cost->post_modify_ld4_st4;
13709 else
13710 cost += addr_cost->post_modify;
13712 else
13713 gcc_unreachable ();
13715 break;
13717 case ADDRESS_REG_REG:
13718 cost += addr_cost->register_offset;
13719 break;
13721 case ADDRESS_REG_SXTW:
13722 cost += addr_cost->register_sextend;
13723 break;
13725 case ADDRESS_REG_UXTW:
13726 cost += addr_cost->register_zextend;
13727 break;
13729 default:
13730 gcc_unreachable ();
13734 if (info.shift > 0)
13736 /* For the sake of calculating the cost of the shifted register
13737 component, we can treat same sized modes in the same way. */
13738 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13739 cost += addr_cost->addr_scale_costs.hi;
13740 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13741 cost += addr_cost->addr_scale_costs.si;
13742 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13743 cost += addr_cost->addr_scale_costs.di;
13744 else
13745 /* We can't tell, or this is a 128-bit vector. */
13746 cost += addr_cost->addr_scale_costs.ti;
13749 return cost;
13752 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13753 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13754 to be taken. */
13757 aarch64_branch_cost (bool speed_p, bool predictable_p)
13759 /* When optimizing for speed, use the cost of unpredictable branches. */
13760 const struct cpu_branch_cost *branch_costs =
13761 aarch64_tune_params.branch_costs;
13763 if (!speed_p || predictable_p)
13764 return branch_costs->predictable;
13765 else
13766 return branch_costs->unpredictable;
13769 /* Return true if X is a zero or sign extract
13770 usable in an ADD or SUB (extended register) instruction. */
13771 static bool
13772 aarch64_rtx_arith_op_extract_p (rtx x)
13774 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13775 No shift. */
13776 if (GET_CODE (x) == SIGN_EXTEND
13777 || GET_CODE (x) == ZERO_EXTEND)
13778 return REG_P (XEXP (x, 0));
13780 return false;
13783 static bool
13784 aarch64_frint_unspec_p (unsigned int u)
13786 switch (u)
13788 case UNSPEC_FRINTZ:
13789 case UNSPEC_FRINTP:
13790 case UNSPEC_FRINTM:
13791 case UNSPEC_FRINTA:
13792 case UNSPEC_FRINTN:
13793 case UNSPEC_FRINTX:
13794 case UNSPEC_FRINTI:
13795 return true;
13797 default:
13798 return false;
13802 /* Return true iff X is an rtx that will match an extr instruction
13803 i.e. as described in the *extr<mode>5_insn family of patterns.
13804 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
13805 on success and will be NULL_RTX otherwise. */
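/* For instance, in DImode with illustrative operands X and Y,
   (ior (ashift X (const_int 48)) (lshiftrt Y (const_int 16))) passes the
   check because 48 + 16 == 64; *RES_OP0 is set to X and *RES_OP1 to Y. */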
13807 static bool
13808 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13810 rtx op0, op1;
13811 scalar_int_mode mode;
13812 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13813 return false;
13815 *res_op0 = NULL_RTX;
13816 *res_op1 = NULL_RTX;
13818 if (GET_CODE (x) != IOR)
13819 return false;
13821 op0 = XEXP (x, 0);
13822 op1 = XEXP (x, 1);
13824 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13825 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13827 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13828 if (GET_CODE (op1) == ASHIFT)
13829 std::swap (op0, op1);
13831 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13832 return false;
13834 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13835 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13837 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13838 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13840 *res_op0 = XEXP (op0, 0);
13841 *res_op1 = XEXP (op1, 0);
13842 return true;
13846 return false;
13849 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13850 storing it in *COST. Result is true if the total cost of the operation
13851 has now been calculated. */
13852 static bool
13853 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13855 rtx inner;
13856 rtx comparator;
13857 enum rtx_code cmpcode;
13858 const struct cpu_cost_table *extra_cost
13859 = aarch64_tune_params.insn_extra_cost;
13861 if (COMPARISON_P (op0))
13863 inner = XEXP (op0, 0);
13864 comparator = XEXP (op0, 1);
13865 cmpcode = GET_CODE (op0);
13867 else
13869 inner = op0;
13870 comparator = const0_rtx;
13871 cmpcode = NE;
13874 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13876 /* Conditional branch. */
13877 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13878 return true;
13879 else
13881 if (cmpcode == NE || cmpcode == EQ)
13883 if (comparator == const0_rtx)
13885 /* TBZ/TBNZ/CBZ/CBNZ. */
13886 if (GET_CODE (inner) == ZERO_EXTRACT)
13887 /* TBZ/TBNZ. */
13888 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13889 ZERO_EXTRACT, 0, speed);
13890 else
13891 /* CBZ/CBNZ. */
13892 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13894 return true;
13896 if (register_operand (inner, VOIDmode)
13897 && aarch64_imm24 (comparator, VOIDmode))
13899 /* SUB and SUBS. */
13900 *cost += COSTS_N_INSNS (2);
13901 if (speed)
13902 *cost += extra_cost->alu.arith * 2;
13903 return true;
13906 else if (cmpcode == LT || cmpcode == GE)
13908 /* TBZ/TBNZ. */
13909 if (comparator == const0_rtx)
13910 return true;
13914 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13916 /* CCMP. */
13917 if (GET_CODE (op1) == COMPARE)
13919 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13920 if (XEXP (op1, 1) == const0_rtx)
13921 *cost += 1;
13922 if (speed)
13924 machine_mode mode = GET_MODE (XEXP (op1, 0));
13926 if (GET_MODE_CLASS (mode) == MODE_INT)
13927 *cost += extra_cost->alu.arith;
13928 else
13929 *cost += extra_cost->fp[mode == DFmode].compare;
13931 return true;
13934 /* It's a conditional operation based on the status flags,
13935 so it must be some flavor of CSEL. */
13937 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13938 if (GET_CODE (op1) == NEG
13939 || GET_CODE (op1) == NOT
13940 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13941 op1 = XEXP (op1, 0);
13942 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13944 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13945 op1 = XEXP (op1, 0);
13946 op2 = XEXP (op2, 0);
13948 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13950 inner = XEXP (op1, 0);
13951 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13952 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13953 op1 = XEXP (inner, 0);
13955 else if (op1 == constm1_rtx || op1 == const1_rtx)
13957 /* Use CSINV or CSINC. */
13958 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13959 return true;
13961 else if (op2 == constm1_rtx || op2 == const1_rtx)
13963 /* Use CSINV or CSINC. */
13964 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13965 return true;
13968 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13969 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13970 return true;
13973 /* We don't know what this is; cost all operands. */
13974 return false;
13977 /* Check whether X is a bitfield operation of the form shift + extend that
13978 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13979 operand to which the bitfield operation is applied. Otherwise return
13980 NULL_RTX. */
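/* For example, (sign_extend:DI (ashift:HI R (const_int 3))) returns R here:
   a constant shift of a QImode or HImode value widened to SImode or DImode
   can be implemented with a single SBFIZ/UBFIZ. */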
13982 static rtx
13983 aarch64_extend_bitfield_pattern_p (rtx x)
13985 rtx_code outer_code = GET_CODE (x);
13986 machine_mode outer_mode = GET_MODE (x);
13988 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13989 && outer_mode != SImode && outer_mode != DImode)
13990 return NULL_RTX;
13992 rtx inner = XEXP (x, 0);
13993 rtx_code inner_code = GET_CODE (inner);
13994 machine_mode inner_mode = GET_MODE (inner);
13995 rtx op = NULL_RTX;
13997 switch (inner_code)
13999 case ASHIFT:
14000 if (CONST_INT_P (XEXP (inner, 1))
14001 && (inner_mode == QImode || inner_mode == HImode))
14002 op = XEXP (inner, 0);
14003 break;
14004 case LSHIFTRT:
14005 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
14006 && (inner_mode == QImode || inner_mode == HImode))
14007 op = XEXP (inner, 0);
14008 break;
14009 case ASHIFTRT:
14010 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
14011 && (inner_mode == QImode || inner_mode == HImode))
14012 op = XEXP (inner, 0);
14013 break;
14014 default:
14015 break;
14018 return op;
14021 /* Return true if the mask and a shift amount from an RTX of the form
14022 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
14023 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
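/* For example, in SImode, MASK = 0xf0 with SHFT_AMNT = 4 is accepted: the
   mask shifted right by 4 is the contiguous value 0xf and no mask bits lie
   below the shift amount, so (x << 4) & 0xf0 can become a single UBFIZ of
   width 4. */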
14025 bool
14026 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
14027 rtx shft_amnt)
14029 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
14030 && INTVAL (mask) > 0
14031 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
14032 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
14033 && (UINTVAL (mask)
14034 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
14037 /* Return true if the masks and a shift amount from an RTX of the form
14038 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
14039 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
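/* For example, MASK2 = 0xff00 with SHFT_AMNT = 8 and MASK1 = ~0xff00 is
   accepted: the masks are complementary and 0xff00 + (1 << 8) = 0x10000 is a
   power of two, so the shifted field is the contiguous byte at bits 8-15 and
   a BFI with lsb 8 and width 8 applies. */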
14041 bool
14042 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
14043 unsigned HOST_WIDE_INT mask1,
14044 unsigned HOST_WIDE_INT shft_amnt,
14045 unsigned HOST_WIDE_INT mask2)
14047 unsigned HOST_WIDE_INT t;
14049 /* Verify that there is no overlap in what bits are set in the two masks. */
14050 if (mask1 != ~mask2)
14051 return false;
14053 /* Verify that mask2 is not all zeros or ones. */
14054 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
14055 return false;
14057 /* The shift amount should always be less than the mode size. */
14058 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
14060 /* Verify that the mask being shifted is contiguous and would be in the
14061 least significant bits after shifting by shft_amnt. */
14062 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
14063 return (t == (t & -t));
14066 /* Return true if X is an RTX representing an operation in the ABD family
14067 of instructions. */
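/* That is, X has the form (minus (smax a b) (smin a b)) or its unsigned
   UMAX/UMIN counterpart, which computes the absolute difference |a - b|. */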
14069 static bool
14070 aarch64_abd_rtx_p (rtx x)
14072 if (GET_CODE (x) != MINUS)
14073 return false;
14074 rtx max_arm = XEXP (x, 0);
14075 rtx min_arm = XEXP (x, 1);
14076 if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
14077 return false;
14078 bool signed_p = GET_CODE (max_arm) == SMAX;
14079 if (signed_p && GET_CODE (min_arm) != SMIN)
14080 return false;
14081 else if (!signed_p && GET_CODE (min_arm) != UMIN)
14082 return false;
14084 rtx maxop0 = XEXP (max_arm, 0);
14085 rtx maxop1 = XEXP (max_arm, 1);
14086 rtx minop0 = XEXP (min_arm, 0);
14087 rtx minop1 = XEXP (min_arm, 1);
14088 return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
14091 /* Calculate the cost of calculating X, storing it in *COST. Result
14092 is true if the total cost of the operation has now been calculated. */
14093 static bool
14094 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
14095 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
14097 rtx op0, op1, op2;
14098 const struct cpu_cost_table *extra_cost
14099 = aarch64_tune_params.insn_extra_cost;
14100 rtx_code code = GET_CODE (x);
14101 scalar_int_mode int_mode;
14103 /* By default, assume that everything has equivalent cost to the
14104 cheapest instruction. Any additional costs are applied as a delta
14105 above this default. */
14106 *cost = COSTS_N_INSNS (1);
14108 switch (code)
14110 case SET:
14111 /* The cost depends entirely on the operands to SET. */
14112 *cost = 0;
14113 op0 = SET_DEST (x);
14114 op1 = SET_SRC (x);
14116 switch (GET_CODE (op0))
14118 case MEM:
14119 if (speed)
14121 rtx address = XEXP (op0, 0);
14122 if (VECTOR_MODE_P (mode))
14123 *cost += extra_cost->ldst.storev;
14124 else if (GET_MODE_CLASS (mode) == MODE_INT)
14125 *cost += extra_cost->ldst.store;
14126 else if (mode == SFmode || mode == SDmode)
14127 *cost += extra_cost->ldst.storef;
14128 else if (mode == DFmode || mode == DDmode)
14129 *cost += extra_cost->ldst.stored;
14131 *cost +=
14132 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14133 0, speed));
14136 *cost += rtx_cost (op1, mode, SET, 1, speed);
14137 return true;
14139 case SUBREG:
14140 if (! REG_P (SUBREG_REG (op0)))
14141 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
14143 /* Fall through. */
14144 case REG:
14145 /* The cost is one per vector-register copied. */
14146 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
14148 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
14149 *cost = COSTS_N_INSNS (nregs);
14151 /* const0_rtx is in general free, but we will use an
14152 instruction to set a register to 0. */
14153 else if (REG_P (op1) || op1 == const0_rtx)
14155 /* The cost is 1 per register copied. */
14156 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
14157 *cost = COSTS_N_INSNS (nregs);
14159 else
14160 /* Cost is just the cost of the RHS of the set. */
14161 *cost += rtx_cost (op1, mode, SET, 1, speed);
14162 return true;
14164 case ZERO_EXTRACT:
14165 case SIGN_EXTRACT:
14166 /* Bit-field insertion. Strip any redundant widening of
14167 the RHS to meet the width of the target. */
14168 if (SUBREG_P (op1))
14169 op1 = SUBREG_REG (op1);
14170 if ((GET_CODE (op1) == ZERO_EXTEND
14171 || GET_CODE (op1) == SIGN_EXTEND)
14172 && CONST_INT_P (XEXP (op0, 1))
14173 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
14174 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
14175 op1 = XEXP (op1, 0);
14177 if (CONST_INT_P (op1))
14179 /* MOV immediate is assumed to always be cheap. */
14180 *cost = COSTS_N_INSNS (1);
14182 else
14184 /* BFM. */
14185 if (speed)
14186 *cost += extra_cost->alu.bfi;
14187 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
14190 return true;
14192 default:
14193 /* We can't make sense of this, assume default cost. */
14194 *cost = COSTS_N_INSNS (1);
14195 return false;
14197 return false;
14199 case CONST_INT:
14200 /* If an instruction can incorporate a constant within the
14201 instruction, the instruction's expression avoids calling
14202 rtx_cost() on the constant. If rtx_cost() is called on a
14203 constant, then it is usually because the constant must be
14204 moved into a register by one or more instructions.
14206 The exception is constant 0, which can be expressed
14207 as XZR/WZR and is therefore free. The exception to this is
14208 if we have (set (reg) (const0_rtx)) in which case we must cost
14209 the move. However, we can catch that when we cost the SET, so
14210 we don't need to consider that here. */
14211 if (x == const0_rtx)
14212 *cost = 0;
14213 else
14215 /* To an approximation, building any other constant is
14216 proportionally expensive to the number of instructions
14217 required to build that constant. This is true whether we
14218 are compiling for SPEED or otherwise. */
14219 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
14220 ? SImode : DImode;
14221 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
14222 (NULL_RTX, x, false, imode));
14224 return true;
14226 case CONST_DOUBLE:
14228 /* First determine number of instructions to do the move
14229 as an integer constant. */
14230 if (!aarch64_float_const_representable_p (x)
14231 && !aarch64_can_const_movi_rtx_p (x, mode)
14232 && aarch64_float_const_rtx_p (x))
14234 unsigned HOST_WIDE_INT ival;
14235 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
14236 gcc_assert (succeed);
14238 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
14239 ? DImode : SImode;
14240 int ncost = aarch64_internal_mov_immediate
14241 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
14242 *cost += COSTS_N_INSNS (ncost);
14243 return true;
14246 if (speed)
14248 /* mov[df,sf]_aarch64. */
14249 if (aarch64_float_const_representable_p (x))
14250 /* FMOV (scalar immediate). */
14251 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
14252 else if (!aarch64_float_const_zero_rtx_p (x))
14254 /* This will be a load from memory. */
14255 if (mode == DFmode || mode == DDmode)
14256 *cost += extra_cost->ldst.loadd;
14257 else
14258 *cost += extra_cost->ldst.loadf;
14260 else
14261 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14262 or MOV v0.s[0], wzr - neither of which are modeled by the
14263 cost tables. Just use the default cost. */
14268 return true;
14270 case MEM:
14271 if (speed)
14273 /* For loads we want the base cost of a load, plus an
14274 approximation for the additional cost of the addressing
14275 mode. */
14276 rtx address = XEXP (x, 0);
14277 if (VECTOR_MODE_P (mode))
14278 *cost += extra_cost->ldst.loadv;
14279 else if (GET_MODE_CLASS (mode) == MODE_INT)
14280 *cost += extra_cost->ldst.load;
14281 else if (mode == SFmode || mode == SDmode)
14282 *cost += extra_cost->ldst.loadf;
14283 else if (mode == DFmode || mode == DDmode)
14284 *cost += extra_cost->ldst.loadd;
14286 *cost +=
14287 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14288 0, speed));
14291 return true;
14293 case NEG:
14294 op0 = XEXP (x, 0);
14296 if (VECTOR_MODE_P (mode))
14298 /* Many vector comparison operations are represented as NEG
14299 of a comparison. */
14300 if (COMPARISON_P (op0))
14302 rtx op00 = XEXP (op0, 0);
14303 rtx op01 = XEXP (op0, 1);
14304 machine_mode inner_mode = GET_MODE (op00);
14305 /* FACGE/FACGT. */
14306 if (GET_MODE_CLASS (inner_mode) == MODE_VECTOR_FLOAT
14307 && GET_CODE (op00) == ABS
14308 && GET_CODE (op01) == ABS)
14310 op00 = XEXP (op00, 0);
14311 op01 = XEXP (op01, 0);
14313 *cost += rtx_cost (op00, inner_mode, GET_CODE (op0), 0, speed);
14314 *cost += rtx_cost (op01, inner_mode, GET_CODE (op0), 1, speed);
14315 if (speed)
14316 *cost += extra_cost->vect.alu;
14317 return true;
14319 if (speed)
14321 /* FNEG. */
14322 *cost += extra_cost->vect.alu;
14324 return false;
14327 if (GET_MODE_CLASS (mode) == MODE_INT)
14329 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14330 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14332 /* CSETM. */
14333 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14334 return true;
14337 /* Cost this as SUB wzr, X. */
14338 op0 = CONST0_RTX (mode);
14339 op1 = XEXP (x, 0);
14340 goto cost_minus;
14343 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14345 /* Support (neg(fma...)) as a single instruction only if
14346 sign of zeros is unimportant. This matches the decision
14347 making in aarch64.md. */
14348 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14350 /* FNMADD. */
14351 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14352 return true;
14354 if (GET_CODE (op0) == MULT)
14356 /* FNMUL. */
14357 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14358 return true;
14360 if (speed)
14361 /* FNEG. */
14362 *cost += extra_cost->fp[mode == DFmode].neg;
14363 return false;
14366 return false;
14368 case CLRSB:
14369 case CLZ:
14370 if (speed)
14372 if (VECTOR_MODE_P (mode))
14373 *cost += extra_cost->vect.alu;
14374 else
14375 *cost += extra_cost->alu.clz;
14378 return false;
14380 case CTZ:
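/* Costing sketch: without FEAT_CSSC a scalar CTZ is synthesised as
   RBIT followed by CLZ (two instructions), with CSSC it is a single
   CTZ instruction, and a vector CTZ takes roughly three vector ops;
   the constants below mirror that.  */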
14381 if (VECTOR_MODE_P (mode))
14383 *cost = COSTS_N_INSNS (3);
14384 if (speed)
14385 *cost += extra_cost->vect.alu * 3;
14387 else if (TARGET_CSSC)
14389 *cost = COSTS_N_INSNS (1);
14390 if (speed)
14391 *cost += extra_cost->alu.clz;
14393 else
14395 *cost = COSTS_N_INSNS (2);
14396 if (speed)
14397 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14399 return false;
14401 case COMPARE:
14402 op0 = XEXP (x, 0);
14403 op1 = XEXP (x, 1);
14405 if (op1 == const0_rtx
14406 && GET_CODE (op0) == AND)
14408 x = op0;
14409 mode = GET_MODE (op0);
14410 goto cost_logic;
14413 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14415 /* TODO: A write to the CC flags possibly costs extra; this
14416 needs encoding in the cost tables. */
14418 mode = GET_MODE (op0);
14419 /* ANDS. */
14420 if (GET_CODE (op0) == AND)
14422 x = op0;
14423 goto cost_logic;
14426 if (GET_CODE (op0) == PLUS)
14428 /* ADDS (and CMN alias). */
14429 x = op0;
14430 goto cost_plus;
14433 if (GET_CODE (op0) == MINUS)
14435 /* SUBS. */
14436 x = op0;
14437 goto cost_minus;
14440 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14441 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14442 && CONST_INT_P (XEXP (op0, 2)))
14444 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14445 Handle it here directly rather than going to cost_logic
14446 since we know the immediate generated for the TST is valid
14447 so we can avoid creating an intermediate rtx for it only
14448 for costing purposes. */
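/* For example (shape shown for illustration only):
     (compare (zero_extract (reg:DI x) (const_int 8) (const_int 0))
              (const_int 0))
   corresponds to a TST x, #0xff, so we charge one logical operation
   plus the cost of the extracted register.  */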
14449 if (speed)
14450 *cost += extra_cost->alu.logical;
14452 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14453 ZERO_EXTRACT, 0, speed);
14454 return true;
14457 if (GET_CODE (op1) == NEG)
14459 /* CMN. */
14460 if (speed)
14461 *cost += extra_cost->alu.arith;
14463 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14464 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14465 return true;
14468 /* CMP.
14470 Compare can freely swap the order of operands, and
14471 canonicalization puts the more complex operation first.
14472 But the integer MINUS logic expects the shift/extend
14473 operation in op1. */
14474 if (! (REG_P (op0)
14475 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14477 op0 = XEXP (x, 1);
14478 op1 = XEXP (x, 0);
14480 goto cost_minus;
14483 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14485 /* FCMP. */
14486 if (speed)
14487 *cost += extra_cost->fp[mode == DFmode].compare;
14489 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14491 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14492 /* FCMP supports constant 0.0 for no extra cost. */
14493 return true;
14495 return false;
14498 if (VECTOR_MODE_P (mode))
14500 /* Vector compare. */
14501 if (speed)
14502 *cost += extra_cost->vect.alu;
14504 if (aarch64_float_const_zero_rtx_p (op1))
14506 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14507 cost. */
14508 return true;
14510 return false;
14512 return false;
14514 case MINUS:
14516 op0 = XEXP (x, 0);
14517 op1 = XEXP (x, 1);
14519 cost_minus:
14520 if (VECTOR_MODE_P (mode))
14522 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14523 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14525 /* Recognise the SABD and UABD operation here.
14526 Recursion from the PLUS case will catch the accumulating
14527 forms. */
14528 if (aarch64_abd_rtx_p (x))
14530 if (speed)
14531 *cost += extra_cost->vect.alu;
14532 return true;
14534 /* SUBL2 and SUBW2.
14535 The select-operand-high-half versions of the sub instruction
14536 have the same cost as the regular three vector version -
14537 don't add the costs of the select into the costs of the sub. */
14539 op0 = aarch64_strip_extend_vec_half (op0);
14540 op1 = aarch64_strip_extend_vec_half (op1);
14544 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14546 /* Detect valid immediates. */
14547 if ((GET_MODE_CLASS (mode) == MODE_INT
14548 || (GET_MODE_CLASS (mode) == MODE_CC
14549 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14550 && CONST_INT_P (op1)
14551 && aarch64_uimm12_shift (INTVAL (op1)))
14553 if (speed)
14554 /* SUB(S) (immediate). */
14555 *cost += extra_cost->alu.arith;
14556 return true;
14559 /* Look for SUB (extended register). */
14560 if (is_a <scalar_int_mode> (mode)
14561 && aarch64_rtx_arith_op_extract_p (op1))
14563 if (speed)
14564 *cost += extra_cost->alu.extend_arith;
14566 op1 = aarch64_strip_extend (op1, true);
14567 *cost += rtx_cost (op1, VOIDmode,
14568 (enum rtx_code) GET_CODE (op1), 0, speed);
14569 return true;
14572 rtx new_op1 = aarch64_strip_extend (op1, false);
14574 /* Cost this as an FMA-alike operation. */
14575 if ((GET_CODE (new_op1) == MULT
14576 || aarch64_shift_p (GET_CODE (new_op1)))
14577 && code != COMPARE)
14579 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14580 (enum rtx_code) code,
14581 speed);
14582 return true;
14585 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14587 if (speed)
14589 if (VECTOR_MODE_P (mode))
14591 /* Vector SUB. */
14592 *cost += extra_cost->vect.alu;
14594 else if (GET_MODE_CLASS (mode) == MODE_INT)
14596 /* SUB(S). */
14597 *cost += extra_cost->alu.arith;
14599 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14601 /* FSUB. */
14602 *cost += extra_cost->fp[mode == DFmode].addsub;
14605 return true;
14608 case PLUS:
14610 rtx new_op0;
14612 op0 = XEXP (x, 0);
14613 op1 = XEXP (x, 1);
14615 cost_plus:
14616 if (VECTOR_MODE_P (mode))
14618 /* ADDL2 and ADDW2. */
14619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14620 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14622 /* The select-operand-high-half versions of the add instruction
14623 have the same cost as the regular three vector version -
14624 don't add the costs of the select into the costs of the add. */
14626 op0 = aarch64_strip_extend_vec_half (op0);
14627 op1 = aarch64_strip_extend_vec_half (op1);
14631 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14632 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14634 /* CSINC. */
14635 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14636 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14637 return true;
14640 if (GET_MODE_CLASS (mode) == MODE_INT
14641 && (aarch64_plus_immediate (op1, mode)
14642 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14644 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14646 if (speed)
14648 /* ADD (immediate). */
14649 *cost += extra_cost->alu.arith;
14651 /* Some tunings prefer to not use the VL-based scalar ops.
14652 Increase the cost of the poly immediate to prevent their
14653 formation. */
14654 if (GET_CODE (op1) == CONST_POLY_INT
14655 && (aarch64_tune_params.extra_tuning_flags
14656 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14657 *cost += COSTS_N_INSNS (1);
14659 return true;
14662 if (aarch64_pluslong_immediate (op1, mode))
14664 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14665 if ((INTVAL (op1) & 0xfff) != 0)
14666 *cost += COSTS_N_INSNS (1);
14668 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14669 return true;
14672 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14674 /* Look for ADD (extended register). */
14675 if (is_a <scalar_int_mode> (mode)
14676 && aarch64_rtx_arith_op_extract_p (op0))
14678 if (speed)
14679 *cost += extra_cost->alu.extend_arith;
14681 op0 = aarch64_strip_extend (op0, true);
14682 *cost += rtx_cost (op0, VOIDmode,
14683 (enum rtx_code) GET_CODE (op0), 0, speed);
14684 return true;
14687 /* Strip any extend, leave shifts behind as we will
14688 cost them through mult_cost. */
14689 new_op0 = aarch64_strip_extend (op0, false);
14691 if (GET_CODE (new_op0) == MULT
14692 || aarch64_shift_p (GET_CODE (new_op0)))
14694 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14695 speed);
14696 return true;
14699 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14701 if (speed)
14703 if (VECTOR_MODE_P (mode))
14705 /* Vector ADD. */
14706 *cost += extra_cost->vect.alu;
14708 else if (GET_MODE_CLASS (mode) == MODE_INT)
14710 /* ADD. */
14711 *cost += extra_cost->alu.arith;
14713 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14715 /* FADD. */
14716 *cost += extra_cost->fp[mode == DFmode].addsub;
14719 return true;
14722 case BITREVERSE:
14723 case BSWAP:
14724 *cost = COSTS_N_INSNS (1);
14726 if (speed)
14728 if (VECTOR_MODE_P (mode))
14729 *cost += extra_cost->vect.alu;
14730 else
14731 *cost += extra_cost->alu.rev;
14733 return false;
14735 case IOR:
14736 if (aarch_rev16_p (x))
14738 *cost = COSTS_N_INSNS (1);
14740 if (speed)
14742 if (VECTOR_MODE_P (mode))
14743 *cost += extra_cost->vect.alu;
14744 else
14745 *cost += extra_cost->alu.rev;
14747 return true;
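/* Next, check for the EXTR idiom, roughly
     (ior (ashift a (const_int n)) (lshiftrt b (const_int size - n)))
   where the two shift amounts sum to the mode size; a single EXTR
   (or ROR when a == b) implements this.  aarch64_extr_rtx_p is the
   authoritative check; this is only a sketch of the shape.  */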
14750 if (aarch64_extr_rtx_p (x, &op0, &op1))
14752 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14753 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14754 if (speed)
14755 *cost += extra_cost->alu.shift;
14757 return true;
14759 /* Fall through. */
14760 case XOR:
14761 case AND:
14762 cost_logic:
14763 op0 = XEXP (x, 0);
14764 op1 = XEXP (x, 1);
14766 if (VECTOR_MODE_P (mode))
14768 if (speed)
14769 *cost += extra_cost->vect.alu;
14770 return true;
14773 if (code == AND
14774 && GET_CODE (op0) == MULT
14775 && CONST_INT_P (XEXP (op0, 1))
14776 && CONST_INT_P (op1)
14777 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14778 INTVAL (op1)) != 0)
14780 /* This is a UBFM/SBFM. */
14781 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14782 if (speed)
14783 *cost += extra_cost->alu.bfx;
14784 return true;
14787 if (is_int_mode (mode, &int_mode))
14789 if (CONST_INT_P (op1))
14791 /* We have a mask + shift version of a UBFIZ
14792 i.e. the *andim_ashift<mode>_bfiz pattern. */
14793 if (GET_CODE (op0) == ASHIFT
14794 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14795 XEXP (op0, 1)))
14797 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14798 (enum rtx_code) code, 0, speed);
14799 if (speed)
14800 *cost += extra_cost->alu.bfx;
14802 return true;
14804 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14806 /* We possibly get the immediate for free; this is not
14807 modelled. */
14808 *cost += rtx_cost (op0, int_mode,
14809 (enum rtx_code) code, 0, speed);
14810 if (speed)
14811 *cost += extra_cost->alu.logical;
14813 return true;
14816 else
14818 rtx new_op0 = op0;
14820 /* Handle ORN, EON, or BIC. */
14821 if (GET_CODE (op0) == NOT)
14822 op0 = XEXP (op0, 0);
14824 new_op0 = aarch64_strip_shift (op0);
14826 /* If we had a shift on op0 then this is a logical-shift-
14827 by-register/immediate operation. Otherwise, this is just
14828 a logical operation. */
14829 if (speed)
14831 if (new_op0 != op0)
14833 /* Shift by immediate. */
14834 if (CONST_INT_P (XEXP (op0, 1)))
14835 *cost += extra_cost->alu.log_shift;
14836 else
14837 *cost += extra_cost->alu.log_shift_reg;
14839 else
14840 *cost += extra_cost->alu.logical;
14843 /* In both cases we want to cost both operands. */
14844 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14845 0, speed);
14846 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14847 1, speed);
14849 return true;
14852 return false;
14854 case NOT:
14855 x = XEXP (x, 0);
14856 op0 = aarch64_strip_shift (x);
14858 if (VECTOR_MODE_P (mode))
14860 /* Vector NOT. */
14861 *cost += extra_cost->vect.alu;
14862 return false;
14865 /* MVN-shifted-reg. */
14866 if (op0 != x)
14868 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14870 if (speed)
14871 *cost += extra_cost->alu.log_shift;
14873 return true;
14875 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
14876 Handle the second form here, taking care that 'a' above can
14877 be a shift. */
14878 else if (GET_CODE (op0) == XOR)
14880 rtx newop0 = XEXP (op0, 0);
14881 rtx newop1 = XEXP (op0, 1);
14882 rtx op0_stripped = aarch64_strip_shift (newop0);
14884 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14885 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14887 if (speed)
14889 if (op0_stripped != newop0)
14890 *cost += extra_cost->alu.log_shift;
14891 else
14892 *cost += extra_cost->alu.logical;
14895 return true;
14897 /* MVN. */
14898 if (speed)
14899 *cost += extra_cost->alu.logical;
14901 return false;
14903 case ZERO_EXTEND:
14905 op0 = XEXP (x, 0);
14906 /* If a value is written in SI mode, then zero extended to DI
14907 mode, the operation will in general be free as a write to
14908 a 'w' register implicitly zeroes the upper bits of an 'x'
14909 register. However, if this is
14911 (set (reg) (zero_extend (reg)))
14913 we must cost the explicit register move. */
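/* For example, (zero_extend:DI (plus:SI a b)) is just ADD Wd, Wa, Wb,
   since writing the W register clears bits 63:32, whereas
   (set (reg:DI) (zero_extend:DI (reg:SI))) still needs a MOV Wd, Ws.
   (Illustrative; the checks below implement this.)  */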
14914 if (mode == DImode
14915 && GET_MODE (op0) == SImode)
14917 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14919 /* If OP_COST is non-zero, then the cost of the zero extend
14920 is effectively the cost of the inner operation. Otherwise
14921 we have a MOV instruction and we take the cost from the MOV
14922 itself. This is true independently of whether we are
14923 optimizing for space or time. */
14924 if (op_cost)
14925 *cost = op_cost;
14927 return true;
14929 else if (MEM_P (op0))
14931 /* All loads can zero extend to any size for free. */
14932 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14933 return true;
14936 op0 = aarch64_extend_bitfield_pattern_p (x);
14937 if (op0)
14939 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14940 if (speed)
14941 *cost += extra_cost->alu.bfx;
14942 return true;
14945 if (speed)
14947 if (VECTOR_MODE_P (mode))
14949 /* UMOV. */
14950 *cost += extra_cost->vect.alu;
14952 else
14954 /* We generate an AND instead of UXTB/UXTH. */
14955 *cost += extra_cost->alu.logical;
14958 return false;
14960 case SIGN_EXTEND:
14961 if (MEM_P (XEXP (x, 0)))
14963 /* LDRSH. */
14964 if (speed)
14966 rtx address = XEXP (XEXP (x, 0), 0);
14967 *cost += extra_cost->ldst.load_sign_extend;
14969 *cost +=
14970 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14971 0, speed));
14973 return true;
14976 op0 = aarch64_extend_bitfield_pattern_p (x);
14977 if (op0)
14979 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14980 if (speed)
14981 *cost += extra_cost->alu.bfx;
14982 return true;
14985 if (speed)
14987 if (VECTOR_MODE_P (mode))
14988 *cost += extra_cost->vect.alu;
14989 else
14990 *cost += extra_cost->alu.extend;
14992 return false;
14994 case ROTATE:
14995 case ROTATERT:
14996 case LSHIFTRT:
14997 case ASHIFTRT:
14998 case ASHIFT:
14999 op0 = XEXP (x, 0);
15000 op1 = XEXP (x, 1);
15002 if (CONST_INT_P (op1))
15004 if (speed)
15006 if (VECTOR_MODE_P (mode))
15008 /* Vector shift (immediate). */
15009 *cost += extra_cost->vect.alu;
15011 else
15013 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
15014 These are all aliases. */
15015 *cost += extra_cost->alu.shift;
15019 /* We can incorporate zero/sign extend for free. */
15020 if (GET_CODE (op0) == ZERO_EXTEND
15021 || GET_CODE (op0) == SIGN_EXTEND)
15022 op0 = XEXP (op0, 0);
15024 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
15025 return true;
15027 else
15029 if (VECTOR_MODE_P (mode))
15031 if (speed)
15032 /* Vector shift (register). */
15033 *cost += extra_cost->vect.alu;
15035 else
15037 if (speed)
15038 /* LSLV, ASRV. */
15039 *cost += extra_cost->alu.shift_reg;
15041 /* The register shift amount may be in a shorter mode expressed
15042 as a lowpart SUBREG. For costing purposes just look inside. */
15043 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
15044 op1 = SUBREG_REG (op1);
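/* A mask of "mode size - 1" on the shift amount is likewise free,
   because the variable shift instructions (LSLV, ASRV and friends)
   only consume the low bits of the shift register anyway; e.g.
   (ashift:DI x (and y (const_int 63))) is a single LSLV.  (Sketch of
   the rationale for the test below.)  */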
15045 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
15046 && CONST_INT_P (XEXP (op1, 1))
15047 && known_eq (INTVAL (XEXP (op1, 1)),
15048 GET_MODE_BITSIZE (mode) - 1))
15050 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
15051 /* We already demanded XEXP (op1, 0) to be REG_P, so
15052 don't recurse into it. */
15053 return true;
15056 return false; /* All arguments need to be in registers. */
15059 case SYMBOL_REF:
15061 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
15062 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
15064 /* LDR. */
15065 if (speed)
15066 *cost += extra_cost->ldst.load;
15068 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
15069 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
15071 /* ADRP, followed by ADD. */
15072 *cost += COSTS_N_INSNS (1);
15073 if (speed)
15074 *cost += 2 * extra_cost->alu.arith;
15076 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
15077 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
15079 /* ADR. */
15080 if (speed)
15081 *cost += extra_cost->alu.arith;
15084 if (flag_pic)
15086 /* One extra load instruction, after accessing the GOT. */
15087 *cost += COSTS_N_INSNS (1);
15088 if (speed)
15089 *cost += extra_cost->ldst.load;
15091 return true;
15093 case HIGH:
15094 case LO_SUM:
15095 /* ADRP/ADD (immediate). */
15096 if (speed)
15097 *cost += extra_cost->alu.arith;
15098 return true;
15100 case ZERO_EXTRACT:
15101 case SIGN_EXTRACT:
15102 /* UBFX/SBFX. */
15103 if (speed)
15105 if (VECTOR_MODE_P (mode))
15106 *cost += extra_cost->vect.alu;
15107 else
15108 *cost += extra_cost->alu.bfx;
15111 /* We can trust that the immediates used will be correct (there
15112 are no by-register forms), so we need only cost op0. */
15113 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
15114 return true;
15116 case MULT:
15117 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
15118 /* aarch64_rtx_mult_cost always handles recursion to its
15119 operands. */
15120 return true;
15122 case MOD:
15123 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15124 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as that
15125 of an unconditional negate. This case should only ever be reached through
15126 the set_smod_pow2_cheap check in expmed.cc. */
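/* As a sketch, for SImode "x % 8" the expansion is roughly:
     negs  w1, w0
     and   w0, w0, #7
     and   w1, w1, #7
     csneg w0, w0, w1, mi
   i.e. four instructions, which is why the baseline below is reset
   to COSTS_N_INSNS (4).  (Register choices are illustrative only.)  */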
15127 if (CONST_INT_P (XEXP (x, 1))
15128 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
15129 && (mode == SImode || mode == DImode))
15131 /* We expand to 4 instructions. Reset the baseline. */
15132 *cost = COSTS_N_INSNS (4);
15134 if (speed)
15135 *cost += 2 * extra_cost->alu.logical
15136 + 2 * extra_cost->alu.arith;
15138 return true;
15141 /* Fall-through. */
15142 case UMOD:
15143 if (speed)
15145 /* Slightly prefer UMOD over SMOD. */
15146 if (VECTOR_MODE_P (mode))
15147 *cost += extra_cost->vect.alu;
15148 else if (GET_MODE_CLASS (mode) == MODE_INT)
15149 *cost += (extra_cost->mult[mode == DImode].add
15150 + extra_cost->mult[mode == DImode].idiv
15151 + (code == MOD ? 1 : 0));
15153 return false; /* All arguments need to be in registers. */
15155 case DIV:
15156 case UDIV:
15157 case SQRT:
15158 if (speed)
15160 if (VECTOR_MODE_P (mode))
15161 *cost += extra_cost->vect.alu;
15162 else if (GET_MODE_CLASS (mode) == MODE_INT)
15163 /* There is no integer SQRT, so only DIV and UDIV can get
15164 here. */
15165 *cost += (extra_cost->mult[mode == DImode].idiv
15166 /* Slightly prefer UDIV over SDIV. */
15167 + (code == DIV ? 1 : 0));
15168 else
15169 *cost += extra_cost->fp[mode == DFmode].div;
15171 return false; /* All arguments need to be in registers. */
15173 case IF_THEN_ELSE:
15174 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
15175 XEXP (x, 2), cost, speed);
15177 case EQ:
15178 case NE:
15179 case GT:
15180 case GTU:
15181 case LT:
15182 case LTU:
15183 case GE:
15184 case GEU:
15185 case LE:
15186 case LEU:
15188 return false; /* All arguments must be in registers. */
15190 case FMA:
15191 op0 = XEXP (x, 0);
15192 op1 = XEXP (x, 1);
15193 op2 = XEXP (x, 2);
15195 if (speed)
15197 if (VECTOR_MODE_P (mode))
15198 *cost += extra_cost->vect.alu;
15199 else
15200 *cost += extra_cost->fp[mode == DFmode].fma;
15203 /* FMSUB, FNMADD, and FNMSUB are free. */
15204 if (GET_CODE (op0) == NEG)
15205 op0 = XEXP (op0, 0);
15207 if (GET_CODE (op2) == NEG)
15208 op2 = XEXP (op2, 0);
15210 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15211 and the by-element operand as operand 0. */
15212 if (GET_CODE (op1) == NEG)
15213 op1 = XEXP (op1, 0);
15215 /* Catch vector-by-element operations. The by-element operand can
15216 either be (vec_duplicate (vec_select (x))) or just
15217 (vec_select (x)), depending on whether we are multiplying by
15218 a vector or a scalar.
15220 Canonicalization is not very good in these cases: FMA4 will put the
15221 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
15222 if (GET_CODE (op0) == VEC_DUPLICATE)
15223 op0 = XEXP (op0, 0);
15224 else if (GET_CODE (op1) == VEC_DUPLICATE)
15225 op1 = XEXP (op1, 0);
15227 if (GET_CODE (op0) == VEC_SELECT)
15228 op0 = XEXP (op0, 0);
15229 else if (GET_CODE (op1) == VEC_SELECT)
15230 op1 = XEXP (op1, 0);
15232 /* If the remaining parameters are not registers,
15233 get the cost to put them into registers. */
15234 *cost += rtx_cost (op0, mode, FMA, 0, speed);
15235 *cost += rtx_cost (op1, mode, FMA, 1, speed);
15236 *cost += rtx_cost (op2, mode, FMA, 2, speed);
15237 return true;
15239 case FLOAT:
15240 case UNSIGNED_FLOAT:
15241 if (speed)
15242 *cost += extra_cost->fp[mode == DFmode].fromint;
15243 return false;
15245 case FLOAT_EXTEND:
15246 if (speed)
15248 if (VECTOR_MODE_P (mode))
15250 /* Vector conversion. */
15251 *cost += extra_cost->vect.alu;
15253 else
15254 *cost += extra_cost->fp[mode == DFmode].widen;
15256 return false;
15258 case FLOAT_TRUNCATE:
15259 if (speed)
15261 if (VECTOR_MODE_P (mode))
15263 /* Vector conversion. */
15264 *cost += extra_cost->vect.alu;
15266 else
15267 *cost += extra_cost->fp[mode == DFmode].narrow;
15269 return false;
15271 case FIX:
15272 case UNSIGNED_FIX:
15273 x = XEXP (x, 0);
15274 /* Strip the rounding part. They will all be implemented
15275 by the fcvt* family of instructions anyway. */
15276 if (GET_CODE (x) == UNSPEC)
15278 unsigned int uns_code = XINT (x, 1);
15280 if (uns_code == UNSPEC_FRINTA
15281 || uns_code == UNSPEC_FRINTM
15282 || uns_code == UNSPEC_FRINTN
15283 || uns_code == UNSPEC_FRINTP
15284 || uns_code == UNSPEC_FRINTZ)
15285 x = XVECEXP (x, 0, 0);
15288 if (speed)
15290 if (VECTOR_MODE_P (mode))
15291 *cost += extra_cost->vect.alu;
15292 else
15293 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15296 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15297 fixed-point fcvt. */
15298 if (GET_CODE (x) == MULT
15299 && ((VECTOR_MODE_P (mode)
15300 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15301 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15303 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15304 0, speed);
15305 return true;
15308 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
15309 return true;
15311 case ABS:
15312 if (VECTOR_MODE_P (mode))
15314 /* ABS (vector). */
15315 if (speed)
15316 *cost += extra_cost->vect.alu;
15318 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15320 op0 = XEXP (x, 0);
15322 /* FABD, which is analogous to FADD. */
15323 if (GET_CODE (op0) == MINUS)
15325 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15326 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15327 if (speed)
15328 *cost += extra_cost->fp[mode == DFmode].addsub;
15330 return true;
15332 /* Simple FABS is analogous to FNEG. */
15333 if (speed)
15334 *cost += extra_cost->fp[mode == DFmode].neg;
15336 else
15338 /* Integer ABS will either be split to
15339 two arithmetic instructions, or will be an ABS
15340 (scalar), which we don't model. */
15341 *cost = COSTS_N_INSNS (2);
15342 if (speed)
15343 *cost += 2 * extra_cost->alu.arith;
15345 return false;
15347 case SMAX:
15348 case SMIN:
15349 if (speed)
15351 if (VECTOR_MODE_P (mode))
15352 *cost += extra_cost->vect.alu;
15353 else
15355 /* FMAXNM/FMINNM/FMAX/FMIN.
15356 TODO: This may not be accurate for all implementations, but
15357 we do not model this in the cost tables. */
15358 *cost += extra_cost->fp[mode == DFmode].addsub;
15361 return false;
15363 case UNSPEC:
15364 /* The floating point round to integer frint* instructions. */
15365 if (aarch64_frint_unspec_p (XINT (x, 1)))
15367 if (speed)
15368 *cost += extra_cost->fp[mode == DFmode].roundint;
15370 return false;
15372 break;
15374 case TRUNCATE:
15376 /* Decompose <su>muldi3_highpart. */
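/* That is, match the shape
     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))
   which is a single UMULH/SMULH; the condition below spells this
   out piece by piece.  */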
15377 if (/* (truncate:DI */
15378 mode == DImode
15379 /* (lshiftrt:TI */
15380 && GET_MODE (XEXP (x, 0)) == TImode
15381 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15382 /* (mult:TI */
15383 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15384 /* (ANY_EXTEND:TI (reg:DI))
15385 (ANY_EXTEND:TI (reg:DI))) */
15386 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15387 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15388 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15389 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15390 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15391 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15392 /* (const_int 64) */
15393 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15394 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15396 /* UMULH/SMULH. */
15397 if (speed)
15398 *cost += extra_cost->mult[mode == DImode].extend;
15399 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15400 mode, MULT, 0, speed);
15401 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15402 mode, MULT, 1, speed);
15403 return true;
15405 break;
15406 case CONST_VECTOR:
15408 /* Load using MOVI/MVNI. */
15409 if (aarch64_simd_valid_immediate (x, NULL))
15410 *cost = extra_cost->vect.movi;
15411 else /* Load using constant pool. */
15412 *cost = extra_cost->ldst.load;
15413 break;
15415 case VEC_CONCAT:
15416 /* Depending on the operation, either DUP or INS.
15417 For now, keep default costing. */
15418 break;
15419 case VEC_DUPLICATE:
15420 /* Load using a DUP. */
15421 *cost = extra_cost->vect.dup;
15422 return false;
15423 case VEC_SELECT:
15425 rtx op0 = XEXP (x, 0);
15426 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15428 /* A lowpart selection is free; a highpart costs a DUP; anything else an element extract. */
15429 rtx op1 = XEXP (x, 1);
15430 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15432 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15433 *cost = extra_cost->vect.dup;
15434 else
15435 *cost = extra_cost->vect.extract;
15436 return true;
15438 default:
15439 break;
15442 if (dump_file
15443 && flag_aarch64_verbose_cost)
15444 fprintf (dump_file,
15445 "\nFailed to cost RTX. Assuming default cost.\n");
15447 return true;
15450 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15451 calculated for X. This cost is stored in *COST. Returns true
15452 if the total cost of X was calculated. */
15453 static bool
15454 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15455 int param, int *cost, bool speed)
15457 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15459 if (dump_file
15460 && flag_aarch64_verbose_cost)
15462 print_rtl_single (dump_file, x);
15463 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15464 speed ? "Hot" : "Cold",
15465 *cost, result ? "final" : "partial");
15468 return result;
15471 static int
15472 aarch64_register_move_cost (machine_mode mode,
15473 reg_class_t from_i, reg_class_t to_i)
15475 enum reg_class from = (enum reg_class) from_i;
15476 enum reg_class to = (enum reg_class) to_i;
15477 const struct cpu_regmove_cost *regmove_cost
15478 = aarch64_tune_params.regmove_cost;
15480 /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS. */
15481 if (reg_class_subset_p (to, POINTER_REGS))
15482 to = GENERAL_REGS;
15484 if (reg_class_subset_p (from, POINTER_REGS))
15485 from = GENERAL_REGS;
15487 /* Make RDFFR very expensive. In particular, if we know that the FFR
15488 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15489 as a way of obtaining a PTRUE. */
15490 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15491 && hard_reg_set_subset_p (reg_class_contents[from_i],
15492 reg_class_contents[FFR_REGS]))
15493 return 80;
15495 /* Moving between GPR and stack cost is the same as GP2GP. */
15496 if ((from == GENERAL_REGS && to == STACK_REG)
15497 || (to == GENERAL_REGS && from == STACK_REG))
15498 return regmove_cost->GP2GP;
15500 /* To/From the stack register, we move via the gprs. */
15501 if (to == STACK_REG || from == STACK_REG)
15502 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15503 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15505 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15506 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15507 && known_eq (GET_MODE_SIZE (mode), 16))
15509 /* 128-bit operations on general registers require 2 instructions. */
15510 if (from == GENERAL_REGS && to == GENERAL_REGS)
15511 return regmove_cost->GP2GP * 2;
15512 else if (from == GENERAL_REGS)
15513 return regmove_cost->GP2FP * 2;
15514 else if (to == GENERAL_REGS)
15515 return regmove_cost->FP2GP * 2;
15517 /* When AdvSIMD instructions are disabled it is not possible to move
15518 a 128-bit value directly between Q registers. This is handled in
15519 secondary reload. A general register is used as a scratch to move
15520 the upper DI value and the lower DI value is moved directly,
15521 hence the cost is the sum of three moves. */
15522 if (!TARGET_SIMD && !TARGET_SVE)
15523 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15525 return regmove_cost->FP2FP;
15528 if (from == GENERAL_REGS && to == GENERAL_REGS)
15529 return regmove_cost->GP2GP;
15530 else if (from == GENERAL_REGS)
15531 return regmove_cost->GP2FP;
15532 else if (to == GENERAL_REGS)
15533 return regmove_cost->FP2GP;
15535 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15537 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15538 The cost must be greater than 2 units to indicate that direct
15539 moves aren't possible. */
15540 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15541 + aarch64_tune_params.memmov_cost.store_fp);
15542 return MIN (CEIL (per_vector, 2), 4);
15545 return regmove_cost->FP2FP;
15548 /* Implements TARGET_MEMORY_MOVE_COST. */
15549 static int
15550 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15552 enum reg_class rclass = (enum reg_class) rclass_i;
15553 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15554 ? reg_classes_intersect_p (rclass, PR_REGS)
15555 : reg_class_subset_p (rclass, PR_REGS))
15556 return (in
15557 ? aarch64_tune_params.memmov_cost.load_pred
15558 : aarch64_tune_params.memmov_cost.store_pred);
15560 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15561 ? reg_classes_intersect_p (rclass, FP_REGS)
15562 : reg_class_subset_p (rclass, FP_REGS))
15563 return (in
15564 ? aarch64_tune_params.memmov_cost.load_fp
15565 : aarch64_tune_params.memmov_cost.store_fp);
15567 return (in
15568 ? aarch64_tune_params.memmov_cost.load_int
15569 : aarch64_tune_params.memmov_cost.store_int);
15572 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15573 much more productive here, such as using insn attributes to cost things.
15574 But we don't, not yet.
15576 The main point of this current definition is to make calling insn_cost
15577 on one instruction equivalent to calling seq_cost on a sequence that
15578 contains only that instruction. The default definition would instead
15579 only look at SET_SRCs, ignoring SET_DESTs.
15581 This ensures that, for example, storing a 128-bit zero vector is more
15582 expensive than storing a 128-bit vector register. A move of zero
15583 into a 128-bit vector register followed by multiple stores of that
15584 register is then cheaper than multiple stores of zero (which would
15585 use STP of XZR). This in turn allows STP Qs to be formed. */
15586 static int
15587 aarch64_insn_cost (rtx_insn *insn, bool speed)
15589 if (rtx set = single_set (insn))
15590 return set_rtx_cost (set, speed);
15591 return pattern_cost (PATTERN (insn), speed);
15594 /* Implement TARGET_INIT_BUILTINS. */
15595 static void
15596 aarch64_init_builtins ()
15598 aarch64_general_init_builtins ();
15599 aarch64_sve::init_builtins ();
15600 #ifdef SUBTARGET_INIT_BUILTINS
15601 SUBTARGET_INIT_BUILTINS;
15602 #endif
15605 /* Implement TARGET_FOLD_BUILTIN. */
15606 static tree
15607 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15609 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15610 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15611 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15612 switch (code & AARCH64_BUILTIN_CLASS)
15614 case AARCH64_BUILTIN_GENERAL:
15615 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15617 case AARCH64_BUILTIN_SVE:
15618 return NULL_TREE;
15620 gcc_unreachable ();
15623 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15624 static bool
15625 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15627 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15628 tree fndecl = gimple_call_fndecl (stmt);
15629 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15630 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15631 gimple *new_stmt = NULL;
15632 switch (code & AARCH64_BUILTIN_CLASS)
15634 case AARCH64_BUILTIN_GENERAL:
15635 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15636 break;
15638 case AARCH64_BUILTIN_SVE:
15639 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15640 break;
15643 if (!new_stmt)
15644 return false;
15646 gsi_replace (gsi, new_stmt, false);
15647 return true;
15650 /* Implement TARGET_EXPAND_BUILTIN. */
15651 static rtx
15652 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15654 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15655 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15656 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15657 switch (code & AARCH64_BUILTIN_CLASS)
15659 case AARCH64_BUILTIN_GENERAL:
15660 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15662 case AARCH64_BUILTIN_SVE:
15663 return aarch64_sve::expand_builtin (subcode, exp, target);
15665 gcc_unreachable ();
15668 /* Implement TARGET_BUILTIN_DECL. */
15669 static tree
15670 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15672 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15673 switch (code & AARCH64_BUILTIN_CLASS)
15675 case AARCH64_BUILTIN_GENERAL:
15676 return aarch64_general_builtin_decl (subcode, initialize_p);
15678 case AARCH64_BUILTIN_SVE:
15679 return aarch64_sve::builtin_decl (subcode, initialize_p);
15681 gcc_unreachable ();
15684 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15685 to optimize 1.0/sqrt. */
15687 static bool
15688 use_rsqrt_p (machine_mode mode)
15690 return (!flag_trapping_math
15691 && flag_unsafe_math_optimizations
15692 && ((aarch64_tune_params.approx_modes->recip_sqrt
15693 & AARCH64_APPROX_MODE (mode))
15694 || flag_mrecip_low_precision_sqrt));
15697 /* Function to decide when to use the approximate reciprocal square root
15698 builtin. */
15700 static tree
15701 aarch64_builtin_reciprocal (tree fndecl)
15703 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15705 if (!use_rsqrt_p (mode))
15706 return NULL_TREE;
15707 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15708 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15709 switch (code & AARCH64_BUILTIN_CLASS)
15711 case AARCH64_BUILTIN_GENERAL:
15712 return aarch64_general_builtin_rsqrt (subcode);
15714 case AARCH64_BUILTIN_SVE:
15715 return NULL_TREE;
15717 gcc_unreachable ();
15720 /* Emit code to perform the floating-point operation:
15722 DST = SRC1 * SRC2
15724 where all three operands are already known to be registers.
15725 If the operation is an SVE one, PTRUE is a suitable all-true
15726 predicate. */
15728 static void
15729 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15731 if (ptrue)
15732 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15733 dst, ptrue, src1, src2,
15734 gen_int_mode (SVE_RELAXED_GP, SImode)));
15735 else
15736 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15739 /* Emit instruction sequence to compute either the approximate square root
15740 or its approximate reciprocal, depending on the flag RECP, and return
15741 whether the sequence was emitted or not. */
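/* The sequence below is the usual Newton-Raphson scheme built from
   FRSQRTE/FRSQRTS: starting from an estimate x0 of 1/sqrt(d), each
   step computes x_{n+1} = x_n * (3 - d * x_n * x_n) / 2, where the
   (3 - a * b) / 2 part is what FRSQRTS returns.  For the
   non-reciprocal case the result is then multiplied by d
   (sqrt(d) = d * 1/sqrt(d)), with d == 0 handled specially.
   (Descriptive summary of the code below.)  */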
15743 bool
15744 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15746 machine_mode mode = GET_MODE (dst);
15748 if (GET_MODE_INNER (mode) == HFmode)
15750 gcc_assert (!recp);
15751 return false;
15754 if (!recp)
15756 if (!(flag_mlow_precision_sqrt
15757 || (aarch64_tune_params.approx_modes->sqrt
15758 & AARCH64_APPROX_MODE (mode))))
15759 return false;
15761 if (!flag_finite_math_only
15762 || flag_trapping_math
15763 || !flag_unsafe_math_optimizations
15764 || optimize_function_for_size_p (cfun))
15765 return false;
15767 else
15768 /* Caller assumes we cannot fail. */
15769 gcc_assert (use_rsqrt_p (mode));
15771 rtx pg = NULL_RTX;
15772 if (aarch64_sve_mode_p (mode))
15773 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15774 machine_mode mmsk = (VECTOR_MODE_P (mode)
15775 ? related_int_vector_mode (mode).require ()
15776 : int_mode_for_mode (mode).require ());
15777 rtx xmsk = NULL_RTX;
15778 if (!recp)
15780 /* When calculating the approximate square root, compare the
15781 argument with 0.0 and create a mask. */
15782 rtx zero = CONST0_RTX (mode);
15783 if (pg)
15785 xmsk = gen_reg_rtx (GET_MODE (pg));
15786 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15787 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15788 xmsk, pg, hint, src, zero));
15790 else
15792 xmsk = gen_reg_rtx (mmsk);
15793 emit_insn (gen_rtx_SET (xmsk,
15794 gen_rtx_NEG (mmsk,
15795 gen_rtx_EQ (mmsk, src, zero))));
15799 /* Estimate the approximate reciprocal square root. */
15800 rtx xdst = gen_reg_rtx (mode);
15801 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15803 /* Iterate over the series twice for SF and thrice for DF. */
15804 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15806 /* Optionally iterate over the series once less for faster performance
15807 at the cost of some accuracy. */
15808 if ((recp && flag_mrecip_low_precision_sqrt)
15809 || (!recp && flag_mlow_precision_sqrt))
15810 iterations--;
15812 /* Iterate over the series to calculate the approximate reciprocal square
15813 root. */
15814 rtx x1 = gen_reg_rtx (mode);
15815 while (iterations--)
15817 rtx x2 = gen_reg_rtx (mode);
15818 aarch64_emit_mult (x2, pg, xdst, xdst);
15820 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15822 if (iterations > 0)
15823 aarch64_emit_mult (xdst, pg, xdst, x1);
15826 if (!recp)
15828 if (pg)
15829 /* Multiply nonzero source values by the corresponding intermediate
15830 result elements, so that the final calculation is the approximate
15831 square root rather than its reciprocal. Select a zero result for
15832 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15833 otherwise. */
15834 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15835 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15836 else
15838 /* Qualify the approximate reciprocal square root when the
15839 argument is 0.0 by squashing the intermediary result to 0.0. */
15840 rtx xtmp = gen_reg_rtx (mmsk);
15841 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15842 gen_rtx_SUBREG (mmsk, xdst, 0)));
15843 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15845 /* Calculate the approximate square root. */
15846 aarch64_emit_mult (xdst, pg, xdst, src);
15850 /* Finalize the approximation. */
15851 aarch64_emit_mult (dst, pg, xdst, x1);
15853 return true;
15856 /* Emit the instruction sequence to compute the approximation for the division
15857 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
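/* This is again a Newton-Raphson refinement, this time built from
   FRECPE/FRECPS: starting from an estimate x0 of 1/den, each step
   computes x_{n+1} = x_n * (2 - den * x_n), where the (2 - a * b)
   part is what FRECPS returns; the quotient is then num * 1/den.
   (Descriptive summary of the code below.)  */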
15859 bool
15860 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15862 machine_mode mode = GET_MODE (quo);
15864 if (GET_MODE_INNER (mode) == HFmode)
15865 return false;
15867 bool use_approx_division_p = (flag_mlow_precision_div
15868 || (aarch64_tune_params.approx_modes->division
15869 & AARCH64_APPROX_MODE (mode)));
15871 if (!flag_finite_math_only
15872 || flag_trapping_math
15873 || !flag_unsafe_math_optimizations
15874 || optimize_function_for_size_p (cfun)
15875 || !use_approx_division_p)
15876 return false;
15878 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15879 return false;
15881 rtx pg = NULL_RTX;
15882 if (aarch64_sve_mode_p (mode))
15883 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15885 /* Estimate the approximate reciprocal. */
15886 rtx xrcp = gen_reg_rtx (mode);
15887 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15889 /* Iterate over the series twice for SF and thrice for DF. */
15890 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15892 /* Optionally iterate over the series less for faster performance,
15893 at the cost of some accuracy. The default is 2 for DF and 1 for SF. */
15894 if (flag_mlow_precision_div)
15895 iterations = (GET_MODE_INNER (mode) == DFmode
15896 ? aarch64_double_recp_precision
15897 : aarch64_float_recp_precision);
15899 /* Iterate over the series to calculate the approximate reciprocal. */
15900 rtx xtmp = gen_reg_rtx (mode);
15901 while (iterations--)
15903 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15905 if (iterations > 0)
15906 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15909 if (num != CONST1_RTX (mode))
15911 /* As the approximate reciprocal of DEN is already calculated, only
15912 calculate the approximate division when NUM is not 1.0. */
15913 rtx xnum = force_reg (mode, num);
15914 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15917 /* Finalize the approximation. */
15918 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15919 return true;
15922 /* Return the number of instructions that can be issued per cycle. */
15923 static int
15924 aarch64_sched_issue_rate (void)
15926 return aarch64_tune_params.issue_rate;
15929 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15930 static int
15931 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15933 if (DEBUG_INSN_P (insn))
15934 return more;
15936 rtx_code code = GET_CODE (PATTERN (insn));
15937 if (code == USE || code == CLOBBER)
15938 return more;
15940 if (get_attr_type (insn) == TYPE_NO_INSN)
15941 return more;
15943 return more - 1;
15946 static int
15947 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15949 int issue_rate = aarch64_sched_issue_rate ();
15951 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15955 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15956 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15957 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15959 static int
15960 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15961 int ready_index)
15963 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15967 /* Vectorizer cost model target hooks. */
15969 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15970 return the decl that should be recorded. Return null otherwise. */
15971 tree
15972 aarch64_vector_load_decl (tree addr)
15974 if (TREE_CODE (addr) != ADDR_EXPR)
15975 return NULL_TREE;
15976 tree base = get_base_address (TREE_OPERAND (addr, 0));
15977 if (TREE_CODE (base) != VAR_DECL)
15978 return NULL_TREE;
15979 return base;
15982 /* Return true if STMT_INFO accesses a decl that is known to be the
15983 argument to a vld1 in the same function. */
15984 static bool
15985 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15987 if (!cfun->machine->vector_load_decls)
15988 return false;
15989 auto dr = STMT_VINFO_DATA_REF (stmt_info);
15990 if (!dr)
15991 return false;
15992 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15993 return decl && cfun->machine->vector_load_decls->contains (decl);
15996 /* Information about how the CPU would issue the scalar, Advanced SIMD
15997 or SVE version of a vector loop, using the scheme defined by the
15998 aarch64_base_vec_issue_info hierarchy of structures. */
15999 class aarch64_vec_op_count
16001 public:
16002 aarch64_vec_op_count () = default;
16003 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
16004 unsigned int = 1);
16006 unsigned int vec_flags () const { return m_vec_flags; }
16007 unsigned int vf_factor () const { return m_vf_factor; }
16009 const aarch64_base_vec_issue_info *base_issue_info () const;
16010 const aarch64_simd_vec_issue_info *simd_issue_info () const;
16011 const aarch64_sve_vec_issue_info *sve_issue_info () const;
16013 fractional_cost rename_cycles_per_iter () const;
16014 fractional_cost min_nonpred_cycles_per_iter () const;
16015 fractional_cost min_pred_cycles_per_iter () const;
16016 fractional_cost min_cycles_per_iter () const;
16018 void dump () const;
16020 /* The number of individual "general" operations. See the comments
16021 in aarch64_base_vec_issue_info for details. */
16022 unsigned int general_ops = 0;
16024 /* The number of load and store operations, under the same scheme
16025 as above. */
16026 unsigned int loads = 0;
16027 unsigned int stores = 0;
16029 /* The minimum number of cycles needed to execute all loop-carried
16030 operations, which in the vector code become associated with
16031 reductions. */
16032 unsigned int reduction_latency = 0;
16034 /* The number of individual predicate operations. See the comments
16035 in aarch64_sve_vec_issue_info for details. */
16036 unsigned int pred_ops = 0;
16038 private:
16039 /* The issue information for the core. */
16040 const aarch64_vec_issue_info *m_issue_info = nullptr;
16042 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
16043 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
16044 Advanced SIMD code.
16045 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
16046 SVE code. */
16047 unsigned int m_vec_flags = 0;
16049 /* Assume that, when the code is executing on the core described
16050 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
16051 times more data than the vectorizer anticipates.
16053 This is only ever different from 1 for SVE. It allows us to consider
16054 what would happen on a 256-bit SVE target even when the -mtune
16055 parameters say that the “likely” SVE length is 128 bits. */
16056 unsigned int m_vf_factor = 1;
16059 aarch64_vec_op_count::
16060 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
16061 unsigned int vec_flags, unsigned int vf_factor)
16062 : m_issue_info (issue_info),
16063 m_vec_flags (vec_flags),
16064 m_vf_factor (vf_factor)
16068 /* Return the base issue information (i.e. the parts that make sense
16069 for both scalar and vector code). Return null if we have no issue
16070 information. */
16071 const aarch64_base_vec_issue_info *
16072 aarch64_vec_op_count::base_issue_info () const
16074 if (auto *ret = simd_issue_info ())
16075 return ret;
16076 return m_issue_info->scalar;
16079 /* If the structure describes vector code and we have associated issue
16080 information, return that issue information, otherwise return null. */
16081 const aarch64_simd_vec_issue_info *
16082 aarch64_vec_op_count::simd_issue_info () const
16084 if (auto *ret = sve_issue_info ())
16085 return ret;
16086 if (m_vec_flags)
16087 return m_issue_info->advsimd;
16088 return nullptr;
16091 /* If the structure describes SVE code and we have associated issue
16092 information, return that issue information, otherwise return null. */
16093 const aarch64_sve_vec_issue_info *
16094 aarch64_vec_op_count::sve_issue_info () const
16096 if (m_vec_flags & VEC_ANY_SVE)
16097 return m_issue_info->sve;
16098 return nullptr;
16101 /* Estimate the minimum number of cycles per iteration needed to rename
16102 the instructions.
16104 ??? For now this is done inline rather than via cost tables, since it
16105 isn't clear how it should be parameterized for the general case. */
16106 fractional_cost
16107 aarch64_vec_op_count::rename_cycles_per_iter () const
16109 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
16110 || sve_issue_info () == &neoversen2_sve_issue_info
16111 || sve_issue_info () == &neoversev2_sve_issue_info)
16112 /* + 1 for an addition. We've already counted a general op for each
16113 store, so we don't need to account for stores separately. The branch
16114 reads no registers and so does not need to be counted either.
16116 ??? This value is very much on the pessimistic side, but seems to work
16117 pretty well in practice. */
16118 return { general_ops + loads + pred_ops + 1, 5 };
16120 return 0;
16123 /* Like min_cycles_per_iter, but excluding predicate operations. */
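/* Roughly:

     cycles = max (reduction_latency,
                   stores / stores_per_cycle,
                   (loads + stores) / loads_stores_per_cycle,
                   general_ops / general_ops_per_cycle,
                   rename_cycles_per_iter ())

   i.e. an iteration can complete no faster than its most contended
   resource allows.  */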
16124 fractional_cost
16125 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
16127 auto *issue_info = base_issue_info ();
16129 fractional_cost cycles = MAX (reduction_latency, 1);
16130 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
16131 cycles = std::max (cycles, { loads + stores,
16132 issue_info->loads_stores_per_cycle });
16133 cycles = std::max (cycles, { general_ops,
16134 issue_info->general_ops_per_cycle });
16135 cycles = std::max (cycles, rename_cycles_per_iter ());
16136 return cycles;
16139 /* Like min_cycles_per_iter, but including only the predicate operations. */
16140 fractional_cost
16141 aarch64_vec_op_count::min_pred_cycles_per_iter () const
16143 if (auto *issue_info = sve_issue_info ())
16144 return { pred_ops, issue_info->pred_ops_per_cycle };
16145 return 0;
16148 /* Estimate the minimum number of cycles needed to issue the operations.
16149 This is a very simplistic model! */
16150 fractional_cost
16151 aarch64_vec_op_count::min_cycles_per_iter () const
16153 return std::max (min_nonpred_cycles_per_iter (),
16154 min_pred_cycles_per_iter ());
16157 /* Dump information about the structure. */
16158 void
16159 aarch64_vec_op_count::dump () const
16161 dump_printf_loc (MSG_NOTE, vect_location,
16162 " load operations = %d\n", loads);
16163 dump_printf_loc (MSG_NOTE, vect_location,
16164 " store operations = %d\n", stores);
16165 dump_printf_loc (MSG_NOTE, vect_location,
16166 " general operations = %d\n", general_ops);
16167 if (sve_issue_info ())
16168 dump_printf_loc (MSG_NOTE, vect_location,
16169 " predicate operations = %d\n", pred_ops);
16170 dump_printf_loc (MSG_NOTE, vect_location,
16171 " reduction latency = %d\n", reduction_latency);
16172 if (auto rcpi = rename_cycles_per_iter ())
16173 dump_printf_loc (MSG_NOTE, vect_location,
16174 " estimated cycles per iteration to rename = %f\n",
16175 rcpi.as_double ());
16176 if (auto pred_cpi = min_pred_cycles_per_iter ())
16178 dump_printf_loc (MSG_NOTE, vect_location,
16179 " estimated min cycles per iteration"
16180 " without predication = %f\n",
16181 min_nonpred_cycles_per_iter ().as_double ());
16182 dump_printf_loc (MSG_NOTE, vect_location,
16183 " estimated min cycles per iteration"
16184 " for predication = %f\n", pred_cpi.as_double ());
16186 if (auto cpi = min_cycles_per_iter ())
16187 dump_printf_loc (MSG_NOTE, vect_location,
16188 " estimated min cycles per iteration = %f\n",
16189 cpi.as_double ());
16192 /* Information about vector code that we're in the process of costing. */
16193 class aarch64_vector_costs : public vector_costs
16195 public:
16196 aarch64_vector_costs (vec_info *, bool);
16198 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
16199 stmt_vec_info stmt_info, slp_tree, tree vectype,
16200 int misalign,
16201 vect_cost_model_location where) override;
16202 void finish_cost (const vector_costs *) override;
16203 bool better_main_loop_than_p (const vector_costs *other) const override;
16205 private:
16206 void record_potential_advsimd_unrolling (loop_vec_info);
16207 void analyze_loop_vinfo (loop_vec_info);
16208 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
16209 aarch64_vec_op_count *);
16210 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
16211 fractional_cost, unsigned int,
16212 unsigned int *, bool *);
16213 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
16214 unsigned int);
16215 bool prefer_unrolled_loop () const;
16216 unsigned int determine_suggested_unroll_factor ();
16218 /* True if we have performed one-time initialization based on the
16219 vec_info. */
16220 bool m_analyzed_vinfo = false;
16222 /* This loop uses an average operation that is not supported by SVE, but is
16223 supported by Advanced SIMD and SVE2. */
16224 bool m_has_avg = false;
16226 /* True if the vector body contains a store to a decl and if the
16227 function is known to have a vld1 from the same decl.
16229 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16230 initializing a vector is:
16232 float f[4] = { elts };
16233 float32x4_t x = vld1q_f32(f);
16235 We should strongly prefer vectorization of the initialization of f,
16236 so that the store to f and the load back can be optimized away,
16237 leaving a vectorization of { elts }. */
16238 bool m_stores_to_vector_load_decl = false;
16240 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16241 In this case the value is the number of insns in the last operation.
16243 On AArch64, vector promotions and demotions require us to first widen or
16244 narrow the input and only after that emit conversion instructions. For
16245 costing this means we need to emit the cost of the final conversions as
16246 well. */
16247 unsigned int m_num_last_promote_demote = 0;
16249 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16250 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16251 SIMD code.
16252 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16253 unsigned int m_vec_flags = 0;
16255 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16256 This means that code such as:
16258 a[0] = x;
16259 a[1] = x;
16261 will be costed as two scalar instructions and two vector instructions
16262 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16263 wins if the costs are equal, because the vector costs include
16264 constant initializations whereas the scalar costs don't.
16265 We would therefore tend to vectorize the code above, even though
16266 the scalar version can use a single STP.
16268 We should eventually fix this and model LDP and STP in the main costs;
16269 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16270 Until then, we look specifically for code that does nothing more than
16271 STP-like operations. We cost them on that basis in addition to the
16272 normal latency-based costs.
16274 If the scalar or vector code could be a sequence of STPs +
16275 initialization, this variable counts the cost of the sequence,
16276 with 2 units per instruction. The variable is ~0U for other
16277 kinds of code. */
16278 unsigned int m_stp_sequence_cost = 0;
16280 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16281 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16282 situations, we try to predict whether an Advanced SIMD implementation
16283 of the loop could be completely unrolled and become straight-line code.
16284 If so, it is generally better to use the Advanced SIMD version rather
16285 than length-agnostic SVE, since the SVE loop would execute an unknown
16286 number of times and so could not be completely unrolled in the same way.
16288 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16289 number of Advanced SIMD loop iterations that would be unrolled and
16290 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16291 in the unrolled loop. Both values are zero if we're not applying
16292 the heuristic. */
16293 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
16294 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
16296 /* If we're vectorizing a loop that executes a constant number of times,
16297 this variable gives the number of times that the vector loop would
16298 iterate, otherwise it is zero. */
16299 uint64_t m_num_vector_iterations = 0;
16301 /* Used only when vectorizing loops. Estimates the number and kind of
16302 operations that would be needed by one iteration of the scalar
16303 or vector loop. There is one entry for each tuning option of
16304 interest. */
16305 auto_vec<aarch64_vec_op_count, 2> m_ops;
16308 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
16309 bool costing_for_scalar)
16310 : vector_costs (vinfo, costing_for_scalar),
16311 m_vec_flags (costing_for_scalar ? 0
16312 : aarch64_classify_vector_mode (vinfo->vector_mode))
16314 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
16316 m_ops.quick_push ({ issue_info, m_vec_flags });
16317 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16319 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
16320 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
16321 vf_factor });
16326 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
16327 vector_costs *
16328 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
16330 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16333 /* Return true if the current CPU should use the new costs defined
16334 in GCC 11. This should be removed for GCC 12 and above, with the
16335 costs applying to all CPUs instead. */
16336 static bool
16337 aarch64_use_new_vector_costs_p ()
16339 return (aarch64_tune_params.extra_tuning_flags
16340 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16343 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16344 static const simd_vec_cost *
16345 aarch64_simd_vec_costs (tree vectype)
16347 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16348 if (vectype != NULL
16349 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16350 && costs->sve != NULL)
16351 return costs->sve;
16352 return costs->advsimd;
16355 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16356 static const simd_vec_cost *
16357 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16359 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16360 if ((flags & VEC_ANY_SVE) && costs->sve)
16361 return costs->sve;
16362 return costs->advsimd;
16365 /* If STMT_INFO is a memory reference, return the scalar memory type,
16366 otherwise return null. */
16367 static tree
16368 aarch64_dr_type (stmt_vec_info stmt_info)
16370 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16371 return TREE_TYPE (DR_REF (dr));
16372 return NULL_TREE;
16375 /* Decide whether to use the unrolling heuristic described above
16376 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16377 describes the loop that we're vectorizing. */
16378 void
16379 aarch64_vector_costs::
16380 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16382 /* The heuristic only makes sense on targets that have the same
16383 vector throughput for SVE and Advanced SIMD. */
16384 if (!(aarch64_tune_params.extra_tuning_flags
16385 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16386 return;
16388 /* We only want to apply the heuristic if LOOP_VINFO is being
16389 vectorized for SVE. */
16390 if (!(m_vec_flags & VEC_ANY_SVE))
16391 return;
16393 /* Check whether it is possible in principle to use Advanced SIMD
16394 instead. */
16395 if (aarch64_autovec_preference == 2)
16396 return;
16398 /* We don't want to apply the heuristic to outer loops, since it's
16399 harder to track two levels of unrolling. */
16400 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16401 return;
16403 /* Only handle cases in which the number of Advanced SIMD iterations
16404 would be known at compile time but the number of SVE iterations
16405 would not. */
16406 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16407 || aarch64_sve_vg.is_constant ())
16408 return;
16410 /* Guess how many times the Advanced SIMD loop would iterate and make
16411 sure that it is within the complete unrolling limit. Even if the
16412 number of iterations is small enough, the number of statements might
16413 not be, which is why we need to estimate the number of statements too. */
16414 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16415 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16416 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16417 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16418 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16419 return;
16421 /* Record that we're applying the heuristic and should try to estimate
16422 the number of statements in the Advanced SIMD loop. */
16423 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
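/* Worked example, illustrative only (all numbers assumed): if the SVE
   loop has a costing VF of 8 elements and the estimated SVE VQ is 2,
   the guessed Advanced SIMD VF is CEIL (8, 2) == 4, so a loop with 64
   known iterations would be unrolled into 64 / 4 == 16 Advanced SIMD
   iterations before being checked against
   param_max_completely_peel_times.  */
static_assert (CEIL (8, 2) == 4, "assumed Advanced SIMD VF");
static_assert (64 / CEIL (8, 2) == 16, "assumed unrolled iteration count");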
16426 /* Do one-time initialization of the aarch64_vector_costs given that we're
16427 costing the loop vectorization described by LOOP_VINFO. */
16428 void
16429 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16431 /* Record the number of times that the vector loop would execute,
16432 if known. */
16433 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16434 auto scalar_niters = max_stmt_executions_int (loop);
16435 if (scalar_niters >= 0)
16437 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16438 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16439 m_num_vector_iterations = scalar_niters / vf;
16440 else
16441 m_num_vector_iterations = CEIL (scalar_niters, vf);
16444 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16445 heuristic described above m_unrolled_advsimd_niters. */
16446 record_potential_advsimd_unrolling (loop_vinfo);
16449 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16450 static int
16451 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16452 tree vectype,
16453 int misalign ATTRIBUTE_UNUSED)
16455 unsigned elements;
16456 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16457 bool fp = false;
16459 if (vectype != NULL)
16460 fp = FLOAT_TYPE_P (vectype);
16462 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16464 switch (type_of_cost)
16466 case scalar_stmt:
16467 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16469 case scalar_load:
16470 return costs->scalar_load_cost;
16472 case scalar_store:
16473 return costs->scalar_store_cost;
16475 case vector_stmt:
16476 return fp ? simd_costs->fp_stmt_cost
16477 : simd_costs->int_stmt_cost;
16479 case vector_load:
16480 return simd_costs->align_load_cost;
16482 case vector_store:
16483 return simd_costs->store_cost;
16485 case vec_to_scalar:
16486 return simd_costs->vec_to_scalar_cost;
16488 case scalar_to_vec:
16489 return simd_costs->scalar_to_vec_cost;
16491 case unaligned_load:
16492 case vector_gather_load:
16493 return simd_costs->unalign_load_cost;
16495 case unaligned_store:
16496 case vector_scatter_store:
16497 return simd_costs->unalign_store_cost;
16499 case cond_branch_taken:
16500 return costs->cond_taken_branch_cost;
16502 case cond_branch_not_taken:
16503 return costs->cond_not_taken_branch_cost;
16505 case vec_perm:
16506 return simd_costs->permute_cost;
16508 case vec_promote_demote:
16509 return fp ? simd_costs->fp_stmt_cost
16510 : simd_costs->int_stmt_cost;
16512 case vec_construct:
16513 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16514 return elements / 2 + 1;
16516 default:
16517 gcc_unreachable ();
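/* Worked example (illustrative, element count assumed): the vec_construct
   case above charges elements / 2 + 1 units, on the basis that pairs of
   elements can usually be combined before being inserted into the full
   vector.  For a 4-element vector that gives 4 / 2 + 1 == 3 units.  */
static_assert (4 / 2 + 1 == 3, "assumed vec_construct cost");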
16521 /* Check whether an access of kind KIND for STMT_INFO represents one
16522 vector of an LD[234] or ST[234] operation. Return the total number of
16523 vectors (2, 3 or 4) if so, otherwise return a value outside that range. */
16524 static int
16525 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16527 if ((kind == vector_load
16528 || kind == unaligned_load
16529 || kind == vector_store
16530 || kind == unaligned_store)
16531 && STMT_VINFO_DATA_REF (stmt_info))
16533 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16534 if (stmt_info
16535 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16536 return DR_GROUP_SIZE (stmt_info);
16538 return 0;
16541 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16542 vectors would produce a series of LDP or STP operations. KIND is the
16543 kind of statement that STMT_INFO represents. */
16544 static bool
16545 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16546 stmt_vec_info stmt_info)
16548 switch (kind)
16550 case vector_load:
16551 case vector_store:
16552 case unaligned_load:
16553 case unaligned_store:
16554 break;
16556 default:
16557 return false;
16560 return is_gimple_assign (stmt_info->stmt);
16563 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16564 or multiply-subtract sequence that might be suitable for fusing into a
16565 single instruction. If VEC_FLAGS is zero, analyze the operation as
16566 a scalar one, otherwise analyze it as an operation on vectors with those
16567 VEC_* flags. */
16568 static bool
16569 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16570 unsigned int vec_flags)
16572 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16573 if (!assign)
16574 return false;
16575 tree_code code = gimple_assign_rhs_code (assign);
16576 if (code != PLUS_EXPR && code != MINUS_EXPR)
16577 return false;
16579 auto is_mul_result = [&](int i)
16581 tree rhs = gimple_op (assign, i);
16582 /* ??? Should we try to check for a single use as well? */
16583 if (TREE_CODE (rhs) != SSA_NAME)
16584 return false;
16586 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16587 if (!def_stmt_info
16588 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16589 return false;
16590 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16591 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16592 return false;
16594 if (vec_flags & VEC_ADVSIMD)
16596 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16597 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16598 only supports MLA forms, so will require a move if the result
16599 cannot be tied to the accumulator. The most important case in
16600 which this is true is when the accumulator input is invariant. */
16601 rhs = gimple_op (assign, 3 - i);
16602 if (TREE_CODE (rhs) != SSA_NAME)
16603 return false;
16604 def_stmt_info = vinfo->lookup_def (rhs);
16605 if (!def_stmt_info
16606 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
16607 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
16608 return false;
16611 return true;
16614 if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
16615 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16616 multiplication must be on the second operand (to form an FMLS).
16617 But if both operands are multiplications and the second operand
16618 is used more than once, we'll instead negate the second operand
16619 and use it as an accumulator for the first operand. */
16620 return (is_mul_result (2)
16621 && (has_single_use (gimple_assign_rhs2 (assign))
16622 || !is_mul_result (1)));
16624 return is_mul_result (1) || is_mul_result (2);
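/* Illustrative source-level example (not part of GCC) of the two-statement
   pattern that the function above looks for: the multiplication feeds one
   operand of the addition, so AArch64 can usually fuse the pair into a
   single FMLA/MLA (or an FMLS in the MINUS_EXPR case).  */
static inline float
example_multiply_add (float accumulator, float x, float y)
{
  float product = x * y;		/* The MULT_EXPR definition.  */
  return accumulator + product;		/* The PLUS_EXPR costed here.  */
}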
16627 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16628 expression sequence that might be suitable for fusing into a
16629 single instruction. If VEC_FLAGS is zero, analyze the operation as
16630 a scalar one, otherwise analyze it as an operation on vectors with those
16631 VEC_* flags. */
16633 static bool
16634 aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
16635 unsigned int vec_flags)
16637 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16638 if (!assign
16639 || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
16640 || !STMT_VINFO_VECTYPE (stmt_info)
16641 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
16642 return false;
16644 for (int i = 1; i < 3; ++i)
16646 tree rhs = gimple_op (assign, i);
16648 if (TREE_CODE (rhs) != SSA_NAME)
16649 continue;
16651 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16652 if (!def_stmt_info
16653 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16654 continue;
16656 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16657 if (!rhs_assign
16658 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
16659 != tcc_comparison)
16660 continue;
16662 if (vec_flags & VEC_ADVSIMD)
16663 return false;
16665 return true;
16667 return false;
16670 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16671 in-loop reduction that SVE supports directly, return its latency in cycles,
16672 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16673 instructions. */
16674 static unsigned int
16675 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16676 stmt_vec_info stmt_info,
16677 const sve_vec_cost *sve_costs)
16679 switch (vect_reduc_type (vinfo, stmt_info))
16681 case EXTRACT_LAST_REDUCTION:
16682 return sve_costs->clast_cost;
16684 case FOLD_LEFT_REDUCTION:
16685 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16687 case E_HFmode:
16688 case E_BFmode:
16689 return sve_costs->fadda_f16_cost;
16691 case E_SFmode:
16692 return sve_costs->fadda_f32_cost;
16694 case E_DFmode:
16695 return sve_costs->fadda_f64_cost;
16697 default:
16698 break;
16700 break;
16703 return 0;
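/* Illustrative source-level example (not part of GCC) of a reduction that
   is classified as FOLD_LEFT_REDUCTION above: without -ffast-math the FP
   additions must stay in order, which SVE can vectorize using FADDA, with
   the latency taken from the fadda_*_cost fields.  */
static inline double
example_fold_left_reduction (const double *values, int n)
{
  double sum = 0.0;
  for (int i = 0; i < n; ++i)
    sum += values[i];	/* Strictly ordered FP accumulation.  */
  return sum;
}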
16706 /* STMT_INFO describes a loop-carried operation in the original scalar code
16707 that we are considering implementing as a reduction. Return one of the
16708 following values, depending on VEC_FLAGS:
16710 - If VEC_FLAGS is zero, return the loop carry latency of the original
16711 scalar operation.
16713 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16714 Advanced SIMD implementation.
16716 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16717 SVE implementation. */
16718 static unsigned int
16719 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16720 unsigned int vec_flags)
16722 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16723 const sve_vec_cost *sve_costs = nullptr;
16724 if (vec_flags & VEC_ANY_SVE)
16725 sve_costs = aarch64_tune_params.vec_costs->sve;
16727 /* If the caller is asking for the SVE latency, check for forms of reduction
16728 that only SVE can handle directly. */
16729 if (sve_costs)
16731 unsigned int latency
16732 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16733 if (latency)
16734 return latency;
16737 /* Handle scalar costs. */
16738 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16739 if (vec_flags == 0)
16741 if (is_float)
16742 return vec_costs->scalar_fp_stmt_cost;
16743 return vec_costs->scalar_int_stmt_cost;
16746 /* Otherwise, the loop body just contains normal integer or FP operations,
16747 with a vector reduction outside the loop. */
16748 const simd_vec_cost *simd_costs
16749 = aarch64_simd_vec_costs_for_flags (vec_flags);
16750 if (is_float)
16751 return simd_costs->fp_stmt_cost;
16752 return simd_costs->int_stmt_cost;
16755 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16756 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16757 try to subdivide the target-independent categorization provided by KIND
16758 to get a more accurate cost. */
16759 static fractional_cost
16760 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16761 stmt_vec_info stmt_info,
16762 fractional_cost stmt_cost)
16764 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16765 the extension with the load. */
16766 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16767 return 0;
16769 return stmt_cost;
16772 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16773 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16774 when vectorized would operate on vector type VECTYPE. Try to subdivide
16775 the target-independent categorization provided by KIND to get a more
16776 accurate cost. WHERE specifies where the cost associated with KIND
16777 occurs. */
16778 static fractional_cost
16779 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16780 stmt_vec_info stmt_info, tree vectype,
16781 enum vect_cost_model_location where,
16782 fractional_cost stmt_cost)
16784 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16785 const sve_vec_cost *sve_costs = nullptr;
16786 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16787 sve_costs = aarch64_tune_params.vec_costs->sve;
16789 /* It's generally better to avoid costing inductions, since the induction
16790 will usually be hidden by other operations. This is particularly true
16791 for things like COND_REDUCTIONS. */
16792 if (is_a<gphi *> (stmt_info->stmt))
16793 return 0;
16795 /* Detect cases in which vec_to_scalar is describing the extraction of a
16796 vector element in preparation for a scalar store. The store itself is
16797 costed separately. */
16798 if (vect_is_store_elt_extraction (kind, stmt_info))
16799 return simd_costs->store_elt_extra_cost;
16801 /* Detect SVE gather loads, which are costed as a single scalar_load
16802 for each element. We therefore need to divide the full-instruction
16803 cost by the number of elements in the vector. */
16804 if (kind == scalar_load
16805 && sve_costs
16806 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16808 unsigned int nunits = vect_nunits_for_cost (vectype);
16809 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16810 return { sve_costs->gather_load_x64_cost, nunits };
16811 return { sve_costs->gather_load_x32_cost, nunits };
16814 /* Detect cases in which a scalar_store is really storing one element
16815 in a scatter operation. */
16816 if (kind == scalar_store
16817 && sve_costs
16818 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16819 return sve_costs->scatter_store_elt_cost;
16821 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16822 if (kind == vec_to_scalar
16823 && where == vect_body
16824 && sve_costs)
16826 unsigned int latency
16827 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16828 if (latency)
16829 return latency;
16832 /* Detect cases in which vec_to_scalar represents a single reduction
16833 instruction like FADDP or MAXV. */
16834 if (kind == vec_to_scalar
16835 && where == vect_epilogue
16836 && vect_is_reduction (stmt_info))
16837 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16839 case E_QImode:
16840 return simd_costs->reduc_i8_cost;
16842 case E_HImode:
16843 return simd_costs->reduc_i16_cost;
16845 case E_SImode:
16846 return simd_costs->reduc_i32_cost;
16848 case E_DImode:
16849 return simd_costs->reduc_i64_cost;
16851 case E_HFmode:
16852 case E_BFmode:
16853 return simd_costs->reduc_f16_cost;
16855 case E_SFmode:
16856 return simd_costs->reduc_f32_cost;
16858 case E_DFmode:
16859 return simd_costs->reduc_f64_cost;
16861 default:
16862 break;
16865 /* Otherwise stick with the original categorization. */
16866 return stmt_cost;
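/* Worked example (illustrative, numbers assumed): for an SVE gather with
   a gather_load_x32_cost of 12 and 8 32-bit elements per vector, each
   element's scalar_load is costed as the fraction 12/8 = 1.5, so the
   eight per-element costs sum back to the 12-unit instruction cost.  */
static_assert (8 * (12.0 / 8) == 12.0, "assumed SVE gather cost");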
16869 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16870 for STMT_INFO, which has cost kind KIND and which when vectorized would
16871 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16872 targets. */
16873 static fractional_cost
16874 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16875 stmt_vec_info stmt_info, tree vectype,
16876 fractional_cost stmt_cost)
16878 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16879 vector register size or number of units. Integer promotions of this
16880 type therefore map to SXT[BHW] or UXT[BHW].
16882 Most loads have extending forms that can do the sign or zero extension
16883 on the fly. Optimistically assume that a load followed by an extension
16884 will fold to this form during combine, and that the extension therefore
16885 comes for free. */
16886 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16887 stmt_cost = 0;
16889 /* For similar reasons, vector_stmt integer truncations are a no-op,
16890 because we can just ignore the unused upper bits of the source. */
16891 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16892 stmt_cost = 0;
16894 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16895 but there are no equivalent instructions for SVE. This means that
16896 (all other things being equal) 128-bit SVE needs twice as many load
16897 and store instructions as Advanced SIMD in order to process vector pairs.
16899 Also, scalar code can often use LDP and STP to access pairs of values,
16900 so it is too simplistic to say that one SVE load or store replaces
16901 VF scalar loads and stores.
16903 Ideally we would account for this in the scalar and Advanced SIMD
16904 costs by making suitable load/store pairs as cheap as a single
16905 load/store. However, that would be a very invasive change and in
16906 practice it tends to stress other parts of the cost model too much.
16907 E.g. stores of scalar constants currently count just a store,
16908 whereas stores of vector constants count a store and a vec_init.
16909 This is an artificial distinction for AArch64, where stores of
16910 nonzero scalar constants need the same kind of register invariant
16911 as vector stores.
16913 An alternative would be to double the cost of any SVE loads and stores
16914 that could be paired in Advanced SIMD (and possibly also paired in
16915 scalar code). But this tends to stress other parts of the cost model
16916 in the same way. It also means that we can fall back to Advanced SIMD
16917 even if full-loop predication would have been useful.
16919 Here we go for a more conservative version: double the costs of SVE
16920 loads and stores if one iteration of the scalar loop processes enough
16921 elements for it to use a whole number of Advanced SIMD LDP or STP
16922 instructions. This makes it very likely that the VF would be 1 for
16923 Advanced SIMD, and so no epilogue should be needed. */
16924 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16926 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16927 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16928 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16929 if (multiple_p (count * elt_bits, 256)
16930 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16931 stmt_cost *= 2;
16934 return stmt_cost;
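/* Worked example (illustrative, numbers assumed): a grouped access of
   4 DImode elements per scalar iteration covers 4 * 64 == 256 bits,
   i.e. a whole number of Advanced SIMD LDP/STP pairs, so the SVE
   load/store cost above is doubled.  A 3-element SImode group covers
   only 96 bits and is left alone.  */
static_assert ((4 * 64) % 256 == 0, "assumed LDP/STP-sized group");
static_assert ((3 * 32) % 256 != 0, "assumed non-LDP/STP-sized group");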
16937 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16938 and which when vectorized would operate on vector type VECTYPE. Add the
16939 cost of any embedded operations. */
16940 static fractional_cost
16941 aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
16942 stmt_vec_info stmt_info, tree vectype,
16943 unsigned vec_flags, fractional_cost stmt_cost)
16945 if (vectype)
16947 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16949 /* Detect cases in which a vector load or store represents an
16950 LD[234] or ST[234] instruction. */
16951 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16953 case 2:
16954 stmt_cost += simd_costs->ld2_st2_permute_cost;
16955 break;
16957 case 3:
16958 stmt_cost += simd_costs->ld3_st3_permute_cost;
16959 break;
16961 case 4:
16962 stmt_cost += simd_costs->ld4_st4_permute_cost;
16963 break;
16966 gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
16967 if ((kind == scalar_stmt || kind == vector_stmt) && assign)
16969 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
16970 if (!vect_is_reduction (stmt_info)
16971 && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
16972 return 0;
16974 /* For vector boolean ANDs with a compare operand we just need
16975 one insn. */
16976 if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
16977 return 0;
16980 if (kind == vector_stmt || kind == vec_to_scalar)
16981 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16983 if (FLOAT_TYPE_P (cmp_type))
16984 stmt_cost += simd_costs->fp_stmt_cost;
16985 else
16986 stmt_cost += simd_costs->int_stmt_cost;
16990 if (kind == scalar_stmt)
16991 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16993 if (FLOAT_TYPE_P (cmp_type))
16994 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16995 else
16996 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16999 return stmt_cost;
17002 /* Return true if STMT_INFO is part of a reduction that has the form:
17004 r = r op ...;
17005 r = r op ...;
17007 with the single accumulator being read and written multiple times. */
17008 static bool
17009 aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
17011 if (!STMT_VINFO_REDUC_DEF (stmt_info))
17012 return false;
17014 auto reduc_info = info_for_reduction (vinfo, stmt_info);
17015 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
17018 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
17019 and they describe an operation in the body of a vector loop. Record issue
17020 information relating to the vector operation in OPS. */
17021 void
17022 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
17023 stmt_vec_info stmt_info,
17024 aarch64_vec_op_count *ops)
17026 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
17027 if (!base_issue)
17028 return;
17029 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
17030 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
17032 /* Calculate the minimum cycles per iteration imposed by a reduction
17033 operation. */
17034 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17035 && vect_is_reduction (stmt_info))
17037 unsigned int base
17038 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
17039 if (aarch64_force_single_cycle (m_vinfo, stmt_info))
17040 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17041 and then accumulate that, but at the moment the loop-carried
17042 dependency includes all copies. */
17043 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
17044 else
17045 ops->reduction_latency = MAX (ops->reduction_latency, base);
17048 if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))
17050 /* Assume that multiply-adds will become a single operation. */
17051 if (aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
17052 return;
17054 /* Assume that bool AND with compare operands will become a single
17055 operation. */
17056 if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
17057 return;
17061 /* Count the basic operation cost associated with KIND. */
17062 switch (kind)
17064 case cond_branch_taken:
17065 case cond_branch_not_taken:
17066 case vector_gather_load:
17067 case vector_scatter_store:
17068 /* We currently don't expect these to be used in a loop body. */
17069 break;
17071 case vec_perm:
17072 case vec_promote_demote:
17073 case vec_construct:
17074 case vec_to_scalar:
17075 case scalar_to_vec:
17076 case vector_stmt:
17077 case scalar_stmt:
17078 ops->general_ops += count;
17079 break;
17081 case scalar_load:
17082 case vector_load:
17083 case unaligned_load:
17084 ops->loads += count;
17085 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17086 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
17087 break;
17089 case vector_store:
17090 case unaligned_store:
17091 case scalar_store:
17092 ops->stores += count;
17093 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17094 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
17095 break;
17098 /* Add any embedded comparison operations. */
17099 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17100 && vect_embedded_comparison_type (stmt_info))
17101 ops->general_ops += count;
17103 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17104 have only accounted for one. */
17105 if ((kind == vector_stmt || kind == vec_to_scalar)
17106 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
17107 ops->general_ops += count;
17109 /* Count the predicate operations needed by an SVE comparison. */
17110 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
17111 if (tree type = vect_comparison_type (stmt_info))
17113 unsigned int base = (FLOAT_TYPE_P (type)
17114 ? sve_issue->fp_cmp_pred_ops
17115 : sve_issue->int_cmp_pred_ops);
17116 ops->pred_ops += base * count;
17119 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17120 if (simd_issue)
17121 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
17123 case 2:
17124 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
17125 break;
17127 case 3:
17128 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
17129 break;
17131 case 4:
17132 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
17133 break;
17136 /* Add any overhead associated with gather loads and scatter stores. */
17137 if (sve_issue
17138 && (kind == scalar_load || kind == scalar_store)
17139 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
17141 unsigned int pairs = CEIL (count, 2);
17142 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
17143 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
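/* Worked example (illustrative, numbers assumed): an SVE gather of 8
   elements is costed above as CEIL (8, 2) == 4 element pairs, so it adds
   4 * gather_scatter_pair_pred_ops predicate operations and
   4 * gather_scatter_pair_general_ops general operations.  */
static_assert (CEIL (8, 2) == 4, "assumed gather element pairs");
static_assert (CEIL (7, 2) == 4, "assumed rounding up for odd counts");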
17147 /* Return true if STMT_INFO contains a memory access and if the constant
17148 component of the memory address is aligned to SIZE bytes. */
17149 static bool
17150 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
17151 poly_uint64 size)
17153 if (!STMT_VINFO_DATA_REF (stmt_info))
17154 return false;
17156 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
17157 stmt_info = first_stmt;
17158 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
17159 /* Needed for gathers & scatters, for example. */
17160 if (!constant_offset)
17161 return false;
17163 return multiple_p (wi::to_poly_offset (constant_offset), size);
17166 /* Check if a scalar or vector stmt could be part of a region of code
17167 that does nothing more than store values to memory, in the scalar
17168 case using STP. Return the cost of the stmt if so, counting 2 for
17169 one instruction. Return ~0U otherwise.
17171 The arguments are a subset of those passed to add_stmt_cost. */
17172 unsigned int
17173 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
17174 stmt_vec_info stmt_info, tree vectype)
17176 /* Code that stores vector constants uses a vector_load to create
17177 the constant. We don't apply the heuristic to that case for two
17178 main reasons:
17180 - At the moment, STPs are only formed via peephole2, and the
17181 constant scalar moves would often come between STRs and so
17182 prevent STP formation.
17184 - The scalar code also has to load the constant somehow, and that
17185 isn't costed. */
17186 switch (kind)
17188 case scalar_to_vec:
17189 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17190 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
17192 case vec_construct:
17193 if (FLOAT_TYPE_P (vectype))
17194 /* Count 1 insn for the maximum number of FP->SIMD INS
17195 instructions. */
17196 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
17198 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17199 maximum number of GPR->SIMD INS instructions. */
17200 return vect_nunits_for_cost (vectype) * 4 * count;
17202 case vector_store:
17203 case unaligned_store:
17204 /* Count 1 insn per vector if we can't form STP Q pairs. */
17205 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17206 return count * 2;
17208 if (stmt_info)
17210 /* Assume we won't be able to use STP if the constant offset
17211 component of the address is misaligned. ??? This could be
17212 removed if we formed STP pairs earlier, rather than relying
17213 on peephole2. */
17214 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
17215 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17216 return count * 2;
17218 return CEIL (count, 2) * 2;
17220 case scalar_store:
17221 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
17223 /* Check for a mode in which STP pairs can be formed. */
17224 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
17225 if (maybe_ne (size, 4) && maybe_ne (size, 8))
17226 return ~0U;
17228 /* Assume we won't be able to use STP if the constant offset
17229 component of the address is misaligned. ??? This could be
17230 removed if we formed STP pairs earlier, rather than relying
17231 on peephole2. */
17232 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17233 return ~0U;
17235 return count;
17237 default:
17238 return ~0U;
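/* Worked example (illustrative, all counts assumed): storing a GPR value
   to four consecutive 64-bit slots is costed here at 4 units as scalar
   code (two STP instructions at 2 units each), whereas a V2DImode vector
   version needs a GPR->SIMD dup (4 units) plus paired Q stores (2 units),
   so the scalar sequence is cheaper and the comparison in finish_cost
   ends up preferring it.  */
static_assert (4 < 4 + 2, "assumed STP sequence comparison");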
17242 unsigned
17243 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
17244 stmt_vec_info stmt_info, slp_tree,
17245 tree vectype, int misalign,
17246 vect_cost_model_location where)
17248 fractional_cost stmt_cost
17249 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
17251 bool in_inner_loop_p = (where == vect_body
17252 && stmt_info
17253 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
17255 /* Do one-time initialization based on the vinfo. */
17256 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17257 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
17259 if (loop_vinfo)
17260 analyze_loop_vinfo (loop_vinfo);
17262 m_analyzed_vinfo = true;
17265 /* Apply the heuristic described above m_stp_sequence_cost. */
17266 if (m_stp_sequence_cost != ~0U)
17268 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
17269 stmt_info, vectype);
17270 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
17273 /* Try to get a more accurate cost by looking at STMT_INFO instead
17274 of just looking at KIND. */
17275 if (stmt_info && aarch64_use_new_vector_costs_p ())
17277 /* If we scalarize a strided store, the vectorizer costs one
17278 vec_to_scalar for each element. However, we can store the first
17279 element using an FP store without a separate extract step. */
17280 if (vect_is_store_elt_extraction (kind, stmt_info))
17281 count -= 1;
17283 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
17284 stmt_info, stmt_cost);
17286 if (vectype && m_vec_flags)
17287 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
17288 stmt_info, vectype,
17289 where, stmt_cost);
17292 /* Do any SVE-specific adjustments to the cost. */
17293 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
17294 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17295 vectype, stmt_cost);
17297 /* Vector promotion and demotion requires us to widen the operation first
17298 and only after that perform the conversion. Unfortunately the mid-end
17299 expects this to be doable as a single operation and doesn't pass on
17300 enough context here for us to tell which operation is happening. To
17301 account for this we count every promote-demote operation twice, and if
17302 the previously costed operation was also a promote-demote we reduce
17303 the cost of the operation currently being costed to simulate the final
17304 conversion cost. Note that for SVE we can do better here if the converted
17305 value comes from a load since the widening load would consume the widening
17306 operations. However since we're in stage 3 we can't change the helper
17307 vect_is_extending_load and duplicating the code seems not useful. */
17308 gassign *assign = NULL;
17309 if (kind == vec_promote_demote
17310 && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
17311 && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
17313 auto new_count = count * 2 - m_num_last_promote_demote;
17314 m_num_last_promote_demote = count;
17315 count = new_count;
17317 else
17318 m_num_last_promote_demote = 0;
17320 if (stmt_info && aarch64_use_new_vector_costs_p ())
17322 /* Account for any extra "embedded" costs that apply additively
17323 to the base cost calculated above. */
17324 stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17325 vectype, m_vec_flags, stmt_cost);
17327 /* If we're recording a nonzero vector loop body cost for the
17328 innermost loop, also estimate the operations that would need
17329 to be issued by all relevant implementations of the loop. */
17330 if (loop_vinfo
17331 && (m_costing_for_scalar || where == vect_body)
17332 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
17333 && stmt_cost != 0)
17334 for (auto &ops : m_ops)
17335 count_ops (count, kind, stmt_info, &ops);
17337 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17338 estimate the number of statements in the unrolled Advanced SIMD
17339 loop. For simplicity, we assume that one iteration of the
17340 Advanced SIMD loop would need the same number of statements
17341 as one iteration of the SVE loop. */
17342 if (where == vect_body && m_unrolled_advsimd_niters)
17343 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
17345 /* Detect the use of an averaging operation. */
17346 gimple *stmt = stmt_info->stmt;
17347 if (is_gimple_call (stmt)
17348 && gimple_call_internal_p (stmt))
17350 switch (gimple_call_internal_fn (stmt))
17352 case IFN_AVG_FLOOR:
17353 case IFN_AVG_CEIL:
17354 m_has_avg = true;
17355 default:
17356 break;
17361 /* If the statement stores to a decl that is known to be the argument
17362 to a vld1 in the same function, ignore the store for costing purposes.
17363 See the comment above m_stores_to_vector_load_decl for more details. */
17364 if (stmt_info
17365 && (kind == vector_store || kind == unaligned_store)
17366 && aarch64_accesses_vector_load_decl_p (stmt_info))
17368 stmt_cost = 0;
17369 m_stores_to_vector_load_decl = true;
17372 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
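/* Worked example (illustrative, counts assumed) of the promote/demote
   adjustment above: the first FLOAT_EXPR promotion costed with COUNT == 2
   becomes 2 * 2 - 0 == 4 copies, and an immediately following one becomes
   2 * 2 - 2 == 2, simulating the shared final conversion step.  */
static_assert (2 * 2 - 0 == 4, "assumed first promote/demote count");
static_assert (2 * 2 - 2 == 2, "assumed follow-on promote/demote count");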
17375 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17376 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17377 says that we should prefer the Advanced SIMD loop. */
17378 bool
17379 aarch64_vector_costs::prefer_unrolled_loop () const
17381 if (!m_unrolled_advsimd_stmts)
17382 return false;
17384 if (dump_enabled_p ())
17385 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
17386 " unrolled Advanced SIMD loop = "
17387 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
17388 m_unrolled_advsimd_stmts);
17390 /* The balance here is tricky. On the one hand, we can't be sure whether
17391 the code is vectorizable with Advanced SIMD or not. However, even if
17392 it isn't vectorizable with Advanced SIMD, there's a possibility that
17393 the scalar code could also be unrolled. Some of the code might then
17394 benefit from SLP, or from using LDP and STP. We therefore apply
17395 the heuristic regardless of can_use_advsimd_p. */
17396 return (m_unrolled_advsimd_stmts
17397 && (m_unrolled_advsimd_stmts
17398 <= (unsigned int) param_max_completely_peeled_insns));
17401 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
17402 how fast the SVE code can be issued and compare it to the equivalent value
17403 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
17404 also compare it to the issue rate of Advanced SIMD code
17405 (ADVSIMD_CYCLES_PER_ITER).
17407 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17408 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17409 is true if we think the loop body is too expensive. */
17411 fractional_cost
17412 aarch64_vector_costs::
17413 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
17414 fractional_cost scalar_cycles_per_iter,
17415 unsigned int orig_body_cost, unsigned int *body_cost,
17416 bool *should_disparage)
17418 if (dump_enabled_p ())
17419 ops->dump ();
17421 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
17422 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
17424 /* If the scalar version of the loop could issue at least as
17425 quickly as the predicate parts of the SVE loop, make the SVE loop
17426 prohibitively expensive. In this case vectorization is adding an
17427 overhead that the original scalar code didn't have.
17429 This is mostly intended to detect cases in which WHILELOs dominate
17430 for very tight loops, which is something that normal latency-based
17431 costs would not model. Adding this kind of cliffedge would be
17432 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17433 code in the caller handles that case in a more conservative way. */
17434 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17435 if (scalar_cycles_per_iter < sve_estimate)
17437 unsigned int min_cost
17438 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17439 if (*body_cost < min_cost)
17441 if (dump_enabled_p ())
17442 dump_printf_loc (MSG_NOTE, vect_location,
17443 "Increasing body cost to %d because the"
17444 " scalar code could issue within the limit"
17445 " imposed by predicate operations\n",
17446 min_cost);
17447 *body_cost = min_cost;
17448 *should_disparage = true;
17452 return sve_cycles_per_iter;
17455 unsigned int
17456 aarch64_vector_costs::determine_suggested_unroll_factor ()
17458 bool sve = m_vec_flags & VEC_ANY_SVE;
17459 /* If we are trying to unroll an Advanced SIMD main loop that contains
17460 an averaging operation that we do not support with SVE, and we might use
17461 a predicated epilogue, be conservative and block unrolling, since it
17462 could leave a less optimal loop for the first and only epilogue, which
17463 uses the original loop's vectorization factor.
17464 TODO: Remove this constraint when we add support for multiple epilogue
17465 vectorization. */
17466 if (!sve && !TARGET_SVE2 && m_has_avg)
17467 return 1;
17469 unsigned int max_unroll_factor = 1;
17470 for (auto vec_ops : m_ops)
17472 aarch64_simd_vec_issue_info const *vec_issue
17473 = vec_ops.simd_issue_info ();
17474 if (!vec_issue)
17475 return 1;
17476 /* Limit the unroll factor to a value the user can adjust; the default
17477 value is 4. */
17478 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17479 unsigned int factor
17480 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17481 unsigned int temp;
17483 /* Sanity check; this should never happen. */
17484 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17485 return 1;
17487 /* Check stores. */
17488 if (vec_ops.stores > 0)
17490 temp = CEIL (factor * vec_issue->stores_per_cycle,
17491 vec_ops.stores);
17492 unroll_factor = MIN (unroll_factor, temp);
17495 /* Check loads + stores. */
17496 if (vec_ops.loads > 0)
17498 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17499 vec_ops.loads + vec_ops.stores);
17500 unroll_factor = MIN (unroll_factor, temp);
17503 /* Check general ops. */
17504 if (vec_ops.general_ops > 0)
17506 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17507 vec_ops.general_ops);
17508 unroll_factor = MIN (unroll_factor, temp);
17510 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17513 /* Make sure unroll factor is power of 2. */
17514 return 1 << ceil_log2 (max_unroll_factor);
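/* Worked example (illustrative, all numbers assumed): with one store, one
   load and two general ops per vector iteration, issue rates of 2 stores,
   3 loads+stores and 4 general ops per cycle, no multi-cycle reduction
   (FACTOR == 1) and an unroll limit of at least 2, each limit above comes
   out as 2, so the suggested unroll factor is 2 (already a power of 2).  */
static_assert (CEIL (1 * 2, 1) == 2, "assumed store-limited factor");
static_assert (CEIL (1 * 3, 1 + 1) == 2, "assumed load/store-limited factor");
static_assert (CEIL (1 * 4, 2) == 2, "assumed general-op-limited factor");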
17517 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17518 and return the new cost. */
17519 unsigned int
17520 aarch64_vector_costs::
17521 adjust_body_cost (loop_vec_info loop_vinfo,
17522 const aarch64_vector_costs *scalar_costs,
17523 unsigned int body_cost)
17525 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17526 return body_cost;
17528 const auto &scalar_ops = scalar_costs->m_ops[0];
17529 const auto &vector_ops = m_ops[0];
17530 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17531 unsigned int orig_body_cost = body_cost;
17532 bool should_disparage = false;
17534 if (dump_enabled_p ())
17535 dump_printf_loc (MSG_NOTE, vect_location,
17536 "Original vector body cost = %d\n", body_cost);
17538 fractional_cost scalar_cycles_per_iter
17539 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17541 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17543 if (dump_enabled_p ())
17545 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17546 dump_printf_loc (MSG_NOTE, vect_location,
17547 "Vector loop iterates at most %wd times\n",
17548 m_num_vector_iterations);
17549 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17550 scalar_ops.dump ();
17551 dump_printf_loc (MSG_NOTE, vect_location,
17552 " estimated cycles per vector iteration"
17553 " (for VF %d) = %f\n",
17554 estimated_vf, scalar_cycles_per_iter.as_double ());
17557 if (vector_ops.sve_issue_info ())
17559 if (dump_enabled_p ())
17560 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17561 vector_cycles_per_iter
17562 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17563 orig_body_cost, &body_cost, &should_disparage);
17565 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17567 /* Also take Neoverse V1 tuning into account, doubling the
17568 scalar and Advanced SIMD estimates to account for the
17569 doubling in SVE vector length. */
17570 if (dump_enabled_p ())
17571 dump_printf_loc (MSG_NOTE, vect_location,
17572 "Neoverse V1 estimate:\n");
17573 auto vf_factor = m_ops[1].vf_factor ();
17574 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17575 orig_body_cost, &body_cost, &should_disparage);
17578 else
17580 if (dump_enabled_p ())
17582 dump_printf_loc (MSG_NOTE, vect_location,
17583 "Vector issue estimate:\n");
17584 vector_ops.dump ();
17588 /* Decide whether to stick to latency-based costs or whether to try to
17589 take issue rates into account. */
17590 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17591 if (m_vec_flags & VEC_ANY_SVE)
17592 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17594 if (m_num_vector_iterations >= 1
17595 && m_num_vector_iterations < threshold)
17597 if (dump_enabled_p ())
17598 dump_printf_loc (MSG_NOTE, vect_location,
17599 "Low iteration count, so using pure latency"
17600 " costs\n");
17602 /* Increase the cost of the vector code if it looks like the scalar code
17603 could issue more quickly. These values are only rough estimates,
17604 so minor differences should only result in minor changes. */
17605 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17607 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17608 scalar_cycles_per_iter);
17609 if (dump_enabled_p ())
17610 dump_printf_loc (MSG_NOTE, vect_location,
17611 "Increasing body cost to %d because scalar code"
17612 " would issue more quickly\n", body_cost);
17614 /* In general, it's expected that the proposed vector code would be able
17615 to issue more quickly than the original scalar code. This should
17616 already be reflected to some extent in the latency-based costs.
17618 However, the latency-based costs effectively assume that the scalar
17619 code and the vector code execute serially, which tends to underplay
17620 one important case: if the real (non-serialized) execution time of
17621 a scalar iteration is dominated by loop-carried dependencies,
17622 and if the vector code is able to reduce both the length of
17623 the loop-carried dependencies *and* the number of cycles needed
17624 to issue the code in general, we can be more confident that the
17625 vector code is an improvement, even if adding the other (non-loop-carried)
17626 latencies tends to hide this saving. We therefore reduce the cost of the
17627 vector loop body in proportion to the saving. */
17628 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17629 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17630 && scalar_cycles_per_iter > vector_cycles_per_iter
17631 && !should_disparage)
17633 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17634 scalar_cycles_per_iter);
17635 if (dump_enabled_p ())
17636 dump_printf_loc (MSG_NOTE, vect_location,
17637 "Decreasing body cost to %d account for smaller"
17638 " reduction latency\n", body_cost);
17641 return body_cost;
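/* Worked example (illustrative, numbers assumed) of the scaling above: if
   the scalar code could issue one vector iteration's worth of work in 4
   cycles but the vector code needs 6, a body cost of 20 is scaled by 6/4
   up to 30; the reduction-latency case scales the cost down by the same
   kind of ratio.  */
static_assert (20 * 6 / 4 == 30, "assumed issue-rate scaling");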
17644 void
17645 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17647 /* Record the issue information for any SVE WHILE instructions that the
17648 loop needs. */
17649 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17650 if (!m_ops.is_empty ()
17651 && loop_vinfo
17652 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
17654 unsigned int num_masks = 0;
17655 rgroup_controls *rgm;
17656 unsigned int num_vectors_m1;
17657 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
17658 num_vectors_m1, rgm)
17659 if (rgm->type)
17660 num_masks += num_vectors_m1 + 1;
17661 for (auto &ops : m_ops)
17662 if (auto *issue = ops.sve_issue_info ())
17663 ops.pred_ops += num_masks * issue->while_pred_ops;
17666 auto *scalar_costs
17667 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17668 if (loop_vinfo
17669 && m_vec_flags
17670 && aarch64_use_new_vector_costs_p ())
17672 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17673 m_costs[vect_body]);
17674 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17677 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17678 the scalar code in the event of a tie, since there is more chance
17679 of scalar code being optimized with surrounding operations.
17681 In addition, if the vector body is a simple store to a decl that
17682 is elsewhere loaded using vld1, strongly prefer the vector form,
17683 to the extent of giving the prologue a zero cost. See the comment
17684 above m_stores_to_vector_load_decl for details. */
17685 if (!loop_vinfo
17686 && scalar_costs
17687 && m_stp_sequence_cost != ~0U)
17689 if (m_stores_to_vector_load_decl)
17690 m_costs[vect_prologue] = 0;
17691 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17692 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17695 vector_costs::finish_cost (scalar_costs);
17698 bool
17699 aarch64_vector_costs::
17700 better_main_loop_than_p (const vector_costs *uncast_other) const
17702 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17704 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17705 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17707 if (dump_enabled_p ())
17708 dump_printf_loc (MSG_NOTE, vect_location,
17709 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17710 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17711 vect_vf_for_cost (this_loop_vinfo),
17712 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17713 vect_vf_for_cost (other_loop_vinfo));
17715 /* Apply the unrolling heuristic described above
17716 m_unrolled_advsimd_niters. */
17717 if (bool (m_unrolled_advsimd_stmts)
17718 != bool (other->m_unrolled_advsimd_stmts))
17720 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17721 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17722 if (this_prefer_unrolled != other_prefer_unrolled)
17724 if (dump_enabled_p ())
17725 dump_printf_loc (MSG_NOTE, vect_location,
17726 "Preferring Advanced SIMD loop because"
17727 " it can be unrolled\n");
17728 return other_prefer_unrolled;
17732 for (unsigned int i = 0; i < m_ops.length (); ++i)
17734 if (dump_enabled_p ())
17736 if (i)
17737 dump_printf_loc (MSG_NOTE, vect_location,
17738 "Reconsidering with subtuning %d\n", i);
17739 dump_printf_loc (MSG_NOTE, vect_location,
17740 "Issue info for %s loop:\n",
17741 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17742 this->m_ops[i].dump ();
17743 dump_printf_loc (MSG_NOTE, vect_location,
17744 "Issue info for %s loop:\n",
17745 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17746 other->m_ops[i].dump ();
17749 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17750 * this->m_ops[i].vf_factor ());
17751 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17752 * other->m_ops[i].vf_factor ());
17754 /* If it appears that one loop could process the same amount of data
17755 in fewer cycles, prefer that loop over the other one. */
17756 fractional_cost this_cost
17757 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17758 fractional_cost other_cost
17759 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17760 if (dump_enabled_p ())
17762 dump_printf_loc (MSG_NOTE, vect_location,
17763 "Weighted cycles per iteration of %s loop ~= %f\n",
17764 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17765 this_cost.as_double ());
17766 dump_printf_loc (MSG_NOTE, vect_location,
17767 "Weighted cycles per iteration of %s loop ~= %f\n",
17768 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17769 other_cost.as_double ());
17771 if (this_cost != other_cost)
17773 if (dump_enabled_p ())
17774 dump_printf_loc (MSG_NOTE, vect_location,
17775 "Preferring loop with lower cycles"
17776 " per iteration\n");
17777 return this_cost < other_cost;
17780 /* If the issue rate of SVE code is limited by predicate operations
17781 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17782 and if Advanced SIMD code could issue within the limit imposed
17783 by the predicate operations, the predicate operations are adding an
17784 overhead that the original code didn't have and so we should prefer
17785 the Advanced SIMD version. */
17786 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17787 const aarch64_vec_op_count &b) -> bool
17789 if (a.pred_ops == 0
17790 && (b.min_pred_cycles_per_iter ()
17791 > b.min_nonpred_cycles_per_iter ()))
17793 if (dump_enabled_p ())
17794 dump_printf_loc (MSG_NOTE, vect_location,
17795 "Preferring Advanced SIMD loop since"
17796 " SVE loop is predicate-limited\n");
17797 return true;
17799 return false;
17801 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17802 return true;
17803 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17804 return false;
17807 return vector_costs::better_main_loop_than_p (other);
17810 static void initialize_aarch64_code_model (struct gcc_options *);
17812 /* Parse the TO_PARSE string and put the architecture struct that it
17813 selects into RES and the architectural features into ISA_FLAGS.
17814 Return an aarch_parse_opt_result describing the parse result.
17815 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17816 When the TO_PARSE string contains an invalid extension,
17817 a copy of the string is created and stored to INVALID_EXTENSION. */
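/* For example (purely illustrative), an -march value of "armv8.2-a+sve"
   would match the "armv8.2-a" entry in all_architectures and then hand
   "+sve" to aarch64_parse_extension to update the ISA flags.  */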
17819 static enum aarch_parse_opt_result
17820 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17821 aarch64_feature_flags *isa_flags,
17822 std::string *invalid_extension)
17824 const char *ext;
17825 const struct processor *arch;
17826 size_t len;
17828 ext = strchr (to_parse, '+');
17830 if (ext != NULL)
17831 len = ext - to_parse;
17832 else
17833 len = strlen (to_parse);
17835 if (len == 0)
17836 return AARCH_PARSE_MISSING_ARG;
17839 /* Loop through the list of supported ARCHes to find a match. */
17840 for (arch = all_architectures; arch->name != NULL; arch++)
17842 if (strlen (arch->name) == len
17843 && strncmp (arch->name, to_parse, len) == 0)
17845 auto isa_temp = arch->flags;
17847 if (ext != NULL)
17849 /* TO_PARSE string contains at least one extension. */
17850 enum aarch_parse_opt_result ext_res
17851 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17853 if (ext_res != AARCH_PARSE_OK)
17854 return ext_res;
17856 /* Extension parsing was successful. Confirm the result
17857 arch and ISA flags. */
17858 *res = arch;
17859 *isa_flags = isa_temp;
17860 return AARCH_PARSE_OK;
17864 /* ARCH name not found in list. */
17865 return AARCH_PARSE_INVALID_ARG;
17868 /* Parse the TO_PARSE string and put the CPU struct that it selects into RES
17869 and the architectural features into ISA_FLAGS. Return an aarch_parse_opt_result
17870 describing the parse result. If there is an error parsing, RES and
17871 ISA_FLAGS are left unchanged.
17872 When the TO_PARSE string contains an invalid extension,
17873 a copy of the string is created and stored to INVALID_EXTENSION. */
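/* For example (purely illustrative), an -mcpu value of "cortex-a76+crypto"
   would match the "cortex-a76" entry in all_cores and then hand "+crypto"
   to aarch64_parse_extension.  */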
17875 static enum aarch_parse_opt_result
17876 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17877 aarch64_feature_flags *isa_flags,
17878 std::string *invalid_extension)
17880 const char *ext;
17881 const struct processor *cpu;
17882 size_t len;
17884 ext = strchr (to_parse, '+');
17886 if (ext != NULL)
17887 len = ext - to_parse;
17888 else
17889 len = strlen (to_parse);
17891 if (len == 0)
17892 return AARCH_PARSE_MISSING_ARG;
17895 /* Loop through the list of supported CPUs to find a match. */
17896 for (cpu = all_cores; cpu->name != NULL; cpu++)
17898 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17900 auto isa_temp = cpu->flags;
17902 if (ext != NULL)
17904 /* TO_PARSE string contains at least one extension. */
17905 enum aarch_parse_opt_result ext_res
17906 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17908 if (ext_res != AARCH_PARSE_OK)
17909 return ext_res;
17911 /* Extension parsing was successful. Confirm the result
17912 cpu and ISA flags. */
17913 *res = cpu;
17914 *isa_flags = isa_temp;
17915 return AARCH_PARSE_OK;
17919 /* CPU name not found in list. */
17920 return AARCH_PARSE_INVALID_ARG;
17923 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17924 Return an aarch_parse_opt_result describing the parse result.
17925 If the parsing fails, RES is left unchanged. */
17927 static enum aarch_parse_opt_result
17928 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17930 const struct processor *cpu;
17932 /* Loop through the list of supported CPUs to find a match. */
17933 for (cpu = all_cores; cpu->name != NULL; cpu++)
17935 if (strcmp (cpu->name, to_parse) == 0)
17937 *res = cpu;
17938 return AARCH_PARSE_OK;
17942 /* CPU name not found in list. */
17943 return AARCH_PARSE_INVALID_ARG;
17946 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17947 described in FLAG. If it is, return the index bit for that fusion type.
17948 If not, error (printing OPTION_NAME) and return zero. */
17950 static unsigned int
17951 aarch64_parse_one_option_token (const char *token,
17952 size_t length,
17953 const struct aarch64_flag_desc *flag,
17954 const char *option_name)
17956 for (; flag->name != NULL; flag++)
17958 if (length == strlen (flag->name)
17959 && !strncmp (flag->name, token, length))
17960 return flag->flag;
17963 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17964 return 0;
17967 /* Parse OPTION which is a dot-separated list of flags to enable.
17968 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17969 default state we inherit from the CPU tuning structures. OPTION_NAME
17970 gives the top-level option we are parsing in the -moverride string,
17971 for use in error messages. */
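/* For instance (purely illustrative), "-moverride=fuse=adrp+add" would add
   ADRP/ADD fusion on top of whatever the selected tuning already enables,
   while "fuse=none.adrp+add" would enable ADRP/ADD fusion alone.  */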
17973 static unsigned int
17974 aarch64_parse_boolean_options (const char *option,
17975 const struct aarch64_flag_desc *flags,
17976 unsigned int initial_state,
17977 const char *option_name)
17979 const char separator = '.';
17980 const char* specs = option;
17981 const char* ntoken = option;
17982 unsigned int found_flags = initial_state;
17984 while ((ntoken = strchr (specs, separator)))
17986 size_t token_length = ntoken - specs;
17987 unsigned token_ops = aarch64_parse_one_option_token (specs,
17988 token_length,
17989 flags,
17990 option_name);
17991 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17992 in the token stream, reset the supported operations. So:
17994 adrp+add.cmp+branch.none.adrp+add
17996 would have the result of turning on only adrp+add fusion. */
17997 if (!token_ops)
17998 found_flags = 0;
18000 found_flags |= token_ops;
18001 specs = ++ntoken;
18004 /* The string ended with a separator (or was empty); report an error. */
18005 if (!(*specs))
18007 error ("%qs string ill-formed", option_name);
18008 return 0;
18011 /* We still have one more token to parse. */
18012 size_t token_length = strlen (specs);
18013 unsigned token_ops = aarch64_parse_one_option_token (specs,
18014 token_length,
18015 flags,
18016 option_name);
18017 if (!token_ops)
18018 found_flags = 0;
18020 found_flags |= token_ops;
18021 return found_flags;
18024 /* Support for overriding instruction fusion. */
18026 static void
18027 aarch64_parse_fuse_string (const char *fuse_string,
18028 struct tune_params *tune)
18030 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
18031 aarch64_fusible_pairs,
18032 tune->fusible_ops,
18033 "fuse=");
18036 /* Support for overriding other tuning flags. */
18038 static void
18039 aarch64_parse_tune_string (const char *tune_string,
18040 struct tune_params *tune)
18042 tune->extra_tuning_flags
18043 = aarch64_parse_boolean_options (tune_string,
18044 aarch64_tuning_flags,
18045 tune->extra_tuning_flags,
18046 "tune=");
18049 /* Parse the sve_width -moverride string in TUNE_STRING.
18050 Accept the valid SVE vector widths allowed by
18051 aarch64_sve_vector_bits_enum and use it to override sve_width
18052 in TUNE. */
18054 static void
18055 aarch64_parse_sve_width_string (const char *tune_string,
18056 struct tune_params *tune)
18058 int width = -1;
18060 int n = sscanf (tune_string, "%d", &width);
18061 if (n == EOF)
18063 error ("invalid format for %<sve_width%>");
18064 return;
18066 switch (width)
18068 case SVE_128:
18069 case SVE_256:
18070 case SVE_512:
18071 case SVE_1024:
18072 case SVE_2048:
18073 break;
18074 default:
18075 error ("invalid %<sve_width%> value: %d", width);
18077 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
18080 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
18081 we understand. If it is, extract the option string and hand it off to
18082 the appropriate function. */
18084 void
18085 aarch64_parse_one_override_token (const char* token,
18086 size_t length,
18087 struct tune_params *tune)
18089 const struct aarch64_tuning_override_function *fn
18090 = aarch64_tuning_override_functions;
18092 const char *option_part = strchr (token, '=');
18093 if (!option_part)
18095 error ("tuning string missing in option (%s)", token);
18096 return;
18099 /* Get the length of the option name. */
18100 length = option_part - token;
18101 /* Skip the '=' to get to the option string. */
18102 option_part++;
18104 for (; fn->name != NULL; fn++)
18106 if (!strncmp (fn->name, token, length))
18108 fn->parse_override (option_part, tune);
18109 return;
18113 error ("unknown tuning option (%s)", token);
18114 return;
18117 /* Set the default TLS size and clamp it to the maximum allowed by the code model in OPTS. */
18119 static void
18120 initialize_aarch64_tls_size (struct gcc_options *opts)
18122 if (aarch64_tls_size == 0)
18123 aarch64_tls_size = 24;
18125 switch (opts->x_aarch64_cmodel_var)
18127 case AARCH64_CMODEL_TINY:
18128 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
18129 needs two instructions to address, so we clamp the size to 24. */
18130 if (aarch64_tls_size > 24)
18131 aarch64_tls_size = 24;
18132 break;
18133 case AARCH64_CMODEL_SMALL:
18134 /* The maximum TLS size allowed under small is 4G. */
18135 if (aarch64_tls_size > 32)
18136 aarch64_tls_size = 32;
18137 break;
18138 case AARCH64_CMODEL_LARGE:
18139 /* The maximum TLS size allowed under large is 16E.
18140 FIXME: 16E would need a 64-bit offset, but we only support a 48-bit offset for now. */
18141 if (aarch64_tls_size > 48)
18142 aarch64_tls_size = 48;
18143 break;
18144 default:
18145 gcc_unreachable ();
18148 return;
18151 /* Return the CPU corresponding to the enum CPU. */
18153 static const struct processor *
18154 aarch64_get_tune_cpu (enum aarch64_processor cpu)
18156 gcc_assert (cpu != aarch64_none);
18158 return &all_cores[cpu];
18161 /* Return the architecture corresponding to the enum ARCH. */
18163 static const struct processor *
18164 aarch64_get_arch (enum aarch64_arch arch)
18166 gcc_assert (arch != aarch64_no_arch);
18168 return &all_architectures[arch];
18171 /* Parse STRING looking for options in the format:
18172 string :: option:string
18173 option :: name=substring
18174 name :: {a-z}
18175 substring :: defined by option. */
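/* A complete (illustrative) -moverride string might therefore look like
   "fuse=adrp+add:sve_width=256": it is split at ':' into "fuse=adrp+add"
   and "sve_width=256", each of which is handled by
   aarch64_parse_one_override_token.  */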
18177 static void
18178 aarch64_parse_override_string (const char* input_string,
18179 struct tune_params* tune)
18181 const char separator = ':';
18182 size_t string_length = strlen (input_string) + 1;
18183 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
18184 char *string = string_root;
18185 strncpy (string, input_string, string_length);
18186 string[string_length - 1] = '\0';
18188 char* ntoken = string;
18190 while ((ntoken = strchr (string, separator)))
18192 size_t token_length = ntoken - string;
18193 /* Make this substring look like a string. */
18194 *ntoken = '\0';
18195 aarch64_parse_one_override_token (string, token_length, tune);
18196 string = ++ntoken;
18199 /* One last option to parse. */
18200 aarch64_parse_one_override_token (string, strlen (string), tune);
18201 free (string_root);
18204 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18205 are best for a generic target with the currently-enabled architecture
18206 extensions. */
18207 static void
18208 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
18210 /* Neoverse V1 is the only core that is known to benefit from
18211 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18212 point enabling it for SVE2 and above. */
18213 if (TARGET_SVE2)
18214 current_tune.extra_tuning_flags
18215 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
18218 static void
18219 aarch64_override_options_after_change_1 (struct gcc_options *opts)
18221 /* PR 70044: We have to be careful about being called multiple times for the
18222 same function. This means all changes should be repeatable. */
18224 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18225 Disable the frame pointer flag so the mid-end will not use a frame
18226 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18227 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18228 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18229 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
18230 if (opts->x_flag_omit_frame_pointer == 0)
18231 opts->x_flag_omit_frame_pointer = 2;
18233 /* If not optimizing for size, set the default
18234 alignment to what the target wants. */
18235 if (!opts->x_optimize_size)
18237 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
18238 opts->x_str_align_loops = aarch64_tune_params.loop_align;
18239 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
18240 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
18241 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
18242 opts->x_str_align_functions = aarch64_tune_params.function_align;
18245 /* We default to no pc-relative literal loads. */
18247 aarch64_pcrelative_literal_loads = false;
18249 /* If -mpc-relative-literal-loads is set on the command line, this
18250 implies that the user asked for PC relative literal loads. */
18251 if (opts->x_pcrelative_literal_loads == 1)
18252 aarch64_pcrelative_literal_loads = true;
18254 /* In the tiny memory model it makes no sense to disallow PC relative
18255 literal pool loads. */
18256 if (aarch64_cmodel == AARCH64_CMODEL_TINY
18257 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
18258 aarch64_pcrelative_literal_loads = true;
18260 /* When enabling the lower precision Newton series for the square root, also
18261 enable it for the reciprocal square root, since the latter is an
18262 intermediary step for the former. */
18263 if (flag_mlow_precision_sqrt)
18264 flag_mrecip_low_precision_sqrt = true;
18267 /* 'Unpack' the internal tuning structs and update the options
18268 in OPTS. The caller must have set up selected_tune and selected_arch
18269 as all the other target-specific codegen decisions are
18270 derived from them. */
18272 void
18273 aarch64_override_options_internal (struct gcc_options *opts)
18275 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
18276 aarch64_tune_flags = tune->flags;
18277 aarch64_tune = tune->sched_core;
18278 /* Make a copy of the tuning parameters attached to the core, which
18279 we may later overwrite. */
18280 aarch64_tune_params = *(tune->tune);
18281 if (tune->tune == &generic_tunings)
18282 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
18284 if (opts->x_aarch64_override_tune_string)
18285 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
18286 &aarch64_tune_params);
18288 if (opts->x_aarch64_ldp_policy_param)
18289 aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;
18291 if (opts->x_aarch64_stp_policy_param)
18292 aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;
18294 /* This target defaults to strict volatile bitfields. */
18295 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
18296 opts->x_flag_strict_volatile_bitfields = 1;
18298 if (aarch64_stack_protector_guard == SSP_GLOBAL
18299 && opts->x_aarch64_stack_protector_guard_offset_str)
18301 error ("incompatible options %<-mstack-protector-guard=global%> and "
18302 "%<-mstack-protector-guard-offset=%s%>",
18303 aarch64_stack_protector_guard_offset_str);
18306 if (aarch64_stack_protector_guard == SSP_SYSREG
18307 && !(opts->x_aarch64_stack_protector_guard_offset_str
18308 && opts->x_aarch64_stack_protector_guard_reg_str))
18310 error ("both %<-mstack-protector-guard-offset%> and "
18311 "%<-mstack-protector-guard-reg%> must be used "
18312 "with %<-mstack-protector-guard=sysreg%>");
18315 if (opts->x_aarch64_stack_protector_guard_reg_str)
18317 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
18318 error ("specify a system register with a small string length");
18321 if (opts->x_aarch64_stack_protector_guard_offset_str)
18323 char *end;
18324 const char *str = aarch64_stack_protector_guard_offset_str;
18325 errno = 0;
18326 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
18327 if (!*str || *end || errno)
18328 error ("%qs is not a valid offset in %qs", str,
18329 "-mstack-protector-guard-offset=");
18330 aarch64_stack_protector_guard_offset = offs;
18333 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
18334 && !fixed_regs[R18_REGNUM])
18335 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18337 if ((opts->x_aarch64_isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
18338 && !(opts->x_aarch64_isa_flags & AARCH64_FL_SME))
18340 if (opts->x_aarch64_isa_flags & AARCH64_FL_SM_ON)
18341 error ("streaming functions require the ISA extension %qs", "sme");
18342 else
18343 error ("functions with SME state require the ISA extension %qs",
18344 "sme");
18345 inform (input_location, "you can enable %qs using the command-line"
18346 " option %<-march%>, or by using the %<target%>"
18347 " attribute or pragma", "sme");
18348 opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
18349 auto new_flags = (opts->x_aarch64_asm_isa_flags
18350 | feature_deps::SME ().enable);
18351 aarch64_set_asm_isa_flags (opts, new_flags);
18354 initialize_aarch64_code_model (opts);
18355 initialize_aarch64_tls_size (opts);
18356 aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;
18358 int queue_depth = 0;
18359 switch (aarch64_tune_params.autoprefetcher_model)
18361 case tune_params::AUTOPREFETCHER_OFF:
18362 queue_depth = -1;
18363 break;
18364 case tune_params::AUTOPREFETCHER_WEAK:
18365 queue_depth = 0;
18366 break;
18367 case tune_params::AUTOPREFETCHER_STRONG:
18368 queue_depth = max_insn_queue_index + 1;
18369 break;
18370 default:
18371 gcc_unreachable ();
18374 /* We don't mind passing in global_options_set here as we don't use
18375 the *options_set structs anyway. */
18376 SET_OPTION_IF_UNSET (opts, &global_options_set,
18377 param_sched_autopref_queue_depth, queue_depth);
18379 /* Set up parameters to be used in prefetching algorithm. Do not
18380 override the defaults unless we are tuning for a core we have
18381 researched values for. */
18382 if (aarch64_tune_params.prefetch->num_slots > 0)
18383 SET_OPTION_IF_UNSET (opts, &global_options_set,
18384 param_simultaneous_prefetches,
18385 aarch64_tune_params.prefetch->num_slots);
18386 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
18387 SET_OPTION_IF_UNSET (opts, &global_options_set,
18388 param_l1_cache_size,
18389 aarch64_tune_params.prefetch->l1_cache_size);
18390 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18391 SET_OPTION_IF_UNSET (opts, &global_options_set,
18392 param_l1_cache_line_size,
18393 aarch64_tune_params.prefetch->l1_cache_line_size);
18395 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18397 SET_OPTION_IF_UNSET (opts, &global_options_set,
18398 param_destruct_interfere_size,
18399 aarch64_tune_params.prefetch->l1_cache_line_size);
18400 SET_OPTION_IF_UNSET (opts, &global_options_set,
18401 param_construct_interfere_size,
18402 aarch64_tune_params.prefetch->l1_cache_line_size);
18404 else
18406 /* For a generic AArch64 target, cover the current range of cache line
18407 sizes. */
18408 SET_OPTION_IF_UNSET (opts, &global_options_set,
18409 param_destruct_interfere_size,
18410 256);
18411 SET_OPTION_IF_UNSET (opts, &global_options_set,
18412 param_construct_interfere_size,
18413 64);
18416 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
18417 SET_OPTION_IF_UNSET (opts, &global_options_set,
18418 param_l2_cache_size,
18419 aarch64_tune_params.prefetch->l2_cache_size);
18420 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
18421 SET_OPTION_IF_UNSET (opts, &global_options_set,
18422 param_prefetch_dynamic_strides, 0);
18423 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
18424 SET_OPTION_IF_UNSET (opts, &global_options_set,
18425 param_prefetch_minimum_stride,
18426 aarch64_tune_params.prefetch->minimum_stride);
18428 /* Use the alternative scheduling-pressure algorithm by default. */
18429 SET_OPTION_IF_UNSET (opts, &global_options_set,
18430 param_sched_pressure_algorithm,
18431 SCHED_PRESSURE_MODEL);
18433 /* Validate the guard size. */
18434 int guard_size = param_stack_clash_protection_guard_size;
18436 if (guard_size != 12 && guard_size != 16)
18437 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18438 "size. Given value %d (%llu KB) is out of range",
18439 guard_size, (1ULL << guard_size) / 1024ULL);
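/* For reference, the guard size is a power-of-two exponent, so 12 means
   1 << 12 = 4096 bytes (4 KB) and 16 means 1 << 16 = 65536 bytes (64 KB),
   matching the values printed in the error above.  */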
18441 /* Enforce that the probing interval is the same as the guard size so the
18442 mid-end does the right thing. */
18443 SET_OPTION_IF_UNSET (opts, &global_options_set,
18444 param_stack_clash_protection_probe_interval,
18445 guard_size);
18447 /* The maybe_set calls won't update the value if the user has explicitly set
18448 one. This means we need to validate that the probing interval and guard size
18449 are equal. */
18450 int probe_interval
18451 = param_stack_clash_protection_probe_interval;
18452 if (guard_size != probe_interval)
18453 error ("stack clash guard size %<%d%> must be equal to probing interval "
18454 "%<%d%>", guard_size, probe_interval);
18456 /* Enable software prefetching at the specified optimization level for
18457 CPUs that have prefetch tuning data. Lower the optimization level threshold by 1
18458 when profiling is enabled. */
18459 if (opts->x_flag_prefetch_loop_arrays < 0
18460 && !opts->x_optimize_size
18461 && aarch64_tune_params.prefetch->default_opt_level >= 0
18462 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18463 opts->x_flag_prefetch_loop_arrays = 1;
18465 /* Avoid loop-dependent FMA chains. */
18466 if (aarch64_tune_params.extra_tuning_flags
18467 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
18468 SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
18469 512);
18471 /* Consider fully pipelined FMA in reassociation. */
18472 if (aarch64_tune_params.extra_tuning_flags
18473 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
18474 SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
18477 aarch64_override_options_after_change_1 (opts);
18480 /* Print a hint with a suggestion for a core or architecture name that
18481 most closely resembles what the user passed in STR. ARCH is true if
18482 the user is asking for an architecture name. ARCH is false if the user
18483 is asking for a core name. */
18485 static void
18486 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
18488 auto_vec<const char *> candidates;
18489 const struct processor *entry = arch ? all_architectures : all_cores;
18490 for (; entry->name != NULL; entry++)
18491 candidates.safe_push (entry->name);
18493 #ifdef HAVE_LOCAL_CPU_DETECT
18494 /* Also add "native" as a possible value. */
18495 if (arch)
18496 candidates.safe_push ("native");
18497 #endif
18499 char *s;
18500 const char *hint = candidates_list_and_hint (str, s, candidates);
18501 if (hint)
18502 inform (input_location, "valid arguments are: %s;"
18503 " did you mean %qs?", s, hint);
18504 else
18505 inform (input_location, "valid arguments are: %s", s);
18507 XDELETEVEC (s);
18510 /* Print a hint with a suggestion for a core name that most closely resembles
18511 what the user passed in STR. */
18513 inline static void
18514 aarch64_print_hint_for_core (const char *str)
18516 aarch64_print_hint_for_core_or_arch (str, false);
18519 /* Print a hint with a suggestion for an architecture name that most closely
18520 resembles what the user passed in STR. */
18522 inline static void
18523 aarch64_print_hint_for_arch (const char *str)
18525 aarch64_print_hint_for_core_or_arch (str, true);
18529 /* Print a hint with a suggestion for an extension name
18530 that most closely resembles what the user passed in STR. */
18532 void
18533 aarch64_print_hint_for_extensions (const std::string &str)
18535 auto_vec<const char *> candidates;
18536 aarch64_get_all_extension_candidates (&candidates);
18537 char *s;
18538 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18539 if (hint)
18540 inform (input_location, "valid arguments are: %s;"
18541 " did you mean %qs?", s, hint);
18542 else
18543 inform (input_location, "valid arguments are: %s", s);
18545 XDELETEVEC (s);
18548 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18549 specified in STR and throw errors if appropriate. Put the results if
18550 they are valid in RES and ISA_FLAGS. Return whether the option is
18551 valid. */
18553 static bool
18554 aarch64_validate_mcpu (const char *str, const struct processor **res,
18555 aarch64_feature_flags *isa_flags)
18557 std::string invalid_extension;
18558 enum aarch_parse_opt_result parse_res
18559 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18561 if (parse_res == AARCH_PARSE_OK)
18562 return true;
18564 switch (parse_res)
18566 case AARCH_PARSE_MISSING_ARG:
18567 error ("missing cpu name in %<-mcpu=%s%>", str);
18568 break;
18569 case AARCH_PARSE_INVALID_ARG:
18570 error ("unknown value %qs for %<-mcpu%>", str);
18571 aarch64_print_hint_for_core (str);
18572 /* A common user error is confusing -march and -mcpu.
18573 If the -mcpu string matches a known architecture then suggest
18574 -march=. */
18575 parse_res = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18576 if (parse_res == AARCH_PARSE_OK)
18577 inform (input_location, "did you mean %<-march=%s%>?", str);
18578 break;
18579 case AARCH_PARSE_INVALID_FEATURE:
18580 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18581 invalid_extension.c_str (), str);
18582 aarch64_print_hint_for_extensions (invalid_extension);
18583 break;
18584 default:
18585 gcc_unreachable ();
18588 return false;
18591 /* Straight line speculation indicators. */
18592 enum aarch64_sls_hardening_type
18594 SLS_NONE = 0,
18595 SLS_RETBR = 1,
18596 SLS_BLR = 2,
18597 SLS_ALL = 3,
18599 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18601 /* Return whether we should mitigate Straight Line Speculation for the RET
18602 and BR instructions. */
18603 bool
18604 aarch64_harden_sls_retbr_p (void)
18606 return aarch64_sls_hardening & SLS_RETBR;
18609 /* Return whether we should mitigate Straight Line Speculation for the BLR
18610 instruction. */
18611 bool
18612 aarch64_harden_sls_blr_p (void)
18614 return aarch64_sls_hardening & SLS_BLR;
18617 /* For now we only allow setting these options globally; in the future we may
18618 allow setting them per function. */
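/* Illustrative examples of accepted values: "-mharden-sls=none",
   "-mharden-sls=all", or a comma-separated list of individual mitigations
   such as "-mharden-sls=retbr,blr" (which enables the same set as "all").
   "none" and "all" may not be combined with other values.  */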
18619 static void
18620 aarch64_validate_sls_mitigation (const char *const_str)
18622 char *token_save = NULL;
18623 char *str = NULL;
18625 if (strcmp (const_str, "none") == 0)
18627 aarch64_sls_hardening = SLS_NONE;
18628 return;
18630 if (strcmp (const_str, "all") == 0)
18632 aarch64_sls_hardening = SLS_ALL;
18633 return;
18636 char *str_root = xstrdup (const_str);
18637 str = strtok_r (str_root, ",", &token_save);
18638 if (!str)
18639 error ("invalid argument given to %<-mharden-sls=%>");
18641 int temp = SLS_NONE;
18642 while (str)
18644 if (strcmp (str, "blr") == 0)
18645 temp |= SLS_BLR;
18646 else if (strcmp (str, "retbr") == 0)
18647 temp |= SLS_RETBR;
18648 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18650 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18651 break;
18653 else
18655 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18656 break;
18658 str = strtok_r (NULL, ",", &token_save);
18660 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18661 free (str_root);
18664 /* Validate a command-line -march option. Parse the arch and extensions
18665 (if any) specified in STR and throw errors if appropriate. Put the
18666 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18667 option is valid. */
18669 static bool
18670 aarch64_validate_march (const char *str, const struct processor **res,
18671 aarch64_feature_flags *isa_flags)
18673 std::string invalid_extension;
18674 enum aarch_parse_opt_result parse_res
18675 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18677 if (parse_res == AARCH_PARSE_OK)
18678 return true;
18680 switch (parse_res)
18682 case AARCH_PARSE_MISSING_ARG:
18683 error ("missing arch name in %<-march=%s%>", str);
18684 break;
18685 case AARCH_PARSE_INVALID_ARG:
18686 error ("unknown value %qs for %<-march%>", str);
18687 aarch64_print_hint_for_arch (str);
18688 /* A common user error is confusing -march and -mcpu.
18689 If the -march string matches a known CPU suggest -mcpu. */
18690 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18691 if (parse_res == AARCH_PARSE_OK)
18692 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18693 break;
18694 case AARCH_PARSE_INVALID_FEATURE:
18695 error ("invalid feature modifier %qs in %<-march=%s%>",
18696 invalid_extension.c_str (), str);
18697 aarch64_print_hint_for_extensions (invalid_extension);
18698 break;
18699 default:
18700 gcc_unreachable ();
18703 return false;
18706 /* Validate a command-line -mtune option. Parse the cpu
18707 specified in STR and throw errors if appropriate. Put the
18708 result, if it is valid, in RES. Return whether the option is
18709 valid. */
18711 static bool
18712 aarch64_validate_mtune (const char *str, const struct processor **res)
18714 enum aarch_parse_opt_result parse_res
18715 = aarch64_parse_tune (str, res);
18717 if (parse_res == AARCH_PARSE_OK)
18718 return true;
18720 switch (parse_res)
18722 case AARCH_PARSE_MISSING_ARG:
18723 error ("missing cpu name in %<-mtune=%s%>", str);
18724 break;
18725 case AARCH_PARSE_INVALID_ARG:
18726 error ("unknown value %qs for %<-mtune%>", str);
18727 aarch64_print_hint_for_core (str);
18728 break;
18729 default:
18730 gcc_unreachable ();
18732 return false;
18735 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18737 static poly_uint16
18738 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18740 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18741 on big-endian targets, so we would need to forbid subregs that convert
18742 from one to the other. By default a reinterpret sequence would then
18743 involve a store to memory in one mode and a load back in the other.
18744 Even if we optimize that sequence using reverse instructions,
18745 it would still be a significant potential overhead.
18747 For now, it seems better to generate length-agnostic code for that
18748 case instead. */
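/* Otherwise the conversion is a straightforward division: for example,
   -msve-vector-bits=256 gives a VG of 256 / 64 = 4 and
   -msve-vector-bits=2048 gives 2048 / 64 = 32.  */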
18749 if (value == SVE_SCALABLE
18750 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18751 return poly_uint16 (2, 2);
18752 else
18753 return (int) value / 64;
18756 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18757 aarch64_isa_flags accordingly. */
18759 void
18760 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18762 aarch64_set_asm_isa_flags (&global_options, flags);
18765 static void
18766 aarch64_handle_no_branch_protection (void)
18768 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18769 aarch_enable_bti = 0;
18772 static void
18773 aarch64_handle_standard_branch_protection (void)
18775 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18776 aarch64_ra_sign_key = AARCH64_KEY_A;
18777 aarch_enable_bti = 1;
18780 static void
18781 aarch64_handle_pac_ret_protection (void)
18783 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18784 aarch64_ra_sign_key = AARCH64_KEY_A;
18787 static void
18788 aarch64_handle_pac_ret_leaf (void)
18790 aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
18793 static void
18794 aarch64_handle_pac_ret_b_key (void)
18796 aarch64_ra_sign_key = AARCH64_KEY_B;
18799 static void
18800 aarch64_handle_bti_protection (void)
18802 aarch_enable_bti = 1;
18805 static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
18806 { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
18807 { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
18808 { NULL, false, NULL, NULL, 0 }
18811 static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
18813 { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
18814 { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
18815 { "pac-ret", false, aarch64_handle_pac_ret_protection,
18816 aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
18817 { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
18818 { NULL, false, NULL, NULL, 0 }
18821 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18822 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18823 tuning structs. In particular it must set selected_tune and
18824 aarch64_asm_isa_flags that define the available ISA features and tuning
18825 decisions. It must also set selected_arch as this will be used to
18826 output the .arch asm tags for each function. */
18828 static void
18829 aarch64_override_options (void)
18831 aarch64_feature_flags cpu_isa = 0;
18832 aarch64_feature_flags arch_isa = 0;
18833 aarch64_set_asm_isa_flags (0);
18835 const struct processor *cpu = NULL;
18836 const struct processor *arch = NULL;
18837 const struct processor *tune = NULL;
18839 if (aarch64_harden_sls_string)
18840 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18842 if (aarch64_branch_protection_string)
18843 aarch_validate_mbranch_protection (aarch64_branch_protect_types,
18844 aarch64_branch_protection_string,
18845 "-mbranch-protection=");
18847 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18848 If either of -march or -mtune is given, they override their
18849 respective component of -mcpu. */
18850 if (aarch64_cpu_string)
18851 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18853 if (aarch64_arch_string)
18854 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18856 if (aarch64_tune_string)
18857 aarch64_validate_mtune (aarch64_tune_string, &tune);
18859 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18860 SUBTARGET_OVERRIDE_OPTIONS;
18861 #endif
18863 auto isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
18864 if (cpu && arch)
18866 /* If both -mcpu and -march are specified, warn if they are not
18867 feature compatible. Feature compatible means that the inclusion of the
18868 cpu features would not end up disabling an architecture feature. In
18869 other words, the cpu features need to be a superset of the arch
18870 features, and if so we prefer the -march ISA flags. */
18871 auto full_arch_flags = arch->flags | arch_isa;
18872 auto full_cpu_flags = cpu->flags | cpu_isa;
18873 if (~full_cpu_flags & full_arch_flags)
18875 std::string ext_diff
18876 = aarch64_get_extension_string_for_isa_flags (full_arch_flags,
18877 full_cpu_flags);
18878 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18879 "and resulted in options %<%s%> being added",
18880 aarch64_cpu_string,
18881 aarch64_arch_string,
18882 ext_diff.c_str ());
18885 selected_arch = arch->arch;
18886 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18888 else if (cpu)
18890 selected_arch = cpu->arch;
18891 aarch64_set_asm_isa_flags (cpu_isa | isa_mode);
18893 else if (arch)
18895 cpu = &all_cores[arch->ident];
18896 selected_arch = arch->arch;
18897 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18899 else
18901 /* No -mcpu or -march specified, so use the default CPU. */
18902 cpu = &all_cores[TARGET_CPU_DEFAULT];
18903 selected_arch = cpu->arch;
18904 aarch64_set_asm_isa_flags (cpu->flags | isa_mode);
18907 selected_tune = tune ? tune->ident : cpu->ident;
18909 if (aarch_enable_bti == 2)
18911 #ifdef TARGET_ENABLE_BTI
18912 aarch_enable_bti = 1;
18913 #else
18914 aarch_enable_bti = 0;
18915 #endif
18918 /* Return address signing is currently not supported for ILP32 targets. For
18919 LP64 targets use the configured option in the absence of a command-line
18920 option for -mbranch-protection. */
18921 if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
18923 #ifdef TARGET_ENABLE_PAC_RET
18924 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18925 #else
18926 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18927 #endif
18930 #ifndef HAVE_AS_MABI_OPTION
18931 /* The compiler may have been configured with 2.23.* binutils, which does
18932 not have support for ILP32. */
18933 if (TARGET_ILP32)
18934 error ("assembler does not support %<-mabi=ilp32%>");
18935 #endif
18937 /* Convert -msve-vector-bits to a VG count. */
18938 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18940 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
18941 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18943 /* The pass to insert speculation tracking runs before
18944 shrink-wrapping and the latter does not know how to update the
18945 tracking status. So disable it in this case. */
18946 if (aarch64_track_speculation)
18947 flag_shrink_wrap = 0;
18949 aarch64_override_options_internal (&global_options);
18951 /* Save these options as the default ones in case we push and pop them later
18952 while processing functions with potential target attributes. */
18953 target_option_default_node = target_option_current_node
18954 = build_target_option_node (&global_options, &global_options_set);
18957 /* Implement targetm.override_options_after_change. */
18959 static void
18960 aarch64_override_options_after_change (void)
18962 aarch64_override_options_after_change_1 (&global_options);
18965 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18966 static char *
18967 aarch64_offload_options (void)
18969 if (TARGET_ILP32)
18970 return xstrdup ("-foffload-abi=ilp32");
18971 else
18972 return xstrdup ("-foffload-abi=lp64");
18975 static struct machine_function *
18976 aarch64_init_machine_status (void)
18978 struct machine_function *machine;
18979 machine = ggc_cleared_alloc<machine_function> ();
18980 return machine;
18983 void
18984 aarch64_init_expanders (void)
18986 init_machine_status = aarch64_init_machine_status;
18989 /* Initialize aarch64_cmodel from the code model given in OPTS, mapping it to a PIC variant where appropriate. */
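/* For example (illustrative), "-mcmodel=small -fPIC" maps to
   AARCH64_CMODEL_SMALL_PIC (or AARCH64_CMODEL_SMALL_SPIC for -fpic, when
   the assembler supports the small PIC relocations), while
   "-mcmodel=large" combined with -fpic is rejected with a "sorry"
   diagnostic.  */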
18990 static void
18991 initialize_aarch64_code_model (struct gcc_options *opts)
18993 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18994 switch (opts->x_aarch64_cmodel_var)
18996 case AARCH64_CMODEL_TINY:
18997 if (opts->x_flag_pic)
18998 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18999 break;
19000 case AARCH64_CMODEL_SMALL:
19001 if (opts->x_flag_pic)
19003 #ifdef HAVE_AS_SMALL_PIC_RELOCS
19004 aarch64_cmodel = (flag_pic == 2
19005 ? AARCH64_CMODEL_SMALL_PIC
19006 : AARCH64_CMODEL_SMALL_SPIC);
19007 #else
19008 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
19009 #endif
19011 break;
19012 case AARCH64_CMODEL_LARGE:
19013 if (opts->x_flag_pic)
19014 sorry ("code model %qs with %<-f%s%>", "large",
19015 opts->x_flag_pic > 1 ? "PIC" : "pic");
19016 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
19017 sorry ("code model %qs not supported in ilp32 mode", "large");
19018 break;
19019 case AARCH64_CMODEL_TINY_PIC:
19020 case AARCH64_CMODEL_SMALL_PIC:
19021 case AARCH64_CMODEL_SMALL_SPIC:
19022 gcc_unreachable ();
19026 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
19027 using the information saved in PTR. */
19029 static void
19030 aarch64_option_restore (struct gcc_options *opts,
19031 struct gcc_options * /* opts_set */,
19032 struct cl_target_option * /* ptr */)
19034 aarch64_override_options_internal (opts);
19037 /* Implement TARGET_OPTION_PRINT. */
19039 static void
19040 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
19042 const struct processor *cpu
19043 = aarch64_get_tune_cpu (ptr->x_selected_tune);
19044 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
19045 std::string extension
19046 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
19047 arch->flags);
19049 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
19050 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
19051 arch->name, extension.c_str ());
19054 static GTY(()) tree aarch64_previous_fndecl;
19056 void
19057 aarch64_reset_previous_fndecl (void)
19059 aarch64_previous_fndecl = NULL;
19062 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19063 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19064 make sure optab availability predicates are recomputed when necessary. */
19066 void
19067 aarch64_save_restore_target_globals (tree new_tree)
19069 if (TREE_TARGET_GLOBALS (new_tree))
19070 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
19071 else if (new_tree == target_option_default_node)
19072 restore_target_globals (&default_target_globals);
19073 else
19074 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
19077 /* Return the target_option_node for FNDECL, or the current options
19078 if FNDECL is null. */
19080 static tree
19081 aarch64_fndecl_options (tree fndecl)
19083 if (!fndecl)
19084 return target_option_current_node;
19086 if (tree options = DECL_FUNCTION_SPECIFIC_TARGET (fndecl))
19087 return options;
19089 return target_option_default_node;
19092 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
19093 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
19094 of the function, if such exists. This function may be called multiple
19095 times on a single function so use aarch64_previous_fndecl to avoid
19096 setting up identical state. */
19098 static void
19099 aarch64_set_current_function (tree fndecl)
19101 tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
19102 tree new_tree = aarch64_fndecl_options (fndecl);
19104 auto new_isa_mode = (fndecl
19105 ? aarch64_fndecl_isa_mode (fndecl)
19106 : AARCH64_FL_DEFAULT_ISA_MODE);
19107 auto isa_flags = TREE_TARGET_OPTION (new_tree)->x_aarch64_isa_flags;
19109 static bool reported_zt0_p;
19110 if (!reported_zt0_p
19111 && !(isa_flags & AARCH64_FL_SME2)
19112 && fndecl
19113 && aarch64_fndecl_has_state (fndecl, "zt0"))
19115 error ("functions with %qs state require the ISA extension %qs",
19116 "zt0", "sme2");
19117 inform (input_location, "you can enable %qs using the command-line"
19118 " option %<-march%>, or by using the %<target%>"
19119 " attribute or pragma", "sme2");
19120 reported_zt0_p = true;
19123 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
19124 the default have been handled by aarch64_save_restore_target_globals from
19125 aarch64_pragma_target_parse. */
19126 if (old_tree == new_tree
19127 && (!fndecl || aarch64_previous_fndecl)
19128 && (isa_flags & AARCH64_FL_ISA_MODES) == new_isa_mode)
19130 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19131 return;
19134 aarch64_previous_fndecl = fndecl;
19136 /* First set the target options. */
19137 cl_target_option_restore (&global_options, &global_options_set,
19138 TREE_TARGET_OPTION (new_tree));
19140 /* The ISA mode can vary based on function type attributes and
19141 function declaration attributes. Make sure that the target
19142 options correctly reflect these attributes. */
19143 if ((isa_flags & AARCH64_FL_ISA_MODES) != new_isa_mode)
19145 auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
19146 aarch64_set_asm_isa_flags (base_flags | new_isa_mode);
19148 aarch64_override_options_internal (&global_options);
19149 new_tree = build_target_option_node (&global_options,
19150 &global_options_set);
19151 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;
19153 tree new_optimize = build_optimization_node (&global_options,
19154 &global_options_set);
19155 if (new_optimize != optimization_default_node)
19156 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19159 aarch64_save_restore_target_globals (new_tree);
19161 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19164 /* Enum describing the various ways we can handle attributes.
19165 In many cases we can reuse the generic option handling machinery. */
19167 enum aarch64_attr_opt_type
19169 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
19170 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
19171 aarch64_attr_enum, /* Attribute sets an enum variable. */
19172 aarch64_attr_custom /* Attribute requires a custom handling function. */
19175 /* All the information needed to handle a target attribute.
19176 NAME is the name of the attribute.
19177 ATTR_TYPE specifies the type of behavior of the attribute as described
19178 in the definition of enum aarch64_attr_opt_type.
19179 ALLOW_NEG is true if the attribute supports a "no-" form.
19180 HANDLER is the function that takes the attribute string as an argument.
19181 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
19182 OPT_NUM is the enum specifying the option that the attribute modifies.
19183 This is needed for attributes that mirror the behavior of a command-line
19184 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
19185 aarch64_attr_enum. */
19187 struct aarch64_attribute_info
19189 const char *name;
19190 enum aarch64_attr_opt_type attr_type;
19191 bool allow_neg;
19192 bool (*handler) (const char *);
19193 enum opt_code opt_num;
19196 /* Handle the ARCH_STR argument to the arch= target attribute. */
19198 static bool
19199 aarch64_handle_attr_arch (const char *str)
19201 const struct processor *tmp_arch = NULL;
19202 std::string invalid_extension;
19203 aarch64_feature_flags tmp_flags;
19204 enum aarch_parse_opt_result parse_res
19205 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
19207 if (parse_res == AARCH_PARSE_OK)
19209 gcc_assert (tmp_arch);
19210 selected_arch = tmp_arch->arch;
19211 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19212 return true;
19215 switch (parse_res)
19217 case AARCH_PARSE_MISSING_ARG:
19218 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19219 break;
19220 case AARCH_PARSE_INVALID_ARG:
19221 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
19222 aarch64_print_hint_for_arch (str);
19223 break;
19224 case AARCH_PARSE_INVALID_FEATURE:
19225 error ("invalid feature modifier %s of value %qs in "
19226 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19227 aarch64_print_hint_for_extensions (invalid_extension);
19228 break;
19229 default:
19230 gcc_unreachable ();
19233 return false;
19236 /* Handle the argument CPU_STR to the cpu= target attribute. */
19238 static bool
19239 aarch64_handle_attr_cpu (const char *str)
19241 const struct processor *tmp_cpu = NULL;
19242 std::string invalid_extension;
19243 aarch64_feature_flags tmp_flags;
19244 enum aarch_parse_opt_result parse_res
19245 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
19247 if (parse_res == AARCH_PARSE_OK)
19249 gcc_assert (tmp_cpu);
19250 selected_tune = tmp_cpu->ident;
19251 selected_arch = tmp_cpu->arch;
19252 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19253 return true;
19256 switch (parse_res)
19258 case AARCH_PARSE_MISSING_ARG:
19259 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19260 break;
19261 case AARCH_PARSE_INVALID_ARG:
19262 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
19263 aarch64_print_hint_for_core (str);
19264 break;
19265 case AARCH_PARSE_INVALID_FEATURE:
19266 error ("invalid feature modifier %qs of value %qs in "
19267 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19268 aarch64_print_hint_for_extensions (invalid_extension);
19269 break;
19270 default:
19271 gcc_unreachable ();
19274 return false;
19277 /* Handle the argument STR to the branch-protection= attribute. */
19279 static bool
19280 aarch64_handle_attr_branch_protection (const char* str)
19282 return aarch_validate_mbranch_protection (aarch64_branch_protect_types, str,
19283 "target(\"branch-protection=\")");
19286 /* Handle the argument STR to the tune= target attribute. */
19288 static bool
19289 aarch64_handle_attr_tune (const char *str)
19291 const struct processor *tmp_tune = NULL;
19292 enum aarch_parse_opt_result parse_res
19293 = aarch64_parse_tune (str, &tmp_tune);
19295 if (parse_res == AARCH_PARSE_OK)
19297 gcc_assert (tmp_tune);
19298 selected_tune = tmp_tune->ident;
19299 return true;
19302 switch (parse_res)
19304 case AARCH_PARSE_INVALID_ARG:
19305 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
19306 aarch64_print_hint_for_core (str);
19307 break;
19308 default:
19309 gcc_unreachable ();
19312 return false;
19315 /* Parse an architecture extensions target attribute string specified in STR.
19316 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19317 if successful. Update aarch64_isa_flags to reflect the ISA features
19318 modified. */
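/* For example (illustrative), __attribute__ ((target ("+nothing+simd")))
   first clears the architectural features and then enables SIMD together
   with its dependencies, whereas target ("+nosimd") only removes SIMD
   (and anything that requires it) from the current set.  */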
19320 static bool
19321 aarch64_handle_attr_isa_flags (char *str)
19323 enum aarch_parse_opt_result parse_res;
19324 auto isa_flags = aarch64_asm_isa_flags;
19326 /* We allow "+nothing" in the beginning to clear out all architectural
19327 features if the user wants to handpick specific features. */
19328 if (strncmp ("+nothing", str, 8) == 0)
19330 isa_flags = AARCH64_ISA_MODE;
19331 str += 8;
19334 std::string invalid_extension;
19335 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
19337 if (parse_res == AARCH_PARSE_OK)
19339 aarch64_set_asm_isa_flags (isa_flags);
19340 return true;
19343 switch (parse_res)
19345 case AARCH_PARSE_MISSING_ARG:
19346 error ("missing value in %<target()%> pragma or attribute");
19347 break;
19349 case AARCH_PARSE_INVALID_FEATURE:
19350 error ("invalid feature modifier %qs of value %qs in "
19351 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19352 break;
19354 default:
19355 gcc_unreachable ();
19358 return false;
19361 /* The target attributes that we support. On top of these we also support just
19362 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19363 handled explicitly in aarch64_process_one_target_attr. */
19365 static const struct aarch64_attribute_info aarch64_attributes[] =
19367 { "general-regs-only", aarch64_attr_mask, false, NULL,
19368 OPT_mgeneral_regs_only },
19369 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
19370 OPT_mfix_cortex_a53_835769 },
19371 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
19372 OPT_mfix_cortex_a53_843419 },
19373 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
19374 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
19375 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
19376 OPT_momit_leaf_frame_pointer },
19377 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
19378 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
19379 OPT_march_ },
19380 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
19381 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
19382 OPT_mtune_ },
19383 { "branch-protection", aarch64_attr_custom, false,
19384 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
19385 { "sign-return-address", aarch64_attr_enum, false, NULL,
19386 OPT_msign_return_address_ },
19387 { "outline-atomics", aarch64_attr_bool, true, NULL,
19388 OPT_moutline_atomics},
19389 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
19392 /* Parse ARG_STR which contains the definition of one target attribute.
19393 Show appropriate errors if any or return true if the attribute is valid. */
19395 static bool
19396 aarch64_process_one_target_attr (char *arg_str)
19398 bool invert = false;
19400 size_t len = strlen (arg_str);
19402 if (len == 0)
19404 error ("malformed %<target()%> pragma or attribute");
19405 return false;
19408 auto_vec<char, 32> buffer;
19409 buffer.safe_grow (len + 1);
19410 char *str_to_check = buffer.address ();
19411 memcpy (str_to_check, arg_str, len + 1);
19413 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19414 It is easier to detect and handle it explicitly here rather than going
19415 through the machinery for the rest of the target attributes in this
19416 function. */
19417 if (*str_to_check == '+')
19418 return aarch64_handle_attr_isa_flags (str_to_check);
19420 if (len > 3 && startswith (str_to_check, "no-"))
19422 invert = true;
19423 str_to_check += 3;
19425 char *arg = strchr (str_to_check, '=');
19427 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19428 and point ARG to "foo". */
19429 if (arg)
19431 *arg = '\0';
19432 arg++;
19434 const struct aarch64_attribute_info *p_attr;
19435 bool found = false;
19436 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
19438 /* If the names don't match up, or the user has given an argument
19439 to an attribute that doesn't accept one, or didn't give an argument
19440 to an attribute that expects one, fail to match. */
19441 if (strcmp (str_to_check, p_attr->name) != 0)
19442 continue;
19444 found = true;
19445 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
19446 || p_attr->attr_type == aarch64_attr_enum;
19448 if (attr_need_arg_p ^ (arg != NULL))
19450 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
19451 return false;
19454 /* If the name matches but the attribute does not allow "no-" versions
19455 then we can't match. */
19456 if (invert && !p_attr->allow_neg)
19458 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
19459 return false;
19462 switch (p_attr->attr_type)
19464 /* Has a custom handler registered.
19465 For example, cpu=, arch=, tune=. */
19466 case aarch64_attr_custom:
19467 gcc_assert (p_attr->handler);
19468 if (!p_attr->handler (arg))
19469 return false;
19470 break;
19472 /* Either set or unset a boolean option. */
19473 case aarch64_attr_bool:
19475 struct cl_decoded_option decoded;
19477 generate_option (p_attr->opt_num, NULL, !invert,
19478 CL_TARGET, &decoded);
19479 aarch64_handle_option (&global_options, &global_options_set,
19480 &decoded, input_location);
19481 break;
19483 /* Set or unset a bit in the target_flags. aarch64_handle_option
19484 should know what mask to apply given the option number. */
19485 case aarch64_attr_mask:
19487 struct cl_decoded_option decoded;
19488 /* We only need to specify the option number.
19489 aarch64_handle_option will know which mask to apply. */
19490 decoded.opt_index = p_attr->opt_num;
19491 decoded.value = !invert;
19492 aarch64_handle_option (&global_options, &global_options_set,
19493 &decoded, input_location);
19494 break;
19496 /* Use the option setting machinery to set an option to an enum. */
19497 case aarch64_attr_enum:
19499 gcc_assert (arg);
19500 bool valid;
19501 int value;
19502 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19503 &value, CL_TARGET);
19504 if (valid)
19506 set_option (&global_options, NULL, p_attr->opt_num, value,
19507 NULL, DK_UNSPECIFIED, input_location,
19508 global_dc);
19510 else
19512 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19514 break;
19516 default:
19517 gcc_unreachable ();
19521 /* If we reached here we either have found an attribute and validated
19522 it or didn't match any. If we matched an attribute but its arguments
19523 were malformed we will have returned false already. */
19524 return found;
19527 /* Count how many times the character C appears in
19528 NULL-terminated string STR. */
19530 static unsigned int
19531 num_occurences_in_str (char c, char *str)
19533 unsigned int res = 0;
19534 while (*str != '\0')
19536 if (*str == c)
19537 res++;
19539 str++;
19542 return res;
19545 /* Parse the tree in ARGS that contains the target attribute information
19546 and update the global target options space. */
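/* As an illustrative example, __attribute__ ((target ("arch=armv8.2-a,tune=cortex-a55")))
   is split at the commas into "arch=armv8.2-a" and "tune=cortex-a55", and
   each piece is passed to aarch64_process_one_target_attr.  An empty piece,
   as in "arch=armv8.2-a,,tune=cortex-a55", is caught by the comma-counting
   check below.  */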
19548 bool
19549 aarch64_process_target_attr (tree args)
19551 if (TREE_CODE (args) == TREE_LIST)
19555 tree head = TREE_VALUE (args);
19556 if (head)
19558 if (!aarch64_process_target_attr (head))
19559 return false;
19561 args = TREE_CHAIN (args);
19562 } while (args);
19564 return true;
19567 if (TREE_CODE (args) != STRING_CST)
19569 error ("attribute %<target%> argument not a string");
19570 return false;
19573 size_t len = strlen (TREE_STRING_POINTER (args));
19574 auto_vec<char, 32> buffer;
19575 buffer.safe_grow (len + 1);
19576 char *str_to_check = buffer.address ();
19577 memcpy (str_to_check, TREE_STRING_POINTER (args), len + 1);
19579 if (len == 0)
19581 error ("malformed %<target()%> pragma or attribute");
19582 return false;
19585 /* Used to catch empty strings between commas, i.e.
19586 attribute ((target ("attr1,,attr2"))). */
19587 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19589 /* Handle multiple target attributes separated by ','. */
19590 char *token = strtok_r (str_to_check, ",", &str_to_check);
19592 unsigned int num_attrs = 0;
19593 while (token)
19595 num_attrs++;
19596 if (!aarch64_process_one_target_attr (token))
19598 /* Check if token is possibly an arch extension without
19599 leading '+'. */
19600 aarch64_feature_flags isa_temp = 0;
19601 auto with_plus = std::string ("+") + token;
19602 enum aarch_parse_opt_result ext_res
19603 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19605 if (ext_res == AARCH_PARSE_OK)
19606 error ("arch extension %<%s%> should be prefixed by %<+%>",
19607 token);
19608 else
19609 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19610 return false;
19613 token = strtok_r (NULL, ",", &str_to_check);
19616 if (num_attrs != num_commas + 1)
19618 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19619 return false;
19622 return true;
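/* Illustrative only (the extension and CPU names are plausible examples,
   not a claim about any particular configuration): the parser above
   accepts comma-separated strings such as

     __attribute__ ((target ("arch=armv8-a+crc,tune=cortex-a72"))) void f (void);
     #pragma GCC target ("no-outline-atomics")

   It rejects empty elements as in "attr1,,attr2" via the comma count, and
   for a bare extension name such as "sve" it suggests writing "+sve".  */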
19625 static bool aarch64_process_target_version_attr (tree args);
19627 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19628 process attribute ((target ("..."))). */
19630 static bool
19631 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19633 struct cl_target_option cur_target;
19634 bool ret;
19635 tree old_optimize;
19636 tree new_target, new_optimize;
19637 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19639 /* If what we're processing is the current pragma string then the
19640 target option node is already stored in target_option_current_node
19641 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19642 having to re-parse the string. This is especially useful to keep
19643 arm_neon.h compile times down since that header contains a lot
19644 of intrinsics enclosed in pragmas. */
19645 if (!existing_target && args == current_target_pragma)
19647 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19648 return true;
19650 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19652 old_optimize
19653 = build_optimization_node (&global_options, &global_options_set);
19654 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19656 /* If the function changed the optimization levels as well as setting
19657 target options, start with the optimizations specified. */
19658 if (func_optimize && func_optimize != old_optimize)
19659 cl_optimization_restore (&global_options, &global_options_set,
19660 TREE_OPTIMIZATION (func_optimize));
19662 /* Save the current target options to restore at the end. */
19663 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19665 /* If fndecl already has some target attributes applied to it, unpack
19666 them so that we add this attribute on top of them, rather than
19667 overwriting them. */
19668 if (existing_target)
19670 struct cl_target_option *existing_options
19671 = TREE_TARGET_OPTION (existing_target);
19673 if (existing_options)
19674 cl_target_option_restore (&global_options, &global_options_set,
19675 existing_options);
19677 else
19678 cl_target_option_restore (&global_options, &global_options_set,
19679 TREE_TARGET_OPTION (target_option_current_node));
19681 ret = aarch64_process_target_attr (args);
19682 if (ret)
19684 tree version_attr = lookup_attribute ("target_version",
19685 DECL_ATTRIBUTES (fndecl));
19686 if (version_attr != NULL_TREE)
19688 /* Reapply any target_version attribute after target attribute.
19689 This should be equivalent to applying the target_version once
19690 after processing all target attributes. */
19691 tree version_args = TREE_VALUE (version_attr);
19692 ret = aarch64_process_target_version_attr (version_args);
19696 /* Set up any additional state. */
19697 if (ret)
19699 aarch64_override_options_internal (&global_options);
19700 new_target = build_target_option_node (&global_options,
19701 &global_options_set);
19703 else
19704 new_target = NULL;
19706 new_optimize = build_optimization_node (&global_options,
19707 &global_options_set);
19709 if (fndecl && ret)
19711 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19713 if (old_optimize != new_optimize)
19714 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19717 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19719 if (old_optimize != new_optimize)
19720 cl_optimization_restore (&global_options, &global_options_set,
19721 TREE_OPTIMIZATION (old_optimize));
19722 return ret;
19725 typedef unsigned long long aarch64_fmv_feature_mask;
19727 typedef struct
19729 const char *name;
19730 aarch64_fmv_feature_mask feature_mask;
19731 aarch64_feature_flags opt_flags;
19732 } aarch64_fmv_feature_datum;
19734 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19735 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19737 /* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
19738 feature_deps name. */
19739 #define FEAT_RDMA FEAT_RDM
19741 /* FMV features are listed in priority order, to make it easier to sort target
19742 strings. */
19743 static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
19744 #include "config/aarch64/aarch64-option-extensions.def"
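/* As a sketch of how this table is populated (the entry below is
   hypothetical; the real ones live in the .def file): a line such as

     AARCH64_FMV_FEATURE ("sve", SVE, (AARCH64_FL_SVE))

   expands via the macro above to

     {"sve", 1ULL << FEAT_SVE, ::feature_deps::fmv_deps_SVE},

   i.e. the feature's FMV priority bit plus the ISA flags it implies;
   the third macro argument is unused by this expansion.  */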
19747 /* Parse a function multiversioning feature string STR, as found in a
19748 target_version or target_clones attribute.
19750 If ISA_FLAGS is nonnull, then update it with the specified architecture
19751 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19752 representing the set of features explicitly specified in the feature string.
19753 Return an aarch_parse_opt_result describing the result.
19755 When the STR string contains an invalid or duplicate extension, a copy of
19756 the extension string is created and stored to INVALID_EXTENSION. */
19758 static enum aarch_parse_opt_result
19759 aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
19760 aarch64_fmv_feature_mask *feature_mask,
19761 std::string *invalid_extension)
19763 if (feature_mask)
19764 *feature_mask = 0ULL;
19766 if (strcmp (str, "default") == 0)
19767 return AARCH_PARSE_OK;
19769 while (str != NULL && *str != 0)
19771 const char *ext;
19772 size_t len;
19774 ext = strchr (str, '+');
19776 if (ext != NULL)
19777 len = ext - str;
19778 else
19779 len = strlen (str);
19781 if (len == 0)
19782 return AARCH_PARSE_MISSING_ARG;
19784 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19785 int i;
19786 for (i = 0; i < num_features; i++)
19788 if (strlen (aarch64_fmv_feature_data[i].name) == len
19789 && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
19791 if (isa_flags)
19792 *isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
19793 if (feature_mask)
19795 auto old_feature_mask = *feature_mask;
19796 *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
19797 if (*feature_mask == old_feature_mask)
19799 /* Duplicate feature. */
19800 if (invalid_extension)
19801 *invalid_extension = std::string (str, len);
19802 return AARCH_PARSE_DUPLICATE_FEATURE;
19805 break;
19809 if (i == num_features)
19811 /* Feature not found in list. */
19812 if (invalid_extension)
19813 *invalid_extension = std::string (str, len);
19814 return AARCH_PARSE_INVALID_FEATURE;
19817 str = ext;
19818 if (str)
19819 /* Skip over the next '+'. */
19820 str++;
19823 return AARCH_PARSE_OK;
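/* Example of the strings this parser accepts (feature names illustrative):
   for a declaration like

     __attribute__ ((target_version ("sve+dotprod"))) int f (void);

   the string is split at each '+', each component is looked up in
   aarch64_fmv_feature_data and its feature_mask bit and opt_flags are
   accumulated; a repeated component yields AARCH_PARSE_DUPLICATE_FEATURE,
   an unknown one AARCH_PARSE_INVALID_FEATURE, and the plain string
   "default" returns immediately with an empty mask.  */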
19826 /* Parse the tree in ARGS that contains the target_version attribute
19827 information and update the global target options space. */
19829 static bool
19830 aarch64_process_target_version_attr (tree args)
19832 if (TREE_CODE (args) == TREE_LIST)
19834 if (TREE_CHAIN (args))
19836 error ("attribute %<target_version%> has multiple values");
19837 return false;
19839 args = TREE_VALUE (args);
19842 if (!args || TREE_CODE (args) != STRING_CST)
19844 error ("attribute %<target_version%> argument not a string");
19845 return false;
19848 const char *str = TREE_STRING_POINTER (args);
19850 enum aarch_parse_opt_result parse_res;
19851 auto isa_flags = aarch64_asm_isa_flags;
19853 std::string invalid_extension;
19854 parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
19855 &invalid_extension);
19857 if (parse_res == AARCH_PARSE_OK)
19859 aarch64_set_asm_isa_flags (isa_flags);
19860 return true;
19863 switch (parse_res)
19865 case AARCH_PARSE_MISSING_ARG:
19866 error ("missing value in %<target_version%> attribute");
19867 break;
19869 case AARCH_PARSE_INVALID_FEATURE:
19870 error ("invalid feature modifier %qs of value %qs in "
19871 "%<target_version%> attribute", invalid_extension.c_str (),
19872 str);
19873 break;
19875 case AARCH_PARSE_DUPLICATE_FEATURE:
19876 error ("duplicate feature modifier %qs of value %qs in "
19877 "%<target_version%> attribute", invalid_extension.c_str (),
19878 str);
19879 break;
19881 default:
19882 gcc_unreachable ();
19885 return false;
19888 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
19889 process attribute ((target_version ("..."))). */
19891 static bool
19892 aarch64_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
19894 struct cl_target_option cur_target;
19895 bool ret;
19896 tree new_target;
19897 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19899 /* Save the current target options to restore at the end. */
19900 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19902 /* If fndecl already has some target attributes applied to it, unpack
19903 them so that we add this attribute on top of them, rather than
19904 overwriting them. */
19905 if (existing_target)
19907 struct cl_target_option *existing_options
19908 = TREE_TARGET_OPTION (existing_target);
19910 if (existing_options)
19911 cl_target_option_restore (&global_options, &global_options_set,
19912 existing_options);
19914 else
19915 cl_target_option_restore (&global_options, &global_options_set,
19916 TREE_TARGET_OPTION (target_option_current_node));
19918 ret = aarch64_process_target_version_attr (args);
19920 /* Set up any additional state. */
19921 if (ret)
19923 aarch64_override_options_internal (&global_options);
19924 new_target = build_target_option_node (&global_options,
19925 &global_options_set);
19927 else
19928 new_target = NULL;
19930 if (fndecl && ret)
19931 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19933 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19935 return ret;
19938 /* This parses the attribute arguments to target_version in DECL and computes
19939 the feature mask required to select those targets. No adjustments are made to
19940 add or remove redundant feature requirements. */
19942 static aarch64_fmv_feature_mask
19943 get_feature_mask_for_version (tree decl)
19945 tree version_attr = lookup_attribute ("target_version",
19946 DECL_ATTRIBUTES (decl));
19947 if (version_attr == NULL)
19948 return 0;
19950 const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
19951 (version_attr)));
19952 enum aarch_parse_opt_result parse_res;
19953 aarch64_fmv_feature_mask feature_mask;
19955 parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
19956 NULL);
19958 /* We should have detected any errors before getting here. */
19959 gcc_assert (parse_res == AARCH_PARSE_OK);
19961 return feature_mask;
19964 /* Compare priorities of two feature masks. Return:
19965 1: mask1 is higher priority
19966 -1: mask2 is higher priority
19967 0: masks are equal. */
19969 static int
19970 compare_feature_masks (aarch64_fmv_feature_mask mask1,
19971 aarch64_fmv_feature_mask mask2)
19973 int pop1 = popcount_hwi (mask1);
19974 int pop2 = popcount_hwi (mask2);
19975 if (pop1 > pop2)
19976 return 1;
19977 if (pop2 > pop1)
19978 return -1;
19980 auto diff_mask = mask1 ^ mask2;
19981 if (diff_mask == 0ULL)
19982 return 0;
19983 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19984 for (int i = num_features - 1; i >= 0; i--)
19986 auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
19987 if (diff_mask & bit_mask)
19988 return (mask1 & bit_mask) ? 1 : -1;
19990 gcc_unreachable();
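/* Worked example (bit positions illustrative): if mask1 = 0b0101 and
   mask2 = 0b0110, both have two bits set, so the popcount test ties.
   The differing bits are mask1 ^ mask2 = 0b0011, and the loop scans
   aarch64_fmv_feature_data from the end, so whichever mask owns the
   differing feature that appears later in the priority-ordered table
   wins.  */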
19993 /* Compare priorities of two version decls. */
19995 int
19996 aarch64_compare_version_priority (tree decl1, tree decl2)
19998 auto mask1 = get_feature_mask_for_version (decl1);
19999 auto mask2 = get_feature_mask_for_version (decl2);
20001 return compare_feature_masks (mask1, mask2);
20004 /* Build the struct __ifunc_arg_t type:
20006 struct __ifunc_arg_t
20008 unsigned long _size; // Size of the struct, so it can grow.
20009 unsigned long _hwcap;
20010 unsigned long _hwcap2;
20014 static tree
20015 build_ifunc_arg_type ()
20017 tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
20018 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20019 get_identifier ("_size"),
20020 long_unsigned_type_node);
20021 tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20022 get_identifier ("_hwcap"),
20023 long_unsigned_type_node);
20024 tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20025 get_identifier ("_hwcap2"),
20026 long_unsigned_type_node);
20028 DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
20029 DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
20030 DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
20032 TYPE_FIELDS (ifunc_arg_type) = field1;
20033 DECL_CHAIN (field1) = field2;
20034 DECL_CHAIN (field2) = field3;
20036 layout_type (ifunc_arg_type);
20038 tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
20039 tree pointer_type = build_pointer_type (const_type);
20041 return pointer_type;
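/* The tree built above corresponds roughly to the C type
   'const struct __ifunc_arg_t *', i.e. the type of the second argument
   an AArch64 ifunc resolver receives alongside the hwcap word.  */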
20044 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20045 suffixes. */
20047 tree
20048 aarch64_mangle_decl_assembler_name (tree decl, tree id)
20050 /* For function version, add the target suffix to the assembler name. */
20051 if (TREE_CODE (decl) == FUNCTION_DECL
20052 && DECL_FUNCTION_VERSIONED (decl))
20054 aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);
20056 std::string name = IDENTIFIER_POINTER (id);
20058 /* For the default version, append ".default". */
20059 if (feature_mask == 0ULL)
20061 name += ".default";
20062 return get_identifier (name.c_str());
20065 name += "._";
20067 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
20068 for (int i = 0; i < num_features; i++)
20070 if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
20072 name += "M";
20073 name += aarch64_fmv_feature_data[i].name;
20077 if (DECL_ASSEMBLER_NAME_SET_P (decl))
20078 SET_DECL_RTL (decl, NULL);
20080 id = get_identifier (name.c_str());
20082 return id;
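/* For example (feature names illustrative): the default version of 'foo'
   is renamed 'foo.default', while a version whose mask includes the "sve"
   and "dotprod" bits becomes something like 'foo._MdotprodMsve', with one
   'M'-prefixed feature name appended per set bit, in the order the
   features appear in aarch64_fmv_feature_data.  */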
20085 /* Return an identifier for the assembler name of a versioned function, with
20086 SUFFIX appended. This is computed by taking the default version's assembler
20087 name and stripping off any ".default" suffix that has already been appended. */
20089 static tree
20090 get_suffixed_assembler_name (tree default_decl, const char *suffix)
20092 std::string name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl));
20094 auto size = name.size ();
20095 if (size >= 8 && name.compare (size - 8, 8, ".default") == 0)
20096 name.resize (size - 8);
20097 name += suffix;
20098 return get_identifier (name.c_str());
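/* E.g. with SUFFIX ".resolver", an assembler name "foo.default" becomes
   "foo.resolver", and a plain "foo" likewise becomes "foo.resolver".  */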
20101 /* Make the resolver function decl to dispatch the versions of
20102 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20103 ifunc alias that will point to the created resolver. Create an
20104 empty basic block in the resolver and store the pointer in
20105 EMPTY_BB. Return the decl of the resolver function. */
20107 static tree
20108 make_resolver_func (const tree default_decl,
20109 const tree ifunc_alias_decl,
20110 basic_block *empty_bb)
20112 tree decl, type, t;
20114 /* Create resolver function name based on default_decl. We need to remove an
20115 existing ".default" suffix if this has already been appended. */
20116 tree decl_name = get_suffixed_assembler_name (default_decl, ".resolver");
20117 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
20119 /* The resolver function should have signature
20120 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20121 type = build_function_type_list (ptr_type_node,
20122 uint64_type_node,
20123 build_ifunc_arg_type (),
20124 NULL_TREE);
20126 decl = build_fn_decl (resolver_name, type);
20127 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
20129 DECL_NAME (decl) = decl_name;
20130 TREE_USED (decl) = 1;
20131 DECL_ARTIFICIAL (decl) = 1;
20132 DECL_IGNORED_P (decl) = 1;
20133 TREE_PUBLIC (decl) = 0;
20134 DECL_UNINLINABLE (decl) = 1;
20136 /* Resolver is not external, body is generated. */
20137 DECL_EXTERNAL (decl) = 0;
20138 DECL_EXTERNAL (ifunc_alias_decl) = 0;
20140 DECL_CONTEXT (decl) = NULL_TREE;
20141 DECL_INITIAL (decl) = make_node (BLOCK);
20142 DECL_STATIC_CONSTRUCTOR (decl) = 0;
20144 if (DECL_COMDAT_GROUP (default_decl)
20145 || TREE_PUBLIC (default_decl))
20147 /* In this case, each translation unit with a call to this
20148 versioned function will put out a resolver. Ensure it
20149 is comdat to keep just one copy. */
20150 DECL_COMDAT (decl) = 1;
20151 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
20153 else
20154 TREE_PUBLIC (ifunc_alias_decl) = 0;
20156 /* Build result decl and add to function_decl. */
20157 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
20158 DECL_CONTEXT (t) = decl;
20159 DECL_ARTIFICIAL (t) = 1;
20160 DECL_IGNORED_P (t) = 1;
20161 DECL_RESULT (decl) = t;
20163 /* Build parameter decls and add to function_decl. */
20164 tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20165 get_identifier ("hwcap"),
20166 uint64_type_node);
20167 tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20168 get_identifier ("arg"),
20169 build_ifunc_arg_type());
20170 DECL_CONTEXT (arg1) = decl;
20171 DECL_CONTEXT (arg2) = decl;
20172 DECL_ARTIFICIAL (arg1) = 1;
20173 DECL_ARTIFICIAL (arg2) = 1;
20174 DECL_IGNORED_P (arg1) = 1;
20175 DECL_IGNORED_P (arg2) = 1;
20176 DECL_ARG_TYPE (arg1) = uint64_type_node;
20177 DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
20178 DECL_ARGUMENTS (decl) = arg1;
20179 TREE_CHAIN (arg1) = arg2;
20181 gimplify_function_tree (decl);
20182 push_cfun (DECL_STRUCT_FUNCTION (decl));
20183 *empty_bb = init_lowered_empty_function (decl, false,
20184 profile_count::uninitialized ());
20186 cgraph_node::add_new_function (decl, true);
20187 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
20189 pop_cfun ();
20191 gcc_assert (ifunc_alias_decl != NULL);
20192 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20193 DECL_ATTRIBUTES (ifunc_alias_decl)
20194 = make_attribute ("ifunc", resolver_name,
20195 DECL_ATTRIBUTES (ifunc_alias_decl));
20197 /* Create the alias for dispatch to resolver here. */
20198 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
20199 return decl;
20202 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20203 to return a pointer to VERSION_DECL if all feature bits specified in
20204 FEATURE_MASK are not set in MASK_VAR. This function will be called during
20205 version dispatch to decide which function version to execute. It returns
20206 the basic block at the end, to which more conditions can be added. */
20207 static basic_block
20208 add_condition_to_bb (tree function_decl, tree version_decl,
20209 aarch64_fmv_feature_mask feature_mask,
20210 tree mask_var, basic_block new_bb)
20212 gimple *return_stmt;
20213 tree convert_expr, result_var;
20214 gimple *convert_stmt;
20215 gimple *if_else_stmt;
20217 basic_block bb1, bb2, bb3;
20218 edge e12, e23;
20220 gimple_seq gseq;
20222 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
20224 gcc_assert (new_bb != NULL);
20225 gseq = bb_seq (new_bb);
20227 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
20228 build_fold_addr_expr (version_decl));
20229 result_var = create_tmp_var (ptr_type_node);
20230 convert_stmt = gimple_build_assign (result_var, convert_expr);
20231 return_stmt = gimple_build_return (result_var);
20233 if (feature_mask == 0ULL)
20235 /* Default version. */
20236 gimple_seq_add_stmt (&gseq, convert_stmt);
20237 gimple_seq_add_stmt (&gseq, return_stmt);
20238 set_bb_seq (new_bb, gseq);
20239 gimple_set_bb (convert_stmt, new_bb);
20240 gimple_set_bb (return_stmt, new_bb);
20241 pop_cfun ();
20242 return new_bb;
20245 tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
20246 tree and_expr = build2 (BIT_AND_EXPR,
20247 long_long_unsigned_type_node,
20248 mask_var,
20249 build_int_cst (long_long_unsigned_type_node,
20250 feature_mask));
20251 gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
20252 gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
20253 gimple_set_bb (and_stmt, new_bb);
20254 gimple_seq_add_stmt (&gseq, and_stmt);
20256 tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
20257 if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
20258 NULL_TREE, NULL_TREE);
20259 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
20260 gimple_set_bb (if_else_stmt, new_bb);
20261 gimple_seq_add_stmt (&gseq, if_else_stmt);
20263 gimple_seq_add_stmt (&gseq, convert_stmt);
20264 gimple_seq_add_stmt (&gseq, return_stmt);
20265 set_bb_seq (new_bb, gseq);
20267 bb1 = new_bb;
20268 e12 = split_block (bb1, if_else_stmt);
20269 bb2 = e12->dest;
20270 e12->flags &= ~EDGE_FALLTHRU;
20271 e12->flags |= EDGE_TRUE_VALUE;
20273 e23 = split_block (bb2, return_stmt);
20275 gimple_set_bb (convert_stmt, bb2);
20276 gimple_set_bb (return_stmt, bb2);
20278 bb3 = e23->dest;
20279 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
20281 remove_edge (e23);
20282 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
20284 pop_cfun ();
20286 return bb3;
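/* Shape of the code added for one non-default version, as illustrative
   pseudo-GIMPLE (names invented):

     _and = MASK_VAR & FEATURE_MASK;
     if (_and == 0)
       return (void *) &VERSION_DECL;
     // otherwise fall through to the returned block

   Since the caller passes the bitwise NOT of the runtime feature word in
   MASK_VAR, a zero result means every required feature bit is present.  */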
20289 /* This function generates the dispatch function for
20290 multi-versioned functions. DISPATCH_DECL is the function which will
20291 contain the dispatch logic. FNDECLS are the function choices for
20292 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20293 in DISPATCH_DECL in which the dispatch code is generated. */
20295 static int
20296 dispatch_function_versions (tree dispatch_decl,
20297 void *fndecls_p,
20298 basic_block *empty_bb)
20300 gimple *ifunc_cpu_init_stmt;
20301 gimple_seq gseq;
20302 vec<tree> *fndecls;
20304 gcc_assert (dispatch_decl != NULL
20305 && fndecls_p != NULL
20306 && empty_bb != NULL);
20308 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
20310 gseq = bb_seq (*empty_bb);
20311 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20312 constructors, so explicitly call __init_cpu_features_resolver here. */
20313 tree init_fn_type = build_function_type_list (void_type_node,
20314 long_unsigned_type_node,
20315 build_ifunc_arg_type(),
20316 NULL);
20317 tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
20318 tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
20319 init_fn_id, init_fn_type);
20320 tree arg1 = DECL_ARGUMENTS (dispatch_decl);
20321 tree arg2 = TREE_CHAIN (arg1);
20322 ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
20323 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
20324 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
20326 /* Build the struct type for __aarch64_cpu_features. */
20327 tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
20328 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20329 get_identifier ("features"),
20330 long_long_unsigned_type_node);
20331 DECL_FIELD_CONTEXT (field1) = global_type;
20332 TYPE_FIELDS (global_type) = field1;
20333 layout_type (global_type);
20335 tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
20336 get_identifier ("__aarch64_cpu_features"),
20337 global_type);
20338 DECL_EXTERNAL (global_var) = 1;
20339 tree mask_var = create_tmp_var (long_long_unsigned_type_node);
20341 tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
20342 global_var, field1, NULL_TREE);
20343 gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
20344 gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
20345 gimple_set_bb (component_stmt, *empty_bb);
20346 gimple_seq_add_stmt (&gseq, component_stmt);
20348 tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
20349 gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
20350 gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
20351 gimple_set_bb (not_stmt, *empty_bb);
20352 gimple_seq_add_stmt (&gseq, not_stmt);
20354 set_bb_seq (*empty_bb, gseq);
20356 pop_cfun ();
20358 /* fndecls_p is actually a vector. */
20359 fndecls = static_cast<vec<tree> *> (fndecls_p);
20361 /* At least one more version other than the default. */
20362 unsigned int num_versions = fndecls->length ();
20363 gcc_assert (num_versions >= 2);
20365 struct function_version_info
20367 tree version_decl;
20368 aarch64_fmv_feature_mask feature_mask;
20369 } *function_versions;
20371 function_versions = (struct function_version_info *)
20372 XNEWVEC (struct function_version_info, (num_versions));
20374 unsigned int actual_versions = 0;
20376 for (tree version_decl : *fndecls)
20378 aarch64_fmv_feature_mask feature_mask;
20379 /* Get attribute string, parse it and find the right features. */
20380 feature_mask = get_feature_mask_for_version (version_decl);
20381 function_versions [actual_versions].version_decl = version_decl;
20382 function_versions [actual_versions].feature_mask = feature_mask;
20383 actual_versions++;
20386 auto compare_feature_version_info = [](const void *p1, const void *p2) {
20387 const function_version_info v1 = *(const function_version_info *)p1;
20388 const function_version_info v2 = *(const function_version_info *)p2;
20389 return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
20392 /* Sort the versions according to descending order of dispatch priority. */
20393 qsort (function_versions, actual_versions,
20394 sizeof (struct function_version_info), compare_feature_version_info);
20396 for (unsigned int i = 0; i < actual_versions; ++i)
20397 *empty_bb = add_condition_to_bb (dispatch_decl,
20398 function_versions[i].version_decl,
20399 function_versions[i].feature_mask,
20400 mask_var,
20401 *empty_bb);
20403 free (function_versions);
20404 return 0;
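/* Taken together with add_condition_to_bb, the generated resolver behaves
   roughly like the following pseudo-C (names and masks invented):

     void *resolver (uint64_t hwcap, const __ifunc_arg_t *arg)
     {
       __init_cpu_features_resolver (hwcap, arg);
       unsigned long long mask = ~__aarch64_cpu_features.features;
       if ((mask & MASK_HIGHEST) == 0) return foo_highest;
       if ((mask & MASK_NEXT) == 0) return foo_next;
       return foo_default;
     }

   with the versions tested in decreasing dispatch priority.  */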
20407 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20409 tree
20410 aarch64_generate_version_dispatcher_body (void *node_p)
20412 tree resolver_decl;
20413 basic_block empty_bb;
20414 tree default_ver_decl;
20415 struct cgraph_node *versn;
20416 struct cgraph_node *node;
20418 struct cgraph_function_version_info *node_version_info = NULL;
20419 struct cgraph_function_version_info *versn_info = NULL;
20421 node = (cgraph_node *)node_p;
20423 node_version_info = node->function_version ();
20424 gcc_assert (node->dispatcher_function
20425 && node_version_info != NULL);
20427 if (node_version_info->dispatcher_resolver)
20428 return node_version_info->dispatcher_resolver;
20430 /* The first version in the chain corresponds to the default version. */
20431 default_ver_decl = node_version_info->next->this_node->decl;
20433 /* node is going to be an alias, so remove the finalized bit. */
20434 node->definition = false;
20436 resolver_decl = make_resolver_func (default_ver_decl,
20437 node->decl, &empty_bb);
20439 node_version_info->dispatcher_resolver = resolver_decl;
20441 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
20443 auto_vec<tree, 2> fn_ver_vec;
20445 for (versn_info = node_version_info->next; versn_info;
20446 versn_info = versn_info->next)
20448 versn = versn_info->this_node;
20449 /* Check for virtual functions here again, as by this time it should
20450 have been determined if this function needs a vtable index or
20451 not. This happens for methods in derived classes that override
20452 virtual methods in base classes but are not explicitly marked as
20453 virtual. */
20454 if (DECL_VINDEX (versn->decl))
20455 sorry ("virtual function multiversioning not supported");
20457 fn_ver_vec.safe_push (versn->decl);
20460 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
20461 cgraph_edge::rebuild_edges ();
20462 pop_cfun ();
20464 /* Fix up symbol names. First we need to obtain the base name, which may
20465 have already been mangled. */
20466 tree base_name = get_suffixed_assembler_name (default_ver_decl, "");
20468 /* We need to redo the version mangling on the non-default versions for the
20469 target_clones case. Redoing the mangling for the target_version case is
20470 redundant but does no harm. We need to skip the default version, because
20471 expand_clones will append ".default" later; fortunately that suffix is the
20472 one we want anyway. */
20473 for (versn_info = node_version_info->next->next; versn_info;
20474 versn_info = versn_info->next)
20476 tree version_decl = versn_info->this_node->decl;
20477 tree name = aarch64_mangle_decl_assembler_name (version_decl,
20478 base_name);
20479 symtab->change_decl_assembler_name (version_decl, name);
20482 /* We also need to use the base name for the ifunc declaration. */
20483 symtab->change_decl_assembler_name (node->decl, base_name);
20485 return resolver_decl;
20488 /* Make a dispatcher declaration for the multi-versioned function DECL.
20489 Calls to the function DECL will be replaced with calls to the dispatcher
20490 by the front-end. Returns the decl of the dispatcher function. */
20492 tree
20493 aarch64_get_function_versions_dispatcher (void *decl)
20495 tree fn = (tree) decl;
20496 struct cgraph_node *node = NULL;
20497 struct cgraph_node *default_node = NULL;
20498 struct cgraph_function_version_info *node_v = NULL;
20499 struct cgraph_function_version_info *first_v = NULL;
20501 tree dispatch_decl = NULL;
20503 struct cgraph_function_version_info *default_version_info = NULL;
20505 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
20507 node = cgraph_node::get (fn);
20508 gcc_assert (node != NULL);
20510 node_v = node->function_version ();
20511 gcc_assert (node_v != NULL);
20513 if (node_v->dispatcher_resolver != NULL)
20514 return node_v->dispatcher_resolver;
20516 /* Find the default version and make it the first node. */
20517 first_v = node_v;
20518 /* Go to the beginning of the chain. */
20519 while (first_v->prev != NULL)
20520 first_v = first_v->prev;
20521 default_version_info = first_v;
20522 while (default_version_info != NULL)
20524 if (get_feature_mask_for_version
20525 (default_version_info->this_node->decl) == 0ULL)
20526 break;
20527 default_version_info = default_version_info->next;
20530 /* If there is no default node, just return NULL. */
20531 if (default_version_info == NULL)
20532 return NULL;
20534 /* Make default info the first node. */
20535 if (first_v != default_version_info)
20537 default_version_info->prev->next = default_version_info->next;
20538 if (default_version_info->next)
20539 default_version_info->next->prev = default_version_info->prev;
20540 first_v->prev = default_version_info;
20541 default_version_info->next = first_v;
20542 default_version_info->prev = NULL;
20545 default_node = default_version_info->this_node;
20547 if (targetm.has_ifunc_p ())
20549 struct cgraph_function_version_info *it_v = NULL;
20550 struct cgraph_node *dispatcher_node = NULL;
20551 struct cgraph_function_version_info *dispatcher_version_info = NULL;
20553 /* Right now, the dispatching is done via ifunc. */
20554 dispatch_decl = make_dispatcher_decl (default_node->decl);
20555 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
20557 dispatcher_node = cgraph_node::get_create (dispatch_decl);
20558 gcc_assert (dispatcher_node != NULL);
20559 dispatcher_node->dispatcher_function = 1;
20560 dispatcher_version_info
20561 = dispatcher_node->insert_new_function_version ();
20562 dispatcher_version_info->next = default_version_info;
20563 dispatcher_node->definition = 1;
20565 /* Set the dispatcher for all the versions. */
20566 it_v = default_version_info;
20567 while (it_v != NULL)
20569 it_v->dispatcher_resolver = dispatch_decl;
20570 it_v = it_v->next;
20573 else
20575 error_at (DECL_SOURCE_LOCATION (default_node->decl),
20576 "multiversioning needs %<ifunc%> which is not supported "
20577 "on this target");
20580 return dispatch_decl;
20583 /* This function returns true if FN1 and FN2 are versions of the same function,
20584 that is, the target_version attributes of the function decls are different.
20585 This assumes that FN1 and FN2 have the same signature. */
20587 bool
20588 aarch64_common_function_versions (tree fn1, tree fn2)
20590 if (TREE_CODE (fn1) != FUNCTION_DECL
20591 || TREE_CODE (fn2) != FUNCTION_DECL)
20592 return false;
20594 return (aarch64_compare_version_priority (fn1, fn2) != 0);
20597 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20598 rather than an opt-in list. */
20600 static bool
20601 aarch64_function_attribute_inlinable_p (const_tree fndecl)
20603 /* A function that has local SME state cannot be inlined into its caller,
20604 since we only support managing PSTATE.ZA switches at function scope. */
20605 return (!aarch64_fndecl_has_new_state (fndecl, "za")
20606 && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
20609 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20610 tri-bool options (yes, no, don't care) and the default value is
20611 DEF, determine whether to reject inlining. */
20613 static bool
20614 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
20615 int dont_care, int def)
20617 /* If the callee doesn't care, always allow inlining. */
20618 if (callee == dont_care)
20619 return true;
20621 /* If the caller doesn't care, always allow inlining. */
20622 if (caller == dont_care)
20623 return true;
20625 /* Otherwise, allow inlining if either the callee and caller values
20626 agree, or if the callee is using the default value. */
20627 return (callee == caller || callee == def);
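/* For instance, with DONT_CARE == 2 and DEF == 1 (as in the
   -momit-leaf-frame-pointer check below): callee == 2 always allows
   inlining, callee == 1 inlines into any caller because it matches the
   default, and callee == 0 only inlines into callers that also use 0 or
   don't care.  */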
20630 /* Bit allocations for ipa_fn_summary::target_info. */
20632 /* Set if the function contains a stmt that relies on the function's
20633 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20634 Not meaningful for streaming-compatible functions. */
20635 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
20637 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20638 have ZA state. */
20639 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
20640 constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20642 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20644 static bool
20645 aarch64_need_ipa_fn_target_info (const_tree, unsigned int &)
20647 /* We could in principle skip this for streaming-compatible functions
20648 that have ZA state, but that's a rare combination. */
20649 return true;
20652 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20654 static bool
20655 aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
20657 if (auto *ga = dyn_cast<const gasm *> (stmt))
20659 /* We don't know what the asm does, so conservatively assume that
20660 it requires the function's current SM mode. */
20661 info |= AARCH64_IPA_SM_FIXED;
20662 for (unsigned int i = 0; i < gimple_asm_nclobbers (ga); ++i)
20664 tree op = gimple_asm_clobber_op (ga, i);
20665 const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
20666 if (strcmp (clobber, "za") == 0)
20667 info |= AARCH64_IPA_CLOBBERS_ZA;
20668 if (strcmp (clobber, "zt0") == 0)
20669 info |= AARCH64_IPA_CLOBBERS_ZT0;
20672 if (auto *call = dyn_cast<const gcall *> (stmt))
20674 if (gimple_call_builtin_p (call, BUILT_IN_MD))
20676 /* The attributes on AArch64 builtins are supposed to be accurate.
20677 If the function isn't marked streaming-compatible then it
20678 needs whichever SM mode it selects. */
20679 tree decl = gimple_call_fndecl (call);
20680 if (aarch64_fndecl_pstate_sm (decl) != 0)
20681 info |= AARCH64_IPA_SM_FIXED;
20684 return true;
20687 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20688 to inline CALLEE into CALLER based on target-specific info.
20689 Make sure that the caller and callee have compatible architectural
20690 features. Then go through the other possible target attributes
20691 and see if they can block inlining. Try not to reject always_inline
20692 callees unless they are incompatible architecturally. */
20694 static bool
20695 aarch64_can_inline_p (tree caller, tree callee)
20697 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
20698 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
20700 struct cl_target_option *caller_opts
20701 = TREE_TARGET_OPTION (caller_tree ? caller_tree
20702 : target_option_default_node);
20704 struct cl_target_option *callee_opts
20705 = TREE_TARGET_OPTION (callee_tree ? callee_tree
20706 : target_option_default_node);
20708 /* Callee's ISA flags should be a subset of the caller's. */
20709 auto caller_asm_isa = (caller_opts->x_aarch64_asm_isa_flags
20710 & ~AARCH64_FL_ISA_MODES);
20711 auto callee_asm_isa = (callee_opts->x_aarch64_asm_isa_flags
20712 & ~AARCH64_FL_ISA_MODES);
20713 if (callee_asm_isa & ~caller_asm_isa)
20714 return false;
20716 auto caller_isa = (caller_opts->x_aarch64_isa_flags
20717 & ~AARCH64_FL_ISA_MODES);
20718 auto callee_isa = (callee_opts->x_aarch64_isa_flags
20719 & ~AARCH64_FL_ISA_MODES);
20720 if (callee_isa & ~caller_isa)
20721 return false;
20723 /* Return true if the callee might have target_info property PROPERTY.
20724 The answer must be true unless we have positive proof to the contrary. */
20725 auto callee_has_property = [&](unsigned int property)
20727 if (ipa_fn_summaries)
20728 if (auto *summary = ipa_fn_summaries->get (cgraph_node::get (callee)))
20729 if (!(summary->target_info & property))
20730 return false;
20731 return true;
20734 /* Streaming-compatible code can be inlined into functions with any
20735 PSTATE.SM mode. Otherwise the caller and callee must agree on
20736 PSTATE.SM mode, unless we can prove that the callee is naturally
20737 streaming-compatible. */
20738 auto caller_sm = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20739 auto callee_sm = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20740 if (callee_sm
20741 && caller_sm != callee_sm
20742 && callee_has_property (AARCH64_IPA_SM_FIXED))
20743 return false;
20745 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20746 functions from being inlined into others. We also need to prevent
20747 inlining of shared-ZA functions into functions without ZA state,
20748 since this is an error condition.
20750 The only other problematic case for ZA is inlining a function that
20751 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20752 auto caller_za = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20753 auto callee_za = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20754 if (!caller_za && callee_za)
20755 return false;
20756 if (!callee_za
20757 && aarch64_fndecl_has_state (caller, "za")
20758 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
20759 return false;
20760 if (!callee_za
20761 && aarch64_fndecl_has_state (caller, "zt0")
20762 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
20763 return false;
20765 /* Allow non-strict-aligned functions to be inlined into strict-aligned
20766 ones, but not the other way around. */
20767 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
20768 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
20769 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
20770 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
20771 return false;
20773 bool always_inline = lookup_attribute ("always_inline",
20774 DECL_ATTRIBUTES (callee));
20776 /* If the architectural features match up and the callee is always_inline
20777 then the other attributes don't matter. */
20778 if (always_inline)
20779 return true;
20781 if (caller_opts->x_aarch64_cmodel_var
20782 != callee_opts->x_aarch64_cmodel_var)
20783 return false;
20785 if (caller_opts->x_aarch64_tls_dialect
20786 != callee_opts->x_aarch64_tls_dialect)
20787 return false;
20789 /* Honour explicit requests to work around errata. */
20790 if (!aarch64_tribools_ok_for_inlining_p (
20791 caller_opts->x_aarch64_fix_a53_err835769,
20792 callee_opts->x_aarch64_fix_a53_err835769,
20793 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
20794 return false;
20796 if (!aarch64_tribools_ok_for_inlining_p (
20797 caller_opts->x_aarch64_fix_a53_err843419,
20798 callee_opts->x_aarch64_fix_a53_err843419,
20799 2, TARGET_FIX_ERR_A53_843419))
20800 return false;
20802 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20803 caller and callee and they don't match up, reject inlining. */
20804 if (!aarch64_tribools_ok_for_inlining_p (
20805 caller_opts->x_flag_omit_leaf_frame_pointer,
20806 callee_opts->x_flag_omit_leaf_frame_pointer,
20807 2, 1))
20808 return false;
20810 /* If the callee has specific tuning overrides, respect them. */
20811 if (callee_opts->x_aarch64_override_tune_string != NULL
20812 && caller_opts->x_aarch64_override_tune_string == NULL)
20813 return false;
20815 /* If the user specified tuning override strings for the
20816 caller and callee and they don't match up, reject inlining.
20817 We just do a string compare here, we don't analyze the meaning
20818 of the string, as it would be too costly for little gain. */
20819 if (callee_opts->x_aarch64_override_tune_string
20820 && caller_opts->x_aarch64_override_tune_string
20821 && (strcmp (callee_opts->x_aarch64_override_tune_string,
20822 caller_opts->x_aarch64_override_tune_string) != 0))
20823 return false;
20825 return true;
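/* A concrete case of the ISA subset rule above (extension name
   illustrative): a callee compiled with an extra "+sve" cannot be inlined
   into a caller built without it, since the callee's ISA flags would not
   be a subset of the caller's; the reverse direction is allowed.  */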
20828 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
20829 hasn't been already. */
20831 arm_pcs
20832 aarch64_tlsdesc_abi_id ()
20834 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
20835 if (!tlsdesc_abi.initialized_p ())
20837 HARD_REG_SET full_reg_clobbers;
20838 CLEAR_HARD_REG_SET (full_reg_clobbers);
20839 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
20840 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
20841 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
20842 SET_HARD_REG_BIT (full_reg_clobbers, regno);
20843 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
20845 return ARM_PCS_TLSDESC;
20848 /* Return true if SYMBOL_REF X binds locally. */
20850 static bool
20851 aarch64_symbol_binds_local_p (const_rtx x)
20853 return (SYMBOL_REF_DECL (x)
20854 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
20855 : SYMBOL_REF_LOCAL_P (x));
20858 /* Return true if SYMBOL_REF X is thread-local. */
20859 static bool
20860 aarch64_tls_symbol_p (rtx x)
20862 if (! TARGET_HAVE_TLS)
20863 return false;
20865 x = strip_salt (x);
20866 if (!SYMBOL_REF_P (x))
20867 return false;
20869 return SYMBOL_REF_TLS_MODEL (x) != 0;
20872 /* Classify a TLS symbol into one of the TLS kinds. */
20873 enum aarch64_symbol_type
20874 aarch64_classify_tls_symbol (rtx x)
20876 enum tls_model tls_kind = tls_symbolic_operand_type (x);
20878 switch (tls_kind)
20880 case TLS_MODEL_GLOBAL_DYNAMIC:
20881 case TLS_MODEL_LOCAL_DYNAMIC:
20882 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
20884 case TLS_MODEL_INITIAL_EXEC:
20885 switch (aarch64_cmodel)
20887 case AARCH64_CMODEL_TINY:
20888 case AARCH64_CMODEL_TINY_PIC:
20889 return SYMBOL_TINY_TLSIE;
20890 default:
20891 return SYMBOL_SMALL_TLSIE;
20894 case TLS_MODEL_LOCAL_EXEC:
20895 if (aarch64_tls_size == 12)
20896 return SYMBOL_TLSLE12;
20897 else if (aarch64_tls_size == 24)
20898 return SYMBOL_TLSLE24;
20899 else if (aarch64_tls_size == 32)
20900 return SYMBOL_TLSLE32;
20901 else if (aarch64_tls_size == 48)
20902 return SYMBOL_TLSLE48;
20903 else
20904 gcc_unreachable ();
20906 case TLS_MODEL_EMULATED:
20907 case TLS_MODEL_NONE:
20908 return SYMBOL_FORCE_TO_MEM;
20910 default:
20911 gcc_unreachable ();
20915 /* Return the correct method for accessing X + OFFSET, where X is either
20916 a SYMBOL_REF or LABEL_REF. */
20918 enum aarch64_symbol_type
20919 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
20921 x = strip_salt (x);
20923 if (LABEL_REF_P (x))
20925 switch (aarch64_cmodel)
20927 case AARCH64_CMODEL_LARGE:
20928 return SYMBOL_FORCE_TO_MEM;
20930 case AARCH64_CMODEL_TINY_PIC:
20931 case AARCH64_CMODEL_TINY:
20932 return SYMBOL_TINY_ABSOLUTE;
20934 case AARCH64_CMODEL_SMALL_SPIC:
20935 case AARCH64_CMODEL_SMALL_PIC:
20936 case AARCH64_CMODEL_SMALL:
20937 return SYMBOL_SMALL_ABSOLUTE;
20939 default:
20940 gcc_unreachable ();
20944 if (SYMBOL_REF_P (x))
20946 if (aarch64_tls_symbol_p (x))
20947 return aarch64_classify_tls_symbol (x);
20949 switch (aarch64_cmodel)
20951 case AARCH64_CMODEL_TINY_PIC:
20952 case AARCH64_CMODEL_TINY:
20953 /* With -fPIC non-local symbols use the GOT. For orthogonality
20954 always use the GOT for extern weak symbols. */
20955 if ((flag_pic || SYMBOL_REF_WEAK (x))
20956 && !aarch64_symbol_binds_local_p (x))
20957 return SYMBOL_TINY_GOT;
20959 /* When we retrieve symbol + offset address, we have to make sure
20960 the offset does not cause overflow of the final address. But
20961 we have no way of knowing the address of symbol at compile time
20962 so we can't accurately say if the distance between the PC and
20963 symbol + offset is outside the addressable range of +/-1MB in the
20964 TINY code model. So we limit the maximum offset to +/-64KB and
20965 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
20966 If offset_within_block_p is true we allow larger offsets. */
20967 if (!(IN_RANGE (offset, -0x10000, 0x10000)
20968 || offset_within_block_p (x, offset)))
20969 return SYMBOL_FORCE_TO_MEM;
20971 return SYMBOL_TINY_ABSOLUTE;
20974 case AARCH64_CMODEL_SMALL_SPIC:
20975 case AARCH64_CMODEL_SMALL_PIC:
20976 case AARCH64_CMODEL_SMALL:
20977 if ((flag_pic || SYMBOL_REF_WEAK (x))
20978 && !aarch64_symbol_binds_local_p (x))
20979 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
20980 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
20982 /* Same reasoning as the tiny code model, but the offset cap here is
20983 1MB, allowing +/-3.9GB for the offset to the symbol. */
20984 if (!(IN_RANGE (offset, -0x100000, 0x100000)
20985 || offset_within_block_p (x, offset)))
20986 return SYMBOL_FORCE_TO_MEM;
20988 return SYMBOL_SMALL_ABSOLUTE;
20990 case AARCH64_CMODEL_LARGE:
20991 /* This is alright even in PIC code as the constant
20992 pool reference is always PC relative and within
20993 the same translation unit. */
20994 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
20995 return SYMBOL_SMALL_ABSOLUTE;
20996 else
20997 return SYMBOL_FORCE_TO_MEM;
20999 default:
21000 gcc_unreachable ();
21004 /* By default push everything into the constant pool. */
21005 return SYMBOL_FORCE_TO_MEM;
21008 bool
21009 aarch64_constant_address_p (rtx x)
21011 return (CONSTANT_P (x) && memory_address_p (DImode, x));
21014 bool
21015 aarch64_legitimate_pic_operand_p (rtx x)
21017 poly_int64 offset;
21018 x = strip_offset_and_salt (x, &offset);
21019 if (SYMBOL_REF_P (x))
21020 return false;
21022 return true;
21025 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
21026 that should be rematerialized rather than spilled. */
21028 static bool
21029 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
21031 /* Support CSE and rematerialization of common constants. */
21032 if (CONST_INT_P (x)
21033 || CONST_DOUBLE_P (x))
21034 return true;
21036 /* Only accept variable-length vector constants if they can be
21037 handled directly.
21039 ??? It would be possible (but complex) to handle rematerialization
21040 of other constants via secondary reloads. */
21041 if (!GET_MODE_SIZE (mode).is_constant ())
21042 return aarch64_simd_valid_immediate (x, NULL);
21044 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21045 least be forced to memory and loaded from there. */
21046 if (CONST_VECTOR_P (x))
21047 return !targetm.cannot_force_const_mem (mode, x);
21049 /* Do not allow vector struct mode constants for Advanced SIMD.
21050 We could support 0 and -1 easily, but they need support in
21051 aarch64-simd.md. */
21052 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21053 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21054 return false;
21056 if (GET_CODE (x) == HIGH)
21057 x = XEXP (x, 0);
21059 /* Accept polynomial constants that can be calculated by using the
21060 destination of a move as the sole temporary. Constants that
21061 require a second temporary cannot be rematerialized (they can't be
21062 forced to memory and also aren't legitimate constants). */
21063 poly_int64 offset;
21064 if (poly_int_rtx_p (x, &offset))
21065 return aarch64_offset_temporaries (false, offset) <= 1;
21067 /* If an offset is being added to something else, we need to allow the
21068 base to be moved into the destination register, meaning that there
21069 are no free temporaries for the offset. */
21070 x = strip_offset_and_salt (x, &offset);
21071 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
21072 return false;
21074 /* Do not allow const (plus (anchor_symbol, const_int)). */
21075 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
21076 return false;
21078 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21079 so spilling them is better than rematerialization. */
21080 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
21081 return true;
21083 /* Label references are always constant. */
21084 if (LABEL_REF_P (x))
21085 return true;
21087 return false;
21090 rtx
21091 aarch64_load_tp (rtx target)
21093 if (!target
21094 || GET_MODE (target) != Pmode
21095 || !register_operand (target, Pmode))
21096 target = gen_reg_rtx (Pmode);
21098 /* Can return in any reg. */
21099 emit_insn (gen_aarch64_load_tp_hard (target));
21100 return target;
21103 /* On AAPCS systems, this is the "struct __va_list". */
21104 static GTY(()) tree va_list_type;
21106 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
21107 Return the type to use as __builtin_va_list.
21109 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
21111 struct __va_list
21113 void *__stack;
21114 void *__gr_top;
21115 void *__vr_top;
21116 int __gr_offs;
21117 int __vr_offs;
21118 }; */
21120 static tree
21121 aarch64_build_builtin_va_list (void)
21123 tree va_list_name;
21124 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21126 /* Create the type. */
21127 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
21128 /* Give it the required name. */
21129 va_list_name = build_decl (BUILTINS_LOCATION,
21130 TYPE_DECL,
21131 get_identifier ("__va_list"),
21132 va_list_type);
21133 DECL_ARTIFICIAL (va_list_name) = 1;
21134 TYPE_NAME (va_list_type) = va_list_name;
21135 TYPE_STUB_DECL (va_list_type) = va_list_name;
21137 /* Create the fields. */
21138 f_stack = build_decl (BUILTINS_LOCATION,
21139 FIELD_DECL, get_identifier ("__stack"),
21140 ptr_type_node);
21141 f_grtop = build_decl (BUILTINS_LOCATION,
21142 FIELD_DECL, get_identifier ("__gr_top"),
21143 ptr_type_node);
21144 f_vrtop = build_decl (BUILTINS_LOCATION,
21145 FIELD_DECL, get_identifier ("__vr_top"),
21146 ptr_type_node);
21147 f_groff = build_decl (BUILTINS_LOCATION,
21148 FIELD_DECL, get_identifier ("__gr_offs"),
21149 integer_type_node);
21150 f_vroff = build_decl (BUILTINS_LOCATION,
21151 FIELD_DECL, get_identifier ("__vr_offs"),
21152 integer_type_node);
21154 /* Tell tree-stdarg pass about our internal offset fields.
21155 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
21156 purposes, to identify whether the code is updating the va_list internal
21157 offset fields in an irregular way. */
21158 va_list_gpr_counter_field = f_groff;
21159 va_list_fpr_counter_field = f_vroff;
21161 DECL_ARTIFICIAL (f_stack) = 1;
21162 DECL_ARTIFICIAL (f_grtop) = 1;
21163 DECL_ARTIFICIAL (f_vrtop) = 1;
21164 DECL_ARTIFICIAL (f_groff) = 1;
21165 DECL_ARTIFICIAL (f_vroff) = 1;
21167 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
21168 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
21169 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
21170 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
21171 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
21173 TYPE_FIELDS (va_list_type) = f_stack;
21174 DECL_CHAIN (f_stack) = f_grtop;
21175 DECL_CHAIN (f_grtop) = f_vrtop;
21176 DECL_CHAIN (f_vrtop) = f_groff;
21177 DECL_CHAIN (f_groff) = f_vroff;
21179 /* Compute its layout. */
21180 layout_type (va_list_type);
21182 return va_list_type;
21185 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21186 static void
21187 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
21189 const CUMULATIVE_ARGS *cum;
21190 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21191 tree stack, grtop, vrtop, groff, vroff;
21192 tree t;
21193 int gr_save_area_size = cfun->va_list_gpr_size;
21194 int vr_save_area_size = cfun->va_list_fpr_size;
21195 int vr_offset;
21197 cum = &crtl->args.info;
21198 if (cfun->va_list_gpr_size)
21199 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
21200 cfun->va_list_gpr_size);
21201 if (cfun->va_list_fpr_size)
21202 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
21203 * UNITS_PER_VREG, cfun->va_list_fpr_size);
21205 if (!TARGET_FLOAT)
21207 gcc_assert (cum->aapcs_nvrn == 0);
21208 vr_save_area_size = 0;
21211 f_stack = TYPE_FIELDS (va_list_type_node);
21212 f_grtop = DECL_CHAIN (f_stack);
21213 f_vrtop = DECL_CHAIN (f_grtop);
21214 f_groff = DECL_CHAIN (f_vrtop);
21215 f_vroff = DECL_CHAIN (f_groff);
21217 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
21218 NULL_TREE);
21219 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
21220 NULL_TREE);
21221 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
21222 NULL_TREE);
21223 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
21224 NULL_TREE);
21225 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
21226 NULL_TREE);
21228 /* Emit code to initialize STACK, which points to the next varargs stack
21229 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21230 by named arguments. STACK is 8-byte aligned. */
21231 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
21232 if (cum->aapcs_stack_size > 0)
21233 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
21234 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
21235 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21237 /* Emit code to initialize GRTOP, the top of the GR save area.
21238 virtual_incoming_args_rtx should have been 16 byte aligned. */
21239 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
21240 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
21241 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21243 /* Emit code to initialize VRTOP, the top of the VR save area.
21244 This address is gr_save_area_bytes below GRTOP, rounded
21245 down to the next 16-byte boundary. */
21246 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
21247 vr_offset = ROUND_UP (gr_save_area_size,
21248 STACK_BOUNDARY / BITS_PER_UNIT);
21250 if (vr_offset)
21251 t = fold_build_pointer_plus_hwi (t, -vr_offset);
21252 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
21253 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21255 /* Emit code to initialize GROFF, the offset from GRTOP of the
21256 next GPR argument. */
21257 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
21258 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
21259 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21261 /* Likewise emit code to initialize VROFF, the offset from VRTOP
21262 of the next VR argument. */
21263 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
21264 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
21265 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
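/* Worked example (assuming the usual eight 64-bit GP argument registers,
   eight 128-bit FP/SIMD argument registers, and no va_list size limits):
   for 'void f (int fixed, ...)' one GP register is consumed by the named
   argument, so __gr_offs is initialized to -(8 - 1) * 8 == -56 and
   __vr_offs to -8 * 16 == -128, while __gr_top is the incoming-argument
   pointer and __vr_top sits ROUND_UP (56, 16) == 64 bytes below it.  */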
21268 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21270 static tree
21271 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
21272 gimple_seq *post_p ATTRIBUTE_UNUSED)
21274 tree addr;
21275 bool indirect_p;
21276 bool is_ha; /* is HFA or HVA. */
21277 bool dw_align; /* double-word align. */
21278 machine_mode ag_mode = VOIDmode;
21279 int nregs;
21280 machine_mode mode;
21282 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21283 tree stack, f_top, f_off, off, arg, roundup, on_stack;
21284 HOST_WIDE_INT size, rsize, adjust, align;
21285 tree t, u, cond1, cond2;
21287 indirect_p = pass_va_arg_by_reference (type);
21288 if (indirect_p)
21289 type = build_pointer_type (type);
21291 mode = TYPE_MODE (type);
21293 f_stack = TYPE_FIELDS (va_list_type_node);
21294 f_grtop = DECL_CHAIN (f_stack);
21295 f_vrtop = DECL_CHAIN (f_grtop);
21296 f_groff = DECL_CHAIN (f_vrtop);
21297 f_vroff = DECL_CHAIN (f_groff);
21299 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
21300 f_stack, NULL_TREE);
21301 size = int_size_in_bytes (type);
21303 unsigned int abi_break_gcc_9;
21304 unsigned int abi_break_gcc_13;
21305 unsigned int abi_break_gcc_14;
21306 align
21307 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
21308 &abi_break_gcc_13, &abi_break_gcc_14)
21309 / BITS_PER_UNIT;
21311 dw_align = false;
21312 adjust = 0;
21313 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
21314 &is_ha, false))
21316 /* No frontends can create types with variable-sized modes, so we
21317 shouldn't be asked to pass or return them. */
21318 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
21320 /* TYPE passed in fp/simd registers. */
21321 if (!TARGET_FLOAT)
21322 aarch64_err_no_fpadvsimd (mode);
21324 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
21325 unshare_expr (valist), f_vrtop, NULL_TREE);
21326 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
21327 unshare_expr (valist), f_vroff, NULL_TREE);
21329 rsize = nregs * UNITS_PER_VREG;
21331 if (is_ha)
21333 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
21334 adjust = UNITS_PER_VREG - ag_size;
21336 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21337 && size < UNITS_PER_VREG)
21339 adjust = UNITS_PER_VREG - size;
21342 else
21344 /* TYPE passed in general registers. */
21345 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
21346 unshare_expr (valist), f_grtop, NULL_TREE);
21347 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
21348 unshare_expr (valist), f_groff, NULL_TREE);
21349 rsize = ROUND_UP (size, UNITS_PER_WORD);
21350 nregs = rsize / UNITS_PER_WORD;
21352 if (align <= 8
21353 && abi_break_gcc_13
21354 && warn_psabi
21355 && !bitint_or_aggr_of_bitint_p (type))
21356 inform (input_location, "parameter passing for argument of type "
21357 "%qT changed in GCC 13.1", type);
21359 if (warn_psabi
21360 && abi_break_gcc_14
21361 && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8)
21362 && !bitint_or_aggr_of_bitint_p (type))
21363 inform (input_location, "parameter passing for argument of type "
21364 "%qT changed in GCC 14.1", type);
21366 if (align > 8)
21368 if (abi_break_gcc_9
21369 && warn_psabi
21370 && !bitint_or_aggr_of_bitint_p (type))
21371 inform (input_location, "parameter passing for argument of type "
21372 "%qT changed in GCC 9.1", type);
21373 dw_align = true;
21376 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21377 && size < UNITS_PER_WORD)
21379 adjust = UNITS_PER_WORD - size;
21383 /* Get a local temporary for the field value. */
21384 off = get_initialized_tmp_var (f_off, pre_p, NULL);
21386 /* Emit code to branch if off >= 0. */
21387 t = build2 (GE_EXPR, boolean_type_node, off,
21388 build_int_cst (TREE_TYPE (off), 0));
21389 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
21391 if (dw_align)
21393 /* Emit: offs = (offs + 15) & -16. */
21394 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21395 build_int_cst (TREE_TYPE (off), 15));
21396 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
21397 build_int_cst (TREE_TYPE (off), -16));
21398 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
21400 else
21401 roundup = NULL;
21403 /* Update ap.__[g|v]r_offs */
21404 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21405 build_int_cst (TREE_TYPE (off), rsize));
21406 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
21408 /* String up. */
21409 if (roundup)
21410 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21412 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21413 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
21414 build_int_cst (TREE_TYPE (f_off), 0));
21415 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
21417 /* String up: make sure the assignment happens before the use. */
21418 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
21419 COND_EXPR_ELSE (cond1) = t;
21421 /* Prepare the trees handling the argument that is passed on the stack;
21422 the top level node will store in ON_STACK. */
21423 arg = get_initialized_tmp_var (stack, pre_p, NULL);
21424 if (align > 8)
21426 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21427 t = fold_build_pointer_plus_hwi (arg, 15);
21428 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21429 build_int_cst (TREE_TYPE (t), -16));
21430 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
21432 else
21433 roundup = NULL;
21434 /* Advance ap.__stack */
21435 t = fold_build_pointer_plus_hwi (arg, size + 7);
21436 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21437 build_int_cst (TREE_TYPE (t), -8));
21438 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
21439 /* String up roundup and advance. */
21440 if (roundup)
21441 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21442 /* String up with arg */
21443 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
21444 /* Big-endianness related address adjustment. */
21445 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21446 && size < UNITS_PER_WORD)
21448 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
21449 size_int (UNITS_PER_WORD - size));
21450 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
21453 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
21454 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
21456 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21457 t = off;
21458 if (adjust)
21459 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
21460 build_int_cst (TREE_TYPE (off), adjust));
21462 t = fold_convert (sizetype, t);
21463 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
21465 if (is_ha)
21467 /* type ha; // treat as "struct {ftype field[n];}"
21468 ... [computing offs]
21469 for (i = 0; i < nregs; ++i, offs += 16)
21470 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21471 return ha; */
21472 int i;
21473 tree tmp_ha, field_t, field_ptr_t;
21475 /* Declare a local variable. */
21476 tmp_ha = create_tmp_var_raw (type, "ha");
21477 gimple_add_tmp_var (tmp_ha);
21479 /* Establish the base type. */
21480 switch (ag_mode)
21482 case E_SFmode:
21483 field_t = float_type_node;
21484 field_ptr_t = float_ptr_type_node;
21485 break;
21486 case E_DFmode:
21487 field_t = double_type_node;
21488 field_ptr_t = double_ptr_type_node;
21489 break;
21490 case E_TFmode:
21491 field_t = long_double_type_node;
21492 field_ptr_t = long_double_ptr_type_node;
21493 break;
21494 case E_SDmode:
21495 field_t = dfloat32_type_node;
21496 field_ptr_t = build_pointer_type (dfloat32_type_node);
21497 break;
21498 case E_DDmode:
21499 field_t = dfloat64_type_node;
21500 field_ptr_t = build_pointer_type (dfloat64_type_node);
21501 break;
21502 case E_TDmode:
21503 field_t = dfloat128_type_node;
21504 field_ptr_t = build_pointer_type (dfloat128_type_node);
21505 break;
21506 case E_HFmode:
21507 field_t = aarch64_fp16_type_node;
21508 field_ptr_t = aarch64_fp16_ptr_type_node;
21509 break;
21510 case E_BFmode:
21511 field_t = bfloat16_type_node;
21512 field_ptr_t = aarch64_bf16_ptr_type_node;
21513 break;
21514 case E_V2SImode:
21515 case E_V4SImode:
21517 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
21518 field_t = build_vector_type_for_mode (innertype, ag_mode);
21519 field_ptr_t = build_pointer_type (field_t);
21521 break;
21522 default:
21523 gcc_assert (0);
21526 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
21527 TREE_ADDRESSABLE (tmp_ha) = 1;
21528 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
21529 addr = t;
21530 t = fold_convert (field_ptr_t, addr);
21531 t = build2 (MODIFY_EXPR, field_t,
21532 build1 (INDIRECT_REF, field_t, tmp_ha),
21533 build1 (INDIRECT_REF, field_t, t));
21535 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21536 for (i = 1; i < nregs; ++i)
21538 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
21539 u = fold_convert (field_ptr_t, addr);
21540 u = build2 (MODIFY_EXPR, field_t,
21541 build2 (MEM_REF, field_t, tmp_ha,
21542 build_int_cst (field_ptr_t,
21543 (i *
21544 int_size_in_bytes (field_t)))),
21545 build1 (INDIRECT_REF, field_t, u));
21546 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
21549 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
21550 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
21553 COND_EXPR_ELSE (cond2) = t;
21554 addr = fold_convert (build_pointer_type (type), cond1);
21555 addr = build_va_arg_indirect_ref (addr);
21557 if (indirect_p)
21558 addr = build_va_arg_indirect_ref (addr);
21560 return addr;
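/* Rough sketch (added note) of the tree built above for a GPR-class argument;
   the FP/SIMD case is analogous with __vr_top/__vr_offs, and the align > 8
   rounding of OFF and the HFA copy loop are omitted:

       off = ap.__gr_offs;
       if (off >= 0)
         goto on_stack;
       ap.__gr_offs = off + rsize;          // rsize = size rounded up to 8
       if (ap.__gr_offs > 0)
         goto on_stack;
       addr = ap.__gr_top + off;            // plus any big-endian adjustment
       goto done;
     on_stack:
       addr = ap.__stack;                   // 16-byte realigned if align > 8
       ap.__stack = (ap.__stack + size + 7) & -8;
     done:
       result = *(type *) addr;  */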
21563 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21565 static void
21566 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
21567 const function_arg_info &arg,
21568 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
21570 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
21571 CUMULATIVE_ARGS local_cum;
21572 int gr_saved = cfun->va_list_gpr_size;
21573 int vr_saved = cfun->va_list_fpr_size;
21575 /* The caller has advanced CUM up to, but not beyond, the last named
21576 argument. Advance a local copy of CUM past the last "real" named
21577 argument, to find out how many registers are left over. */
21578 local_cum = *cum;
21579 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
21580 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
21582 /* Find out how many registers we need to save.
21583 Honor the tree-stdarg analysis results. */
21584 if (cfun->va_list_gpr_size)
21585 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
21586 cfun->va_list_gpr_size / UNITS_PER_WORD);
21587 if (cfun->va_list_fpr_size)
21588 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
21589 cfun->va_list_fpr_size / UNITS_PER_VREG);
21591 if (!TARGET_FLOAT)
21593 gcc_assert (local_cum.aapcs_nvrn == 0);
21594 vr_saved = 0;
21597 if (!no_rtl)
21599 if (gr_saved > 0)
21601 rtx ptr, mem;
21603 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21604 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
21605 - gr_saved * UNITS_PER_WORD);
21606 mem = gen_frame_mem (BLKmode, ptr);
21607 set_mem_alias_set (mem, get_varargs_alias_set ());
21609 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
21610 mem, gr_saved);
21612 if (vr_saved > 0)
21614 /* We can't use move_block_from_reg, because it will use
21615 the wrong mode, storing D regs only. */
21616 machine_mode mode = TImode;
21617 int off, i, vr_start;
21619 /* Set OFF to the offset from virtual_incoming_args_rtx of
21620 the first vector register. The VR save area lies below
21621 the GR one, and is aligned to 16 bytes. */
21622 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
21623 STACK_BOUNDARY / BITS_PER_UNIT);
21624 off -= vr_saved * UNITS_PER_VREG;
21626 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
21627 for (i = 0; i < vr_saved; ++i)
21629 rtx ptr, mem;
21631 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
21632 mem = gen_frame_mem (mode, ptr);
21633 set_mem_alias_set (mem, get_varargs_alias_set ());
21634 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
21635 off += UNITS_PER_VREG;
21640 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21641 any complication of having crtl->args.pretend_args_size changed. */
21642 cfun->machine->frame.saved_varargs_size
21643 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
21644 STACK_BOUNDARY / BITS_PER_UNIT)
21645 + vr_saved * UNITS_PER_VREG);
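/* Illustrative layout (added note): with gr_saved == 3 and vr_saved == 2,
   and assuming five GPRs and six VRs were used by named arguments, the save
   area built above sits immediately below virtual_incoming_args_rtx:

       bytes -24 ..  -1   x5-x7, stored by move_block_from_reg
       bytes -64 .. -33   q6-q7, stored as TImode (area 16-byte aligned)

   and saved_varargs_size is ROUND_UP (24, 16) + 2 * 16 == 64 bytes.  */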
21648 static void
21649 aarch64_conditional_register_usage (void)
21651 int i;
21652 if (!TARGET_FLOAT)
21654 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
21656 fixed_regs[i] = 1;
21657 call_used_regs[i] = 1;
21658 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21661 if (!TARGET_SVE)
21662 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
21664 fixed_regs[i] = 1;
21665 call_used_regs[i] = 1;
21668 /* Only allow these registers to be accessed via special patterns. */
21669 CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
21670 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
21671 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
21672 for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
21673 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21675 /* When tracking speculation, we need a couple of call-clobbered registers
21676 to track the speculation state. It would be nice to just use
21677 IP0 and IP1, but currently there are numerous places that just
21678 assume these registers are free for other uses (eg pointer
21679 authentication). */
21680 if (aarch64_track_speculation)
21682 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
21683 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
21684 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21685 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21689 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21691 bool
21692 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
21694 /* For records we're passed a FIELD_DECL, for arrays we're passed
21695 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21696 const_tree type = TREE_TYPE (field_or_array);
21698 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21699 For structures, the "multiple" case is indicated by MODE being
21700 VOIDmode. */
21701 unsigned int num_zr, num_pr;
21702 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
21704 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
21705 return !simple_cst_equal (TYPE_SIZE (field_or_array),
21706 TYPE_SIZE (type));
21707 return mode == VOIDmode;
21710 return default_member_type_forces_blk (field_or_array, mode);
21713 /* Bitmasks that indicate whether earlier versions of GCC would have
21714 taken a different path through the ABI logic. This should result in
21715 a -Wpsabi warning if the earlier path led to a different ABI decision.
21717 WARN_PSABI_EMPTY_CXX17_BASE
21718 Indicates that the type includes an artificial empty C++17 base field
21719 that, prior to GCC 10.1, would prevent the type from being treated as
21720 a HFA or HVA. See PR94383 for details.
21722 WARN_PSABI_NO_UNIQUE_ADDRESS
21723 Indicates that the type includes an empty [[no_unique_address]] field
21724 that, prior to GCC 10.1, would prevent the type from being treated as
21725 a HFA or HVA. */
21726 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
21727 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
21728 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
21730 /* Walk down the type tree of TYPE counting consecutive base elements.
21731 If *MODEP is VOIDmode, then set it to the first valid floating point
21732 type. If a non-floating point type is found, or if a floating point
21733 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21734 otherwise return the count in the sub-tree.
21736 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21737 function has changed its behavior relative to earlier versions of GCC.
21738 Normally the argument should be nonnull and point to a zero-initialized
21739 variable. The function then records whether the ABI decision might
21740 be affected by a known fix to the ABI logic, setting the associated
21741 WARN_PSABI_* bits if so.
21743 When the argument is instead a null pointer, the function tries to
21744 simulate the behavior of GCC before all such ABI fixes were made.
21745 This is useful to check whether the function returns something
21746 different after the ABI fixes. */
21747 static int
21748 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
21749 unsigned int *warn_psabi_flags)
21751 machine_mode mode;
21752 HOST_WIDE_INT size;
21754 if (aarch64_sve::builtin_type_p (type))
21755 return -1;
21757 switch (TREE_CODE (type))
21759 case REAL_TYPE:
21760 mode = TYPE_MODE (type);
21761 if (mode != DFmode && mode != SFmode
21762 && mode != TFmode && mode != HFmode
21763 && mode != SDmode && mode != DDmode && mode != TDmode)
21764 return -1;
21766 if (*modep == VOIDmode)
21767 *modep = mode;
21769 if (*modep == mode)
21770 return 1;
21772 break;
21774 case COMPLEX_TYPE:
21775 mode = TYPE_MODE (TREE_TYPE (type));
21776 if (mode != DFmode && mode != SFmode
21777 && mode != TFmode && mode != HFmode)
21778 return -1;
21780 if (*modep == VOIDmode)
21781 *modep = mode;
21783 if (*modep == mode)
21784 return 2;
21786 break;
21788 case VECTOR_TYPE:
21789 /* Use V2SImode and V4SImode as representatives of all 64-bit
21790 and 128-bit vector types. */
21791 size = int_size_in_bytes (type);
21792 switch (size)
21794 case 8:
21795 mode = V2SImode;
21796 break;
21797 case 16:
21798 mode = V4SImode;
21799 break;
21800 default:
21801 return -1;
21804 if (*modep == VOIDmode)
21805 *modep = mode;
21807 /* Vector modes are considered to be opaque: two vectors are
21808 equivalent for the purposes of being homogeneous aggregates
21809 if they are the same size. */
21810 if (*modep == mode)
21811 return 1;
21813 break;
21815 case ARRAY_TYPE:
21817 int count;
21818 tree index = TYPE_DOMAIN (type);
21820 /* Can't handle incomplete types nor sizes that are not
21821 fixed. */
21822 if (!COMPLETE_TYPE_P (type)
21823 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21824 return -1;
21826 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
21827 warn_psabi_flags);
21828 if (count == -1
21829 || !index
21830 || !TYPE_MAX_VALUE (index)
21831 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
21832 || !TYPE_MIN_VALUE (index)
21833 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
21834 || count < 0)
21835 return -1;
21837 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
21838 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
21840 /* There must be no padding. */
21841 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21842 count * GET_MODE_BITSIZE (*modep)))
21843 return -1;
21845 return count;
21848 case RECORD_TYPE:
21850 int count = 0;
21851 int sub_count;
21852 tree field;
21854 /* Can't handle incomplete types nor sizes that are not
21855 fixed. */
21856 if (!COMPLETE_TYPE_P (type)
21857 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21858 return -1;
21860 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21862 if (TREE_CODE (field) != FIELD_DECL)
21863 continue;
21865 if (DECL_FIELD_ABI_IGNORED (field))
21867 /* See whether this is something that earlier versions of
21868 GCC failed to ignore. */
21869 unsigned int flag;
21870 if (lookup_attribute ("no_unique_address",
21871 DECL_ATTRIBUTES (field)))
21872 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
21873 else if (cxx17_empty_base_field_p (field))
21874 flag = WARN_PSABI_EMPTY_CXX17_BASE;
21875 else
21876 /* No compatibility problem. */
21877 continue;
21879 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
21880 if (warn_psabi_flags)
21882 *warn_psabi_flags |= flag;
21883 continue;
21886 /* A zero-width bitfield may affect layout in some
21887 circumstances, but adds no members. The determination
21888 of whether or not a type is an HFA is performed after
21889 layout is complete, so if the type still looks like an
21890 HFA afterwards, it is still classed as one. This is
21891 potentially an ABI break for the hard-float ABI. */
21892 else if (DECL_BIT_FIELD (field)
21893 && integer_zerop (DECL_SIZE (field)))
21895 /* Prior to GCC-12 these fields were stripped early,
21896 hiding them from the back-end entirely and
21897 resulting in the correct behaviour for argument
21898 passing. Simulate that old behaviour without
21899 generating a warning. */
21900 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
21901 continue;
21902 if (warn_psabi_flags)
21904 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
21905 continue;
21909 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21910 warn_psabi_flags);
21911 if (sub_count < 0)
21912 return -1;
21913 count += sub_count;
21916 /* There must be no padding. */
21917 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21918 count * GET_MODE_BITSIZE (*modep)))
21919 return -1;
21921 return count;
21924 case UNION_TYPE:
21925 case QUAL_UNION_TYPE:
21927 /* These aren't very interesting except in a degenerate case. */
21928 int count = 0;
21929 int sub_count;
21930 tree field;
21932 /* Can't handle incomplete types nor sizes that are not
21933 fixed. */
21934 if (!COMPLETE_TYPE_P (type)
21935 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21936 return -1;
21938 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21940 if (TREE_CODE (field) != FIELD_DECL)
21941 continue;
21943 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21944 warn_psabi_flags);
21945 if (sub_count < 0)
21946 return -1;
21947 count = count > sub_count ? count : sub_count;
21950 /* There must be no padding. */
21951 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21952 count * GET_MODE_BITSIZE (*modep)))
21953 return -1;
21955 return count;
21958 default:
21959 break;
21962 return -1;
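/* Illustrative examples (added note, not exhaustive):

     struct { float x, y, z; }        -> 3, *modep == SFmode
     double[2]                        -> 2, *modep == DFmode
     _Complex double                  -> 2, *modep == DFmode
     struct { float f; double d; }    -> -1 (mixed base types)
     struct { float f; int i; }       -> -1 (non-floating-point member)

   The caller (aarch64_vfp_is_call_or_return_candidate) additionally limits
   HFAs/HVAs to HA_MAX_NUM_FLDS members.  */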
21965 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
21966 type as described in AAPCS64 \S 4.1.2.
21968 See the comment above aarch64_composite_type_p for the notes on MODE. */
21970 static bool
21971 aarch64_short_vector_p (const_tree type,
21972 machine_mode mode)
21974 poly_int64 size = -1;
21976 if (type && VECTOR_TYPE_P (type))
21978 if (aarch64_sve::builtin_type_p (type))
21979 return false;
21980 size = int_size_in_bytes (type);
21982 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
21983 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
21985 /* The containing "else if" is too loose: it means that we look at TYPE
21986 if the type is a vector type (good), but that we otherwise ignore TYPE
21987 and look only at the mode. This is wrong because the type describes
21988 the language-level information whereas the mode is purely an internal
21989 GCC concept. We can therefore reach here for types that are not
21990 vectors in the AAPCS64 sense.
21992 We can't "fix" that for the traditional Advanced SIMD vector modes
21993 without breaking backwards compatibility. However, there's no such
21994 baggage for the structure modes, which were introduced in GCC 12. */
21995 if (aarch64_advsimd_struct_mode_p (mode))
21996 return false;
21998 /* For similar reasons, rely only on the type, not the mode, when
21999 processing SVE types. */
22000 if (type && aarch64_some_values_include_pst_objects_p (type))
22001 /* Leave later code to report an error if SVE is disabled. */
22002 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
22003 else
22004 size = GET_MODE_SIZE (mode);
22006 if (known_eq (size, 8) || known_eq (size, 16))
22008 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
22009 they are being treated as scalable AAPCS64 types. */
22010 gcc_assert (!aarch64_sve_mode_p (mode)
22011 && !aarch64_advsimd_struct_mode_p (mode));
22012 return true;
22014 return false;
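/* Illustrative examples (added note): the 8-byte and 16-byte arm_neon.h
   vector types such as int32x2_t and float32x4_t are short vectors in the
   AAPCS64 sense, whereas SVE types (svint32_t, ...) and the Advanced SIMD
   tuple modes used for register pairs/triples/quads are not.  */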
22017 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
22018 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
22019 array types. The C99 floating-point complex types are also considered
22020 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
22021 types, which are GCC extensions and out of the scope of AAPCS64, are
22022 treated as composite types here as well.
22024 Note that MODE itself is not sufficient in determining whether a type
22025 is such a composite type or not. This is because
22026 stor-layout.cc:compute_record_mode may have already changed the MODE
22027 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
22028 structure with only one field may have its MODE set to the mode of the
22029 field. Also an integer mode whose size matches the size of the
22030 RECORD_TYPE type may be used to substitute the original mode
22031 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
22032 solely relied on. */
22034 static bool
22035 aarch64_composite_type_p (const_tree type,
22036 machine_mode mode)
22038 if (aarch64_short_vector_p (type, mode))
22039 return false;
22041 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
22042 return true;
22044 if (type
22045 && TREE_CODE (type) == BITINT_TYPE
22046 && int_size_in_bytes (type) > 16)
22047 return true;
22049 if (mode == BLKmode
22050 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
22051 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
22052 return true;
22054 return false;
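/* Illustrative examples (added note): aggregates (structs, unions, arrays),
   _Complex float/double, complex integer types and _BitInt types wider than
   16 bytes are composite; a plain double or a short vector such as
   int32x4_t is not.  */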
22057 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22058 shall be passed or returned in simd/fp register(s) (providing these
22059 parameter passing registers are available).
22061 Upon successful return, *COUNT returns the number of needed registers,
22062 *BASE_MODE returns the mode of the individual register and when IS_HA
22063 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22064 floating-point aggregate or a homogeneous short-vector aggregate.
22066 SILENT_P is true if the function should refrain from reporting any
22067 diagnostics. This should only be used if the caller is certain that
22068 any ABI decisions would eventually come through this function with
22069 SILENT_P set to false. */
22071 static bool
22072 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
22073 const_tree type,
22074 machine_mode *base_mode,
22075 int *count,
22076 bool *is_ha,
22077 bool silent_p)
22079 if (is_ha != NULL) *is_ha = false;
22081 machine_mode new_mode = VOIDmode;
22082 bool composite_p = aarch64_composite_type_p (type, mode);
22084 if ((!composite_p
22085 && (GET_MODE_CLASS (mode) == MODE_FLOAT
22086 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
22087 || aarch64_short_vector_p (type, mode))
22089 *count = 1;
22090 new_mode = mode;
22092 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
22094 if (is_ha != NULL) *is_ha = true;
22095 *count = 2;
22096 new_mode = GET_MODE_INNER (mode);
22098 else if (type && composite_p)
22100 unsigned int warn_psabi_flags = 0;
22101 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
22102 &warn_psabi_flags);
22103 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
22105 static unsigned last_reported_type_uid;
22106 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
22107 int alt;
22108 if (!silent_p
22109 && warn_psabi
22110 && warn_psabi_flags
22111 && uid != last_reported_type_uid
22112 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
22113 != ag_count))
22115 const char *url10
22116 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
22117 const char *url12
22118 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
22119 gcc_assert (alt == -1);
22120 last_reported_type_uid = uid;
22121 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22122 qualification. */
22123 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
22124 inform (input_location, "parameter passing for argument of "
22125 "type %qT with %<[[no_unique_address]]%> members "
22126 "changed %{in GCC 10.1%}",
22127 TYPE_MAIN_VARIANT (type), url10);
22128 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
22129 inform (input_location, "parameter passing for argument of "
22130 "type %qT when C++17 is enabled changed to match "
22131 "C++14 %{in GCC 10.1%}",
22132 TYPE_MAIN_VARIANT (type), url10);
22133 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
22134 inform (input_location, "parameter passing for argument of "
22135 "type %qT changed %{in GCC 12.1%}",
22136 TYPE_MAIN_VARIANT (type), url12);
22139 if (is_ha != NULL) *is_ha = true;
22140 *count = ag_count;
22142 else
22143 return false;
22145 else
22146 return false;
22148 gcc_assert (!aarch64_sve_mode_p (new_mode));
22149 *base_mode = new_mode;
22150 return true;
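/* Illustrative examples (added note):

     double                    -> *count 1, *base_mode DFmode, *is_ha false
     _Complex float            -> *count 2, *base_mode SFmode, *is_ha true
     struct { double d[3]; }   -> *count 3, *base_mode DFmode, *is_ha true
     struct { float f[5]; }    -> not a candidate (more than HA_MAX_NUM_FLDS
                                  members)  */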
22153 /* Implement TARGET_STRUCT_VALUE_RTX. */
22155 static rtx
22156 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
22157 int incoming ATTRIBUTE_UNUSED)
22159 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
22162 /* Implements target hook vector_mode_supported_p. */
22163 static bool
22164 aarch64_vector_mode_supported_p (machine_mode mode)
22166 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22167 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22170 /* Implements target hook vector_mode_supported_any_target_p. */
22171 static bool
22172 aarch64_vector_mode_supported_any_target_p (machine_mode mode)
22174 unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
22175 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22178 /* Return the full-width SVE vector mode for element mode MODE, if one
22179 exists. */
22180 opt_machine_mode
22181 aarch64_full_sve_mode (scalar_mode mode)
22183 switch (mode)
22185 case E_DFmode:
22186 return VNx2DFmode;
22187 case E_SFmode:
22188 return VNx4SFmode;
22189 case E_HFmode:
22190 return VNx8HFmode;
22191 case E_BFmode:
22192 return VNx8BFmode;
22193 case E_DImode:
22194 return VNx2DImode;
22195 case E_SImode:
22196 return VNx4SImode;
22197 case E_HImode:
22198 return VNx8HImode;
22199 case E_QImode:
22200 return VNx16QImode;
22201 default:
22202 return opt_machine_mode ();
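/* For example (added note): aarch64_full_sve_mode (SFmode).require () is
   VNx4SFmode and aarch64_full_sve_mode (QImode).require () is VNx16QImode;
   element modes with no SVE container yield an empty opt_machine_mode.  */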
22206 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
22207 if it exists. */
22208 opt_machine_mode
22209 aarch64_vq_mode (scalar_mode mode)
22211 switch (mode)
22213 case E_DFmode:
22214 return V2DFmode;
22215 case E_SFmode:
22216 return V4SFmode;
22217 case E_HFmode:
22218 return V8HFmode;
22219 case E_BFmode:
22220 return V8BFmode;
22221 case E_SImode:
22222 return V4SImode;
22223 case E_HImode:
22224 return V8HImode;
22225 case E_QImode:
22226 return V16QImode;
22227 case E_DImode:
22228 return V2DImode;
22229 default:
22230 return opt_machine_mode ();
22234 /* Return appropriate SIMD container
22235 for MODE within a vector of WIDTH bits. */
22236 static machine_mode
22237 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
22239 if (TARGET_SVE
22240 && maybe_ne (width, 128)
22241 && known_eq (width, BITS_PER_SVE_VECTOR))
22242 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22244 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
22245 if (TARGET_BASE_SIMD)
22247 if (known_eq (width, 128))
22248 return aarch64_vq_mode (mode).else_mode (word_mode);
22249 else
22250 switch (mode)
22252 case E_SFmode:
22253 return V2SFmode;
22254 case E_HFmode:
22255 return V4HFmode;
22256 case E_BFmode:
22257 return V4BFmode;
22258 case E_SImode:
22259 return V2SImode;
22260 case E_HImode:
22261 return V4HImode;
22262 case E_QImode:
22263 return V8QImode;
22264 default:
22265 break;
22268 return word_mode;
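/* For example (added note): aarch64_simd_container_mode (SImode, 128) is
   V4SImode and aarch64_simd_container_mode (SImode, 64) is V2SImode; with
   SVE enabled and WIDTH equal to BITS_PER_SVE_VECTOR (when that is not 128)
   the result is VNx4SImode.  Anything else falls back to word_mode.  */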
22271 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22272 and return whether the SVE mode should be preferred over the
22273 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22274 static bool
22275 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
22277 /* Take into account the aarch64-autovec-preference param if non-zero. */
22278 bool only_asimd_p = aarch64_autovec_preference == 1;
22279 bool only_sve_p = aarch64_autovec_preference == 2;
22281 if (only_asimd_p)
22282 return false;
22283 if (only_sve_p)
22284 return true;
22286 /* The preference in case of a tie in costs. */
22287 bool prefer_asimd = aarch64_autovec_preference == 3;
22288 bool prefer_sve = aarch64_autovec_preference == 4;
22290 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
22291 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
22292 /* If the CPU information does not have an SVE width registered use the
22293 generic poly_int comparison that prefers SVE. If a preference is
22294 explicitly requested avoid this path. */
22295 if (aarch64_tune_params.sve_width == SVE_SCALABLE
22296 && !prefer_asimd
22297 && !prefer_sve)
22298 return maybe_gt (nunits_sve, nunits_asimd);
22300 /* Otherwise estimate the runtime width of the modes involved. */
22301 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
22302 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
22304 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22305 is clearly wider. */
22306 if (prefer_sve)
22307 return est_sve >= est_asimd;
22308 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22309 is clearly wider. */
22310 if (prefer_asimd)
22311 return est_sve > est_asimd;
22313 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22314 return est_sve > est_asimd;
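/* Summary of the aarch64-autovec-preference values handled above (added
   note restating the code): 0 (the default) estimates and compares the mode
   widths, preferring Advanced SIMD on a tie; 1 uses Advanced SIMD only;
   2 uses SVE only; 3 prefers Advanced SIMD on a tie; 4 prefers SVE on a
   tie.  */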
22317 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22318 static machine_mode
22319 aarch64_preferred_simd_mode (scalar_mode mode)
22321 /* Take into account explicit auto-vectorization ISA preferences through
22322 aarch64_cmp_autovec_modes. */
22323 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
22324 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22325 if (TARGET_SIMD)
22326 return aarch64_vq_mode (mode).else_mode (word_mode);
22327 return word_mode;
22330 /* Return a list of possible vector sizes for the vectorizer
22331 to iterate over. */
22332 static unsigned int
22333 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
22335 static const machine_mode sve_modes[] = {
22336 /* Try using full vectors for all element types. */
22337 VNx16QImode,
22339 /* Try using 16-bit containers for 8-bit elements and full vectors
22340 for wider elements. */
22341 VNx8QImode,
22343 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22344 full vectors for wider elements. */
22345 VNx4QImode,
22347 /* Try using 64-bit containers for all element types. */
22348 VNx2QImode
22351 static const machine_mode advsimd_modes[] = {
22352 /* Try using 128-bit vectors for all element types. */
22353 V16QImode,
22355 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22356 for wider elements. */
22357 V8QImode,
22359 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22360 for wider elements.
22362 TODO: We could support a limited form of V4QImode too, so that
22363 we use 32-bit vectors for 8-bit elements. */
22364 V4HImode,
22366 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22367 for 64-bit elements.
22369 TODO: We could similarly support limited forms of V2QImode and V2HImode
22370 for this case. */
22371 V2SImode
22374 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22375 This is because:
22377 - If we can't use N-byte Advanced SIMD vectors then the placement
22378 doesn't matter; we'll just continue as though the Advanced SIMD
22379 entry didn't exist.
22381 - If an SVE main loop with N bytes ends up being cheaper than an
22382 Advanced SIMD main loop with N bytes then by default we'll replace
22383 the Advanced SIMD version with the SVE one.
22385 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22386 than an SVE main loop with N bytes then by default we'll try to
22387 use the SVE loop to vectorize the epilogue instead. */
22389 bool only_asimd_p = aarch64_autovec_preference == 1;
22390 bool only_sve_p = aarch64_autovec_preference == 2;
22392 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
22393 unsigned int advsimd_i = 0;
22395 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
22397 if (sve_i < ARRAY_SIZE (sve_modes)
22398 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
22399 advsimd_modes[advsimd_i]))
22400 modes->safe_push (sve_modes[sve_i++]);
22401 else
22402 modes->safe_push (advsimd_modes[advsimd_i++]);
22404 while (sve_i < ARRAY_SIZE (sve_modes))
22405 modes->safe_push (sve_modes[sve_i++]);
22407 unsigned int flags = 0;
22408 if (aarch64_vect_compare_costs)
22409 flags |= VECT_COMPARE_COSTS;
22410 return flags;
22413 /* Implement TARGET_MANGLE_TYPE. */
22415 static const char *
22416 aarch64_mangle_type (const_tree type)
22418 /* The AArch64 ABI documents say that "__va_list" has to be
22419 mangled as if it is in the "std" namespace. */
22420 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
22421 return "St9__va_list";
22423 /* Half-precision floating point types. */
22424 if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
22426 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
22427 return NULL;
22428 if (TYPE_MODE (type) == BFmode)
22429 return "u6__bf16";
22430 else
22431 return "Dh";
22434 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22435 builtin types. */
22436 if (TYPE_NAME (type) != NULL)
22438 const char *res;
22439 if ((res = aarch64_general_mangle_builtin_type (type))
22440 || (res = aarch64_sve::mangle_builtin_type (type)))
22441 return res;
22444 /* Use the default mangling. */
22445 return NULL;
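/* Illustrative manglings (added note, following the rules above):
   __builtin_va_list mangles as "St9__va_list", __fp16 as "Dh" and __bf16 as
   "u6__bf16", so e.g. "void f (__fp16);" becomes _Z1fDh.  */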
22448 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22450 static bool
22451 aarch64_verify_type_context (location_t loc, type_context_kind context,
22452 const_tree type, bool silent_p)
22454 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
22457 /* Find the first rtx_insn before insn that will generate an assembly
22458 instruction. */
22460 static rtx_insn *
22461 aarch64_prev_real_insn (rtx_insn *insn)
22463 if (!insn)
22464 return NULL;
22466 do
22468 insn = prev_real_insn (insn);
22470 while (insn && recog_memoized (insn) < 0);
22472 return insn;
22475 static bool
22476 is_madd_op (enum attr_type t1)
22478 unsigned int i;
22479 /* A number of these may be AArch32 only. */
22480 enum attr_type mlatypes[] = {
22481 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
22482 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
22483 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
22486 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
22488 if (t1 == mlatypes[i])
22489 return true;
22492 return false;
22495 /* Check if there is a register dependency between a load and the insn
22496 for which we hold recog_data. */
22498 static bool
22499 dep_between_memop_and_curr (rtx memop)
22501 rtx load_reg;
22502 int opno;
22504 gcc_assert (GET_CODE (memop) == SET);
22506 if (!REG_P (SET_DEST (memop)))
22507 return false;
22509 load_reg = SET_DEST (memop);
22510 for (opno = 1; opno < recog_data.n_operands; opno++)
22512 rtx operand = recog_data.operand[opno];
22513 if (REG_P (operand)
22514 && reg_overlap_mentioned_p (load_reg, operand))
22515 return true;
22518 return false;
22522 /* When working around the Cortex-A53 erratum 835769,
22523 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22524 instruction and has a preceding memory instruction such that a NOP
22525 should be inserted between them. */
22527 bool
22528 aarch64_madd_needs_nop (rtx_insn* insn)
22530 enum attr_type attr_type;
22531 rtx_insn *prev;
22532 rtx body;
22534 if (!TARGET_FIX_ERR_A53_835769)
22535 return false;
22537 if (!INSN_P (insn) || recog_memoized (insn) < 0)
22538 return false;
22540 attr_type = get_attr_type (insn);
22541 if (!is_madd_op (attr_type))
22542 return false;
22544 prev = aarch64_prev_real_insn (insn);
22545 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22546 Restore recog state to INSN to avoid state corruption. */
22547 extract_constrain_insn_cached (insn);
22549 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
22550 return false;
22552 body = single_set (prev);
22554 /* If the previous insn is a memory op and there is no dependency between
22555 it and the DImode madd, emit a NOP between them. If body is NULL then we
22556 have a complex memory operation, probably a load/store pair.
22557 Be conservative for now and emit a NOP. */
22558 if (GET_MODE (recog_data.operand[0]) == DImode
22559 && (!body || !dep_between_memop_and_curr (body)))
22560 return true;
22562 return false;
22567 /* Implement FINAL_PRESCAN_INSN. */
22569 void
22570 aarch64_final_prescan_insn (rtx_insn *insn)
22572 if (aarch64_madd_needs_nop (insn))
22573 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
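/* Illustrative output (added note): with -mfix-cortex-a53-835769, a 64-bit
   multiply-accumulate that directly follows a memory access and has no
   register dependency on it is separated from it, e.g.

       ldr     x1, [x2]
       nop     // between mem op and mult-accumulate
       madd    x0, x3, x4, x5
*/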
22577 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22578 instruction. */
22580 bool
22581 aarch64_sve_index_immediate_p (rtx base_or_step)
22583 return (CONST_INT_P (base_or_step)
22584 && IN_RANGE (INTVAL (base_or_step), -16, 15));
22587 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22588 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22590 bool
22591 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
22593 rtx elt = unwrap_const_vec_duplicate (x);
22594 if (!CONST_INT_P (elt))
22595 return false;
22597 HOST_WIDE_INT val = INTVAL (elt);
22598 if (negate_p)
22599 val = -val;
22600 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
22602 if (val & 0xff)
22603 return IN_RANGE (val, 0, 0xff);
22604 return IN_RANGE (val, 0, 0xff00);
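/* For example (added note): after masking to the element size, 0-255 and
   multiples of 256 up to 65280 are accepted, so "ADD z0.s, z0.s, #255" and
   "#7680" (30 << 8) are representable while "#257" is not and must be
   synthesised some other way.  */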
22607 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22608 instructions when applied to mode MODE. Negate X first if NEGATE_P
22609 is true. */
22611 bool
22612 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
22614 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
22615 return false;
22617 /* After the optional negation, the immediate must be nonnegative.
22618 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22619 instead of SQADD Zn.B, Zn.B, #129. */
22620 rtx elt = unwrap_const_vec_duplicate (x);
22621 return negate_p == (INTVAL (elt) < 0);
22624 /* Return true if X is a valid immediate operand for an SVE logical
22625 instruction such as AND. */
22627 bool
22628 aarch64_sve_bitmask_immediate_p (rtx x)
22630 rtx elt;
22632 return (const_vec_duplicate_p (x, &elt)
22633 && CONST_INT_P (elt)
22634 && aarch64_bitmask_imm (INTVAL (elt),
22635 GET_MODE_INNER (GET_MODE (x))));
22638 /* Return true if X is a valid immediate for the SVE DUP and CPY
22639 instructions. */
22641 bool
22642 aarch64_sve_dup_immediate_p (rtx x)
22644 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
22645 if (!CONST_INT_P (x))
22646 return false;
22648 HOST_WIDE_INT val = INTVAL (x);
22649 if (val & 0xff)
22650 return IN_RANGE (val, -0x80, 0x7f);
22651 return IN_RANGE (val, -0x8000, 0x7f00);
22654 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22655 SIGNED_P says whether the operand is signed rather than unsigned. */
22657 bool
22658 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
22660 x = unwrap_const_vec_duplicate (x);
22661 return (CONST_INT_P (x)
22662 && (signed_p
22663 ? IN_RANGE (INTVAL (x), -16, 15)
22664 : IN_RANGE (INTVAL (x), 0, 127)));
22667 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22668 instruction. Negate X first if NEGATE_P is true. */
22670 bool
22671 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
22673 rtx elt;
22674 REAL_VALUE_TYPE r;
22676 if (!const_vec_duplicate_p (x, &elt)
22677 || !CONST_DOUBLE_P (elt))
22678 return false;
22680 r = *CONST_DOUBLE_REAL_VALUE (elt);
22682 if (negate_p)
22683 r = real_value_negate (&r);
22685 if (real_equal (&r, &dconst1))
22686 return true;
22687 if (real_equal (&r, &dconsthalf))
22688 return true;
22689 return false;
22692 /* Return true if X is a valid immediate operand for an SVE FMUL
22693 instruction. */
22695 bool
22696 aarch64_sve_float_mul_immediate_p (rtx x)
22698 rtx elt;
22700 return (const_vec_duplicate_p (x, &elt)
22701 && CONST_DOUBLE_P (elt)
22702 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
22703 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
22706 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22707 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22708 is nonnull, use it to describe valid immediates. */
22709 static bool
22710 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
22711 simd_immediate_info *info,
22712 enum simd_immediate_check which,
22713 simd_immediate_info::insn_type insn)
22715 /* Try a 4-byte immediate with LSL. */
22716 for (unsigned int shift = 0; shift < 32; shift += 8)
22717 if ((val32 & (0xff << shift)) == val32)
22719 if (info)
22720 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22721 simd_immediate_info::LSL, shift);
22722 return true;
22725 /* Try a 2-byte immediate with LSL. */
22726 unsigned int imm16 = val32 & 0xffff;
22727 if (imm16 == (val32 >> 16))
22728 for (unsigned int shift = 0; shift < 16; shift += 8)
22729 if ((imm16 & (0xff << shift)) == imm16)
22731 if (info)
22732 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
22733 simd_immediate_info::LSL, shift);
22734 return true;
22737 /* Try a 4-byte immediate with MSL, except for cases that MVN
22738 can handle. */
22739 if (which == AARCH64_CHECK_MOV)
22740 for (unsigned int shift = 8; shift < 24; shift += 8)
22742 unsigned int low = (1 << shift) - 1;
22743 if (((val32 & (0xff << shift)) | low) == val32)
22745 if (info)
22746 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22747 simd_immediate_info::MSL, shift);
22748 return true;
22752 return false;
22755 /* Return true if replicating VAL64 is a valid immediate for the
22756 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22757 use it to describe valid immediates. */
22758 static bool
22759 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
22760 simd_immediate_info *info,
22761 enum simd_immediate_check which)
22763 unsigned int val32 = val64 & 0xffffffff;
22764 unsigned int val16 = val64 & 0xffff;
22765 unsigned int val8 = val64 & 0xff;
22767 if (val32 == (val64 >> 32))
22769 if ((which & AARCH64_CHECK_ORR) != 0
22770 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
22771 simd_immediate_info::MOV))
22772 return true;
22774 if ((which & AARCH64_CHECK_BIC) != 0
22775 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
22776 simd_immediate_info::MVN))
22777 return true;
22779 /* Try using a replicated byte. */
22780 if (which == AARCH64_CHECK_MOV
22781 && val16 == (val32 >> 16)
22782 && val8 == (val16 >> 8))
22784 if (info)
22785 *info = simd_immediate_info (QImode, val8);
22786 return true;
22790 /* Try using a bit-to-bytemask. */
22791 if (which == AARCH64_CHECK_MOV)
22793 unsigned int i;
22794 for (i = 0; i < 64; i += 8)
22796 unsigned char byte = (val64 >> i) & 0xff;
22797 if (byte != 0 && byte != 0xff)
22798 break;
22800 if (i == 64)
22802 if (info)
22803 *info = simd_immediate_info (DImode, val64);
22804 return true;
22807 return false;
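/* Illustrative examples (added note): a 64-bit value of 0x0000004100000041
   is accepted as an SImode byte with LSL #0, 0x0041ffff0041ffff matches the
   MSL form (for MOV), and 0x00ff0000ffff00ff is a valid DImode per-byte
   0x00/0xff mask; 0x0000000100000002 repeats neither every 32 bits nor per
   byte and is rejected.  */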
22810 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22811 instruction. If INFO is nonnull, use it to describe valid immediates. */
22813 static bool
22814 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
22815 simd_immediate_info *info)
22817 scalar_int_mode mode = DImode;
22818 unsigned int val32 = val64 & 0xffffffff;
22819 if (val32 == (val64 >> 32))
22821 mode = SImode;
22822 unsigned int val16 = val32 & 0xffff;
22823 if (val16 == (val32 >> 16))
22825 mode = HImode;
22826 unsigned int val8 = val16 & 0xff;
22827 if (val8 == (val16 >> 8))
22828 mode = QImode;
22831 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
22832 if (IN_RANGE (val, -0x80, 0x7f))
22834 /* DUP with no shift. */
22835 if (info)
22836 *info = simd_immediate_info (mode, val);
22837 return true;
22839 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
22841 /* DUP with LSL #8. */
22842 if (info)
22843 *info = simd_immediate_info (mode, val);
22844 return true;
22846 if (aarch64_bitmask_imm (val64, mode))
22848 /* DUPM. */
22849 if (info)
22850 *info = simd_immediate_info (mode, val);
22851 return true;
22853 return false;
22856 /* Return true if X is an UNSPEC_PTRUE constant of the form:
22858 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
22860 where PATTERN is the svpattern as a CONST_INT and where ZERO
22861 is a zero constant of the required PTRUE mode (which can have
22862 fewer elements than X's mode, if zero bits are significant).
22864 If so, and if INFO is nonnull, describe the immediate in INFO. */
22865 bool
22866 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
22868 if (GET_CODE (x) != CONST)
22869 return false;
22871 x = XEXP (x, 0);
22872 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
22873 return false;
22875 if (info)
22877 aarch64_svpattern pattern
22878 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
22879 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
22880 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
22881 *info = simd_immediate_info (int_mode, pattern);
22883 return true;
22886 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
22887 it to describe valid immediates. */
22889 static bool
22890 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
22892 if (aarch64_sve_ptrue_svpattern_p (x, info))
22893 return true;
22895 if (x == CONST0_RTX (GET_MODE (x)))
22897 if (info)
22898 *info = simd_immediate_info (DImode, 0);
22899 return true;
22902 /* Analyze the value as a VNx16BImode. This should be relatively
22903 efficient, since rtx_vector_builder has enough built-in capacity
22904 to store all VLA predicate constants without needing the heap. */
22905 rtx_vector_builder builder;
22906 if (!aarch64_get_sve_pred_bits (builder, x))
22907 return false;
22909 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
22910 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
22912 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
22913 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
22914 if (pattern != AARCH64_NUM_SVPATTERNS)
22916 if (info)
22918 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
22919 *info = simd_immediate_info (int_mode, pattern);
22921 return true;
22924 return false;
22927 /* Return true if OP is a valid SIMD immediate for the operation
22928 described by WHICH. If INFO is nonnull, use it to describe valid
22929 immediates. */
22930 bool
22931 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
22932 enum simd_immediate_check which)
22934 machine_mode mode = GET_MODE (op);
22935 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22936 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
22937 return false;
22939 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
22940 return false;
22942 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
22943 return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
22945 if (vec_flags & VEC_SVE_PRED)
22946 return aarch64_sve_pred_valid_immediate (op, info);
22948 scalar_mode elt_mode = GET_MODE_INNER (mode);
22949 rtx base, step;
22950 unsigned int n_elts;
22951 if (CONST_VECTOR_P (op)
22952 && CONST_VECTOR_DUPLICATE_P (op))
22953 n_elts = CONST_VECTOR_NPATTERNS (op);
22954 else if ((vec_flags & VEC_SVE_DATA)
22955 && const_vec_series_p (op, &base, &step))
22957 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
22958 if (!aarch64_sve_index_immediate_p (base)
22959 || !aarch64_sve_index_immediate_p (step))
22960 return false;
22962 if (info)
22964 /* Get the corresponding container mode. E.g. an INDEX on VNx2SI
22965 should yield two integer values per 128-bit block, meaning
22966 that we need to treat it in the same way as VNx2DI and then
22967 ignore the upper 32 bits of each element. */
22968 elt_mode = aarch64_sve_container_int_mode (mode);
22969 *info = simd_immediate_info (elt_mode, base, step);
22971 return true;
22973 else if (CONST_VECTOR_P (op)
22974 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
22975 /* N_ELTS set above. */;
22976 else
22977 return false;
22979 scalar_float_mode elt_float_mode;
22980 if (n_elts == 1
22981 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
22983 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
22984 if (aarch64_float_const_zero_rtx_p (elt)
22985 || aarch64_float_const_representable_p (elt))
22987 if (info)
22988 *info = simd_immediate_info (elt_float_mode, elt);
22989 return true;
22993 /* If all elements in an SVE vector have the same value, we have a free
22994 choice between using the element mode and using the container mode.
22995 Using the element mode means that unused parts of the vector are
22996 duplicates of the used elements, while using the container mode means
22997 that the unused parts are an extension of the used elements. Using the
22998 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
22999 for its container mode VNx4SI while 0x00000101 isn't.
23001 If not all elements in an SVE vector have the same value, we need the
23002 transition from one element to the next to occur at container boundaries.
23003 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
23004 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
23005 scalar_int_mode elt_int_mode;
23006 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
23007 elt_int_mode = aarch64_sve_container_int_mode (mode);
23008 else
23009 elt_int_mode = int_mode_for_mode (elt_mode).require ();
23011 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
23012 if (elt_size > 8)
23013 return false;
23015 /* Expand the vector constant out into a byte vector, with the least
23016 significant byte of the register first. */
23017 auto_vec<unsigned char, 16> bytes;
23018 bytes.reserve (n_elts * elt_size);
23019 for (unsigned int i = 0; i < n_elts; i++)
23021 /* The vector is provided in gcc endian-neutral fashion.
23022 For aarch64_be Advanced SIMD, it must be laid out in the vector
23023 register in reverse order. */
23024 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
23025 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
23027 if (elt_mode != elt_int_mode)
23028 elt = gen_lowpart (elt_int_mode, elt);
23030 if (!CONST_INT_P (elt))
23031 return false;
23033 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
23034 for (unsigned int byte = 0; byte < elt_size; byte++)
23036 bytes.quick_push (elt_val & 0xff);
23037 elt_val >>= BITS_PER_UNIT;
23041 /* The immediate must repeat every eight bytes. */
23042 unsigned int nbytes = bytes.length ();
23043 for (unsigned i = 8; i < nbytes; ++i)
23044 if (bytes[i] != bytes[i - 8])
23045 return false;
23047 /* Get the repeating 8-byte value as an integer. No endian correction
23048 is needed here because bytes is already in lsb-first order. */
23049 unsigned HOST_WIDE_INT val64 = 0;
23050 for (unsigned int i = 0; i < 8; i++)
23051 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
23052 << (i * BITS_PER_UNIT));
23054 if (vec_flags & VEC_SVE_DATA)
23055 return aarch64_sve_valid_immediate (val64, info);
23056 else
23057 return aarch64_advsimd_valid_immediate (val64, info, which);
23060 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
23061 has a step in the range of INDEX. Return the index expression if so,
23062 otherwise return null. */
23063 rtx
23064 aarch64_check_zero_based_sve_index_immediate (rtx x)
23066 rtx base, step;
23067 if (const_vec_series_p (x, &base, &step)
23068 && base == const0_rtx
23069 && aarch64_sve_index_immediate_p (step))
23070 return step;
23071 return NULL_RTX;
23074 /* Check if immediate shift constants are within range. */
23075 bool
23076 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
23078 x = unwrap_const_vec_duplicate (x);
23079 if (!CONST_INT_P (x))
23080 return false;
23081 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
23082 if (left)
23083 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
23084 else
23085 return IN_RANGE (INTVAL (x), 1, bit_width);
23088 /* Return the bitmask CONST_INT to select the bits required by a zero extract
23089 operation of width WIDTH at bit position POS. */
23091 rtx
23092 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
23094 gcc_assert (CONST_INT_P (width));
23095 gcc_assert (CONST_INT_P (pos));
23097 unsigned HOST_WIDE_INT mask
23098 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
23099 return GEN_INT (mask << UINTVAL (pos));
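/* For example (added note): WIDTH == 8 and POS == 16 give
   ((1 << 8) - 1) << 16 == 0x00ff0000.  */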
23102 bool
23103 aarch64_mov_operand_p (rtx x, machine_mode mode)
23105 if (GET_CODE (x) == HIGH
23106 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
23107 return true;
23109 if (CONST_INT_P (x))
23110 return true;
23112 if (VECTOR_MODE_P (GET_MODE (x)))
23114 /* Require predicate constants to be VNx16BI before RA, so that we
23115 force everything to have a canonical form. */
23116 if (!lra_in_progress
23117 && !reload_completed
23118 && aarch64_sve_pred_mode_p (GET_MODE (x))
23119 && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
23120 && GET_MODE (x) != VNx16BImode)
23121 return false;
23123 return aarch64_simd_valid_immediate (x, NULL);
23126 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
23127 x = strip_salt (x);
23129 /* GOT accesses are valid moves. */
23130 if (SYMBOL_REF_P (x)
23131 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
23132 return true;
23134 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
23135 return true;
23137 if (TARGET_SVE
23138 && (aarch64_sve_cnt_immediate_p (x)
23139 || aarch64_sve_rdvl_immediate_p (x)))
23140 return true;
23142 if (aarch64_rdsvl_immediate_p (x))
23143 return true;
23145 return aarch64_classify_symbolic_expression (x)
23146 == SYMBOL_TINY_ABSOLUTE;
23149 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
23150 caches instructions that set up such registers, so that they can be
23151 reused by future calls. */
23153 static rtx
23154 aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
23156 rtx_insn *insn = *cached_insn;
23157 if (insn && INSN_P (insn) && !insn->deleted ())
23159 rtx pat = PATTERN (insn);
23160 if (GET_CODE (pat) == SET)
23162 rtx dest = SET_DEST (pat);
23163 if (REG_P (dest)
23164 && !HARD_REGISTER_P (dest)
23165 && rtx_equal_p (SET_SRC (pat), value))
23166 return dest;
23169 rtx reg = gen_reg_rtx (GET_MODE (value));
23170 *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
23171 function_beg_insn);
23172 return reg;
23175 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
23176 the constant creation. */
23179 aarch64_gen_shareable_zero (machine_mode mode)
23181 rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
23182 CONST0_RTX (V4SImode));
23183 return lowpart_subreg (mode, reg, GET_MODE (reg));
23186 /* INSN is some form of extension or shift that can be split into a
23187 permutation involving a shared zero. Return true if we should
23188 perform such a split.
23190 ??? For now, make sure that the split instruction executes more
23191 frequently than the zero that feeds it. In future it would be good
23192 to split without that restriction and instead recombine shared zeros
23193 if they turn out not to be worthwhile. This would allow splits in
23194 single-block functions and would also cope more naturally with
23195 rematerialization. The downside of not doing this is that we lose the
23196 optimizations for vector epilogues as well. */
23198 bool
23199 aarch64_split_simd_shift_p (rtx_insn *insn)
23201 return (can_create_pseudo_p ()
23202 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
23203 && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
23204 < BLOCK_FOR_INSN (insn)->count));
23207 /* Return a constant vector of mode MODE with every element equal to VAL. */
23209 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
23211 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
23212 return gen_const_vec_duplicate (mode, c);
23215 /* Check OP is a legal scalar immediate for the MOVI instruction. */
23217 bool
23218 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
23220 machine_mode vmode;
23222 vmode = aarch64_simd_container_mode (mode, 64);
23223 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
23224 return aarch64_simd_valid_immediate (op_v, NULL);
23227 /* Construct and return a PARALLEL RTX vector with elements numbering the
23228 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
23229 the vector - from the perspective of the architecture. This does not
23230 line up with GCC's perspective on lane numbers, so we end up with
23231 different masks depending on our target endian-ness. The diagram
23232 below may help. We must draw the distinction when building masks
23233 which select one half of the vector. An instruction selecting
23234 architectural low-lanes for a big-endian target must be described using
23235 a mask selecting GCC high-lanes.
23237 Big-Endian Little-Endian
23239 GCC 0 1 2 3 3 2 1 0
23240 | x | x | x | x | | x | x | x | x |
23241 Architecture 3 2 1 0 3 2 1 0
23243 Low Mask: { 2, 3 } { 0, 1 }
23244 High Mask: { 0, 1 } { 2, 3 }
23246 MODE Is the mode of the vector and NUNITS is the number of units in it. */
23249 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
23251 rtvec v = rtvec_alloc (nunits / 2);
23252 int high_base = nunits / 2;
23253 int low_base = 0;
23254 int base;
23255 rtx t1;
23256 int i;
23258 if (BYTES_BIG_ENDIAN)
23259 base = high ? low_base : high_base;
23260 else
23261 base = high ? high_base : low_base;
23263 for (i = 0; i < nunits / 2; i++)
23264 RTVEC_ELT (v, i) = GEN_INT (base + i);
23266 t1 = gen_rtx_PARALLEL (mode, v);
23267 return t1;
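/* As an illustration (values derived from the diagram above): for V4SImode
   with NUNITS == 4, HIGH == true yields (parallel [2 3]) on a little-endian
   target but (parallel [0 1]) on a big-endian target.  */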
23270 /* Check OP for validity as a PARALLEL RTX vector with elements
23271 numbering the lanes of either the high (HIGH == TRUE) or low half,
23272 from the perspective of the architecture. See the diagram above
23273 aarch64_simd_vect_par_cnst_half for more details. */
23275 bool
23276 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
23277 bool high)
23279 int nelts;
23280 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
23281 return false;
23283 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
23284 HOST_WIDE_INT count_op = XVECLEN (op, 0);
23285 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
23286 int i = 0;
23288 if (count_op != count_ideal)
23289 return false;
23291 for (i = 0; i < count_ideal; i++)
23293 rtx elt_op = XVECEXP (op, 0, i);
23294 rtx elt_ideal = XVECEXP (ideal, 0, i);
23296 if (!CONST_INT_P (elt_op)
23297 || INTVAL (elt_ideal) != INTVAL (elt_op))
23298 return false;
23300 return true;
23303 /* Return a PARALLEL containing NELTS elements, with element I equal
23304 to BASE + I * STEP. */
23307 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
23309 rtvec vec = rtvec_alloc (nelts);
23310 for (unsigned int i = 0; i < nelts; ++i)
23311 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
23312 return gen_rtx_PARALLEL (VOIDmode, vec);
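/* For example, NELTS == 4, BASE == 1 and STEP == 2 produce
   (parallel [1 3 5 7]), which aarch64_stepped_int_parallel_p below would
   recognise with STEP == 2.  */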
23315 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23316 series with step STEP. */
23318 bool
23319 aarch64_stepped_int_parallel_p (rtx op, int step)
23321 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
23322 return false;
23324 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
23325 for (int i = 1; i < XVECLEN (op, 0); ++i)
23326 if (!CONST_INT_P (XVECEXP (op, 0, i))
23327 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
23328 return false;
23330 return true;
23333 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23334 sequence of strided registers, with the stride being equal to STRIDE.
23335 The operands are already known to be FPRs. */
23336 bool
23337 aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
23338 unsigned int stride)
23340 for (unsigned int i = 1; i < num_operands; ++i)
23341 if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
23342 return false;
23343 return true;
23346 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23347 HIGH (exclusive). */
23348 void
23349 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
23350 const_tree exp)
23352 HOST_WIDE_INT lane;
23353 gcc_assert (CONST_INT_P (operand));
23354 lane = INTVAL (operand);
23356 if (lane < low || lane >= high)
23358 if (exp)
23359 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
23360 lane, low, high - 1);
23361 else
23362 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
23366 /* Perform endian correction on lane number N, which indexes a vector
23367 of mode MODE, and return the result as an SImode rtx. */
23370 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
23372 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
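/* For example, lane 1 of a V4SImode vector stays (const_int 1) on a
   little-endian target; on big-endian, assuming ENDIAN_LANE_N mirrors the
   lane numbering, it becomes (const_int 2).  */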
23375 /* Return TRUE if OP is a valid vector addressing mode. */
23377 bool
23378 aarch64_simd_mem_operand_p (rtx op)
23380 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
23381 || REG_P (XEXP (op, 0)));
23384 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23386 bool
23387 aarch64_sve_ld1r_operand_p (rtx op)
23389 struct aarch64_address_info addr;
23390 scalar_mode mode;
23392 return (MEM_P (op)
23393 && is_a <scalar_mode> (GET_MODE (op), &mode)
23394 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
23395 && addr.type == ADDRESS_REG_IMM
23396 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
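/* For example, assuming offset_6bit_unsigned_scaled_p accepts offsets of
   0...63 times the element size, an SImode LD1R operand can use addresses
   from [Xn] up to [Xn, #252] in steps of 4.  */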
23399 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23400 where the size of the read data is specified by `mode` and the size of the
23401 vector elements is specified by `elem_mode`. */
23402 bool
23403 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
23404 scalar_mode elem_mode)
23406 struct aarch64_address_info addr;
23407 if (!MEM_P (op)
23408 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
23409 return false;
23411 if (addr.type == ADDRESS_REG_IMM)
23412 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
23414 if (addr.type == ADDRESS_REG_REG)
23415 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
23417 return false;
23420 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23421 bool
23422 aarch64_sve_ld1rq_operand_p (rtx op)
23424 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
23425 GET_MODE_INNER (GET_MODE (op)));
23428 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23429 accessing a vector where the element size is specified by `elem_mode`. */
23430 bool
23431 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
23433 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
23436 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23437 bool
23438 aarch64_sve_ldff1_operand_p (rtx op)
23440 if (!MEM_P (op))
23441 return false;
23443 struct aarch64_address_info addr;
23444 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
23445 return false;
23447 if (addr.type == ADDRESS_REG_IMM)
23448 return known_eq (addr.const_offset, 0);
23450 return addr.type == ADDRESS_REG_REG;
23453 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23454 bool
23455 aarch64_sve_ldnf1_operand_p (rtx op)
23457 struct aarch64_address_info addr;
23459 return (MEM_P (op)
23460 && aarch64_classify_address (&addr, XEXP (op, 0),
23461 GET_MODE (op), false)
23462 && addr.type == ADDRESS_REG_IMM);
23465 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23466 The conditions for STR are the same. */
23467 bool
23468 aarch64_sve_ldr_operand_p (rtx op)
23470 struct aarch64_address_info addr;
23472 return (MEM_P (op)
23473 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
23474 false, ADDR_QUERY_ANY)
23475 && addr.type == ADDRESS_REG_IMM);
23478 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23479 addressing memory of mode MODE. */
23480 bool
23481 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
23483 struct aarch64_address_info addr;
23484 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
23485 return false;
23487 if (addr.type == ADDRESS_REG_IMM)
23488 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
23490 return addr.type == ADDRESS_REG_REG;
23493 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23494 We need to be able to access the individual pieces, so the range
23495 is different from LD[234] and ST[234]. */
23496 bool
23497 aarch64_sve_struct_memory_operand_p (rtx op)
23499 if (!MEM_P (op))
23500 return false;
23502 machine_mode mode = GET_MODE (op);
23503 struct aarch64_address_info addr;
23504 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
23505 ADDR_QUERY_ANY)
23506 || addr.type != ADDRESS_REG_IMM)
23507 return false;
23509 poly_int64 first = addr.const_offset;
23510 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
23511 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
23512 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
23515 /* Return true if OFFSET is a constant integer and if VNUM is
23516 OFFSET * the number of bytes in an SVE vector. This is the requirement
23517 that exists in SME LDR and STR instructions, where the VL offset must
23518 equal the ZA slice offset. */
23519 bool
23520 aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
23522 if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
23523 return false;
23525 if (TARGET_STREAMING)
23527 poly_int64 const_vnum;
23528 return (poly_int_rtx_p (vnum, &const_vnum)
23529 && known_eq (const_vnum,
23530 INTVAL (offset) * BYTES_PER_SVE_VECTOR));
23532 else
23534 HOST_WIDE_INT factor;
23535 return (aarch64_sme_vq_unspec_p (vnum, &factor)
23536 && factor == INTVAL (offset) * 16);
23540 /* Emit a register copy from operand to operand, taking care not to
23541 early-clobber source registers in the process.
23543 COUNT is the number of components into which the copy needs to be
23544 decomposed. */
23545 void
23546 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
23547 unsigned int count)
23549 unsigned int i;
23550 int rdest = REGNO (operands[0]);
23551 int rsrc = REGNO (operands[1]);
23553 if (!reg_overlap_mentioned_p (operands[0], operands[1])
23554 || rdest < rsrc)
23555 for (i = 0; i < count; i++)
23556 emit_move_insn (gen_rtx_REG (mode, rdest + i),
23557 gen_rtx_REG (mode, rsrc + i));
23558 else
23559 for (i = 0; i < count; i++)
23560 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
23561 gen_rtx_REG (mode, rsrc + count - i - 1));
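/* For example, moving a two-register group from V0-V1 to the overlapping
   range V1-V2 takes the descending branch above: V2 is written from V1
   before V1 is overwritten from V0, so no source register is clobbered
   before it has been read.  */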
23564 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23565 one of the VSTRUCT modes: OI, CI, or XI.
23567 aarch64_simd_attr_length_rglist (machine_mode mode)
23569 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23570 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
23573 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23574 alignment of a vector to 128 bits. SVE predicates have an alignment of
23575 16 bits. */
23576 static HOST_WIDE_INT
23577 aarch64_simd_vector_alignment (const_tree type)
23579 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23580 be set for non-predicate vectors of booleans. Modes are the most
23581 direct way we have of identifying real SVE predicate types. */
23582 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
23583 return 16;
23584 widest_int min_size
23585 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
23586 return wi::umin (min_size, 128).to_uhwi ();
23589 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23590 static poly_uint64
23591 aarch64_vectorize_preferred_vector_alignment (const_tree type)
23593 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
23595 /* If the length of the vector is a fixed power of 2, try to align
23596 to that length, otherwise don't try to align at all. */
23597 HOST_WIDE_INT result;
23598 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
23599 || !pow2p_hwi (result))
23600 result = TYPE_ALIGN (TREE_TYPE (type));
23601 return result;
23603 return TYPE_ALIGN (type);
23606 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23607 static bool
23608 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
23610 if (is_packed)
23611 return false;
23613 /* For fixed-length vectors, check that the vectorizer will aim for
23614 full-vector alignment. This isn't true for generic GCC vectors
23615 that are wider than the ABI maximum of 128 bits. */
23616 poly_uint64 preferred_alignment =
23617 aarch64_vectorize_preferred_vector_alignment (type);
23618 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23619 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
23620 preferred_alignment))
23621 return false;
23623 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23624 return true;
23627 /* Return true if the vector misalignment factor is supported by the
23628 target. */
23629 static bool
23630 aarch64_builtin_support_vector_misalignment (machine_mode mode,
23631 const_tree type, int misalignment,
23632 bool is_packed)
23634 if (TARGET_SIMD && STRICT_ALIGNMENT)
23636 /* Return false if the movmisalign pattern is not supported for this mode. */
23637 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
23638 return false;
23640 /* Misalignment factor is unknown at compile time. */
23641 if (misalignment == -1)
23642 return false;
23644 return default_builtin_support_vector_misalignment (mode, type, misalignment,
23645 is_packed);
23648 /* If VALS is a vector constant that can be loaded into a register
23649 using DUP, generate instructions to do so and return an RTX to
23650 assign to the register. Otherwise return NULL_RTX. */
23651 static rtx
23652 aarch64_simd_dup_constant (rtx vals)
23654 machine_mode mode = GET_MODE (vals);
23655 machine_mode inner_mode = GET_MODE_INNER (mode);
23656 rtx x;
23658 if (!const_vec_duplicate_p (vals, &x))
23659 return NULL_RTX;
23661 /* We can load this constant by using DUP and a constant in a
23662 single ARM register. This will be cheaper than a vector
23663 load. */
23664 x = force_reg (inner_mode, x);
23665 return gen_vec_duplicate (mode, x);
23669 /* Generate code to load VALS, which is a PARALLEL containing only
23670 constants (for vec_init) or CONST_VECTOR, efficiently into a
23671 register. Returns an RTX to copy into the register, or NULL_RTX
23672 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23673 static rtx
23674 aarch64_simd_make_constant (rtx vals)
23676 machine_mode mode = GET_MODE (vals);
23677 rtx const_dup;
23678 rtx const_vec = NULL_RTX;
23679 int n_const = 0;
23680 int i;
23682 if (CONST_VECTOR_P (vals))
23683 const_vec = vals;
23684 else if (GET_CODE (vals) == PARALLEL)
23686 /* A CONST_VECTOR must contain only CONST_INTs and
23687 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
23688 Only store valid constants in a CONST_VECTOR. */
23689 int n_elts = XVECLEN (vals, 0);
23690 for (i = 0; i < n_elts; ++i)
23692 rtx x = XVECEXP (vals, 0, i);
23693 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23694 n_const++;
23696 if (n_const == n_elts)
23697 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
23699 else
23700 gcc_unreachable ();
23702 if (const_vec != NULL_RTX
23703 && aarch64_simd_valid_immediate (const_vec, NULL))
23704 /* Load using MOVI/MVNI. */
23705 return const_vec;
23706 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
23707 /* Loaded using DUP. */
23708 return const_dup;
23709 else if (const_vec != NULL_RTX)
23710 /* Load from constant pool. We cannot take advantage of single-cycle
23711 LD1 because we need a PC-relative addressing mode. */
23712 return const_vec;
23713 else
23714 /* A PARALLEL containing something not valid inside CONST_VECTOR.
23715 We cannot construct an initializer. */
23716 return NULL_RTX;
23719 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23720 The caller has already tried a divide-and-conquer approach, so do
23721 not consider that case here. */
23723 void
23724 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
23726 machine_mode mode = GET_MODE (target);
23727 scalar_mode inner_mode = GET_MODE_INNER (mode);
23728 /* The number of vector elements. */
23729 int n_elts = XVECLEN (vals, 0);
23730 /* The number of vector elements which are not constant. */
23731 int n_var = 0;
23732 rtx any_const = NULL_RTX;
23733 /* The first element of vals. */
23734 rtx v0 = XVECEXP (vals, 0, 0);
23735 bool all_same = true;
23737 /* This is a special vec_init<M><N> where N is not an element mode but a
23738 vector mode with half the elements of M. We expect to find two entries
23739 of mode N in VALS and we must put their concatenation into TARGET. */
23740 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
23742 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
23743 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
23744 && known_eq (GET_MODE_SIZE (mode),
23745 2 * GET_MODE_SIZE (narrow_mode)));
23746 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
23747 XVECEXP (vals, 0, 0),
23748 XVECEXP (vals, 0, 1)));
23749 return;
23752 /* Count the number of variable elements to initialise. */
23753 for (int i = 0; i < n_elts; ++i)
23755 rtx x = XVECEXP (vals, 0, i);
23756 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
23757 ++n_var;
23758 else
23759 any_const = x;
23761 all_same &= rtx_equal_p (x, v0);
23764 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23765 how best to handle this. */
23766 if (n_var == 0)
23768 rtx constant = aarch64_simd_make_constant (vals);
23769 if (constant != NULL_RTX)
23771 emit_move_insn (target, constant);
23772 return;
23776 /* Splat a single non-constant element if we can. */
23777 if (all_same)
23779 rtx x = force_reg (inner_mode, v0);
23780 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23781 return;
23784 enum insn_code icode = optab_handler (vec_set_optab, mode);
23785 gcc_assert (icode != CODE_FOR_nothing);
23787 /* If there are only variable elements, try to optimize
23788 the insertion using dup for the most common element
23789 followed by insertions. */
23791 /* The algorithm will fill matches[*][0] with the earliest matching element,
23792 and matches[X][1] with the count of duplicate elements (if X is the
23793 earliest element which has duplicates). */
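/* As an illustration (example values, not from the surrounding code): for
   VALS == {x, y, x, x} the loop below leaves matches[0][1] == 3 and
   matches[1][1] == 1, so x (element 0) is chosen as the element to
   duplicate.  */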
23795 if (n_var >= n_elts - 1 && n_elts <= 16)
23797 int matches[16][2] = {0};
23798 for (int i = 0; i < n_elts; i++)
23800 for (int j = 0; j <= i; j++)
23802 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
23804 matches[i][0] = j;
23805 matches[j][1]++;
23806 break;
23810 int maxelement = 0;
23811 int maxv = 0;
23812 rtx const_elem = NULL_RTX;
23813 int const_elem_pos = 0;
23815 for (int i = 0; i < n_elts; i++)
23817 if (matches[i][1] > maxv)
23819 maxelement = i;
23820 maxv = matches[i][1];
23822 if (CONST_INT_P (XVECEXP (vals, 0, i))
23823 || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
23825 const_elem_pos = i;
23826 const_elem = XVECEXP (vals, 0, i);
23830 /* Create a duplicate of the most common element, unless all elements
23831 are equally useless to us, in which case just immediately set the
23832 vector register using the first element. */
23834 if (maxv == 1)
23836 /* For vectors of two 64-bit elements, we can do even better. */
23837 if (n_elts == 2
23838 && (inner_mode == E_DImode
23839 || inner_mode == E_DFmode))
23842 rtx x0 = XVECEXP (vals, 0, 0);
23843 rtx x1 = XVECEXP (vals, 0, 1);
23844 /* Combine can pick up this case, but handling it directly
23845 here leaves clearer RTL.
23847 This is load_pair_lanes<mode>, and also gives us a clean-up
23848 for store_pair_lanes<mode>. */
23849 if (memory_operand (x0, inner_mode)
23850 && memory_operand (x1, inner_mode)
23851 && aarch64_mergeable_load_pair_p (mode, x0, x1))
23853 rtx t;
23854 if (inner_mode == DFmode)
23855 t = gen_load_pair_lanesdf (target, x0, x1);
23856 else
23857 t = gen_load_pair_lanesdi (target, x0, x1);
23858 emit_insn (t);
23859 return;
23862 /* The subreg-move sequence below will move into lane zero of the
23863 vector register. For big-endian we want that position to hold
23864 the last element of VALS. */
23865 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
23867 /* If we have a single constant element, use that for duplicating
23868 instead. */
23869 if (const_elem)
23871 maxelement = const_elem_pos;
23872 aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
23874 else
23876 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23877 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
23880 else
23882 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23883 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23886 /* Insert the rest. */
23887 for (int i = 0; i < n_elts; i++)
23889 rtx x = XVECEXP (vals, 0, i);
23890 if (matches[i][0] == maxelement)
23891 continue;
23892 x = force_reg (inner_mode, x);
23893 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23895 return;
23898 /* Initialise a vector which is part-variable. We want to first try
23899 to build those lanes which are constant in the most efficient way we
23900 can. */
23901 if (n_var != n_elts)
23903 rtx copy = copy_rtx (vals);
23905 /* Load constant part of vector. We really don't care what goes into the
23906 parts we will overwrite, but we're more likely to be able to load the
23907 constant efficiently if it has fewer, larger, repeating parts
23908 (see aarch64_simd_valid_immediate). */
23909 for (int i = 0; i < n_elts; i++)
23911 rtx x = XVECEXP (vals, 0, i);
23912 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23913 continue;
23914 rtx subst = any_const;
23915 for (int bit = n_elts / 2; bit > 0; bit /= 2)
23917 /* Look in the copied vector, as more elements are const. */
23918 rtx test = XVECEXP (copy, 0, i ^ bit);
23919 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
23921 subst = test;
23922 break;
23925 XVECEXP (copy, 0, i) = subst;
23927 aarch64_expand_vector_init_fallback (target, copy);
23930 /* Insert the variable lanes directly. */
23931 for (int i = 0; i < n_elts; i++)
23933 rtx x = XVECEXP (vals, 0, i);
23934 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23935 continue;
23936 x = force_reg (inner_mode, x);
23937 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23941 /* Return even or odd half of VALS depending on EVEN_P. */
23943 static rtx
23944 aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
23946 int n = XVECLEN (vals, 0);
23947 machine_mode new_mode
23948 = aarch64_simd_container_mode (GET_MODE_INNER (mode),
23949 GET_MODE_BITSIZE (mode).to_constant () / 2);
23950 rtvec vec = rtvec_alloc (n / 2);
23951 for (int i = 0; i < n / 2; i++)
23952 RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
23953 : XVECEXP (vals, 0, 2 * i + 1);
23954 return gen_rtx_PARALLEL (new_mode, vec);
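/* For example, a V8HImode PARALLEL {a, b, c, d, e, f, g, h} returns the
   V4HImode PARALLEL {a, c, e, g} when EVEN_P and {b, d, f, h} otherwise.  */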
23957 /* Return true if SET is a scalar move. */
23959 static bool
23960 scalar_move_insn_p (rtx set)
23962 rtx src = SET_SRC (set);
23963 rtx dest = SET_DEST (set);
23964 return (is_a<scalar_mode> (GET_MODE (dest))
23965 && aarch64_mov_operand (src, GET_MODE (dest)));
23968 /* Similar to seq_cost, but ignore cost for scalar moves. */
23970 static unsigned
23971 seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed)
23973 unsigned cost = 0;
23975 for (; seq; seq = NEXT_INSN (seq))
23976 if (NONDEBUG_INSN_P (seq))
23978 if (rtx set = single_set (seq))
23980 if (!scalar_move_insn_p (set))
23981 cost += set_rtx_cost (set, speed);
23983 else
23985 int this_cost = insn_cost (CONST_CAST_RTX_INSN (seq), speed);
23986 if (this_cost > 0)
23987 cost += this_cost;
23988 else
23989 cost++;
23993 return cost;
23996 /* Expand a vector initialization sequence, such that TARGET is
23997 initialized to contain VALS. */
23999 void
24000 aarch64_expand_vector_init (rtx target, rtx vals)
24002 /* Try decomposing the initializer into even and odd halves and
24003 then ZIP them together. Use the resulting sequence if it is
24004 strictly cheaper than loading VALS directly.
24006 Prefer the fallback sequence in the event of a tie, since it
24007 will tend to use fewer registers. */
24009 machine_mode mode = GET_MODE (target);
24010 int n_elts = XVECLEN (vals, 0);
24012 if (n_elts < 4
24013 || maybe_ne (GET_MODE_BITSIZE (mode), 128))
24015 aarch64_expand_vector_init_fallback (target, vals);
24016 return;
24019 start_sequence ();
24020 rtx halves[2];
24021 unsigned costs[2];
24022 for (int i = 0; i < 2; i++)
24024 start_sequence ();
24025 rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
24026 rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
24027 aarch64_expand_vector_init (tmp_reg, new_vals);
24028 halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
24029 rtx_insn *rec_seq = get_insns ();
24030 end_sequence ();
24031 costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size);
24032 emit_insn (rec_seq);
24035 rtvec v = gen_rtvec (2, halves[0], halves[1]);
24036 rtx_insn *zip1_insn
24037 = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24038 unsigned seq_total_cost
24039 = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
24040 seq_total_cost += insn_cost (zip1_insn, !optimize_size);
24042 rtx_insn *seq = get_insns ();
24043 end_sequence ();
24045 start_sequence ();
24046 aarch64_expand_vector_init_fallback (target, vals);
24047 rtx_insn *fallback_seq = get_insns ();
24048 unsigned fallback_seq_cost
24049 = seq_cost_ignoring_scalar_moves (fallback_seq, !optimize_size);
24050 end_sequence ();
24052 emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
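/* As a rough illustration for a V4SImode initializer {a, b, c, d}: the even
   elements {a, c} and the odd elements {b, d} are built in 64-bit registers
   by the recursive calls above and then interleaved with ZIP1 to give
   {a, b, c, d}; that sequence is kept only if it costs strictly less than
   the fallback expansion.  */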
24055 /* Emit RTL corresponding to:
24056 insr TARGET, ELEM. */
24058 static void
24059 emit_insr (rtx target, rtx elem)
24061 machine_mode mode = GET_MODE (target);
24062 scalar_mode elem_mode = GET_MODE_INNER (mode);
24063 elem = force_reg (elem_mode, elem);
24065 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
24066 gcc_assert (icode != CODE_FOR_nothing);
24067 emit_insn (GEN_FCN (icode) (target, target, elem));
24070 /* Subroutine of aarch64_sve_expand_vector_init for handling
24071 trailing constants.
24072 This function works as follows:
24073 (a) Create a new vector consisting of trailing constants.
24074 (b) Initialize TARGET with the constant vector using emit_move_insn.
24075 (c) Insert remaining elements in TARGET using insr.
24076 NELTS is the total number of elements in the original vector, while
24077 NELTS_REQD is the number of elements that are actually
24078 significant.
24080 ??? The heuristic used is to do the above only if the number of constants
24081 is at least half the total number of elements. May need fine-tuning.
24083 static bool
24084 aarch64_sve_expand_vector_init_handle_trailing_constants
24085 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
24087 machine_mode mode = GET_MODE (target);
24088 scalar_mode elem_mode = GET_MODE_INNER (mode);
24089 int n_trailing_constants = 0;
24091 for (int i = nelts_reqd - 1;
24092 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
24093 i--)
24094 n_trailing_constants++;
24096 if (n_trailing_constants >= nelts_reqd / 2)
24098 /* Try to use the natural pattern of BUILDER to extend the trailing
24099 constant elements to a full vector. Replace any variables in the
24100 extra elements with zeros.
24102 ??? It would be better if the builders supported "don't care"
24103 elements, with the builder filling in whichever elements
24104 give the most compact encoding. */
24105 rtx_vector_builder v (mode, nelts, 1);
24106 for (int i = 0; i < nelts; i++)
24108 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
24109 if (!valid_for_const_vector_p (elem_mode, x))
24110 x = CONST0_RTX (elem_mode);
24111 v.quick_push (x);
24113 rtx const_vec = v.build ();
24114 emit_move_insn (target, const_vec);
24116 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
24117 emit_insr (target, builder.elt (i));
24119 return true;
24122 return false;
24125 /* Subroutine of aarch64_sve_expand_vector_init.
24126 Works as follows:
24127 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24128 (b) Skip trailing elements from BUILDER, which are the same as
24129 element NELTS_REQD - 1.
24130 (c) Insert earlier elements in reverse order in TARGET using insr. */
24132 static void
24133 aarch64_sve_expand_vector_init_insert_elems (rtx target,
24134 const rtx_vector_builder &builder,
24135 int nelts_reqd)
24137 machine_mode mode = GET_MODE (target);
24138 scalar_mode elem_mode = GET_MODE_INNER (mode);
24140 struct expand_operand ops[2];
24141 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
24142 gcc_assert (icode != CODE_FOR_nothing);
24144 create_output_operand (&ops[0], target, mode);
24145 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
24146 expand_insn (icode, 2, ops);
24148 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24149 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
24150 emit_insr (target, builder.elt (i));
24153 /* Subroutine of aarch64_sve_expand_vector_init to handle case
24154 when all trailing elements of builder are same.
24155 This works as follows:
24156 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24157 (b) Insert remaining elements in TARGET using insr.
24159 ??? The heuristic used is to do the above if the number of identical
24160 trailing elements is at least 3/4 of the total number of elements, loosely based on
24161 heuristic from mostly_zeros_p. May need fine-tuning. */
24163 static bool
24164 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24165 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
24167 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24168 if (ndups >= (3 * nelts_reqd) / 4)
24170 aarch64_sve_expand_vector_init_insert_elems (target, builder,
24171 nelts_reqd - ndups + 1);
24172 return true;
24175 return false;
24178 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24179 of elements in BUILDER.
24181 The function tries to initialize TARGET from BUILDER if it fits one
24182 of the special cases outlined below.
24184 Failing that, the function divides BUILDER into two sub-vectors:
24185 v_even = even elements of BUILDER;
24186 v_odd = odd elements of BUILDER;
24188 and recursively calls itself with v_even and v_odd.
24190 if (recursive call succeeded for v_even or v_odd)
24191 TARGET = zip (v_even, v_odd)
24193 The function returns true if it managed to build TARGET from BUILDER
24194 with one of the special cases, false otherwise.
24196 Example: {a, 1, b, 2, c, 3, d, 4}
24198 The vector gets divided into:
24199 v_even = {a, b, c, d}
24200 v_odd = {1, 2, 3, 4}
24202 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24203 initializes tmp2 from the constant vector v_odd using emit_move_insn.
24205 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24206 4 elements, so we construct tmp1 from v_even using insr:
24207 tmp1 = dup(d)
24208 insr tmp1, c
24209 insr tmp1, b
24210 insr tmp1, a
24212 And finally:
24213 TARGET = zip (tmp1, tmp2)
24214 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24216 static bool
24217 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
24218 int nelts, int nelts_reqd)
24220 machine_mode mode = GET_MODE (target);
24222 /* Case 1: Vector contains trailing constants. */
24224 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24225 (target, builder, nelts, nelts_reqd))
24226 return true;
24228 /* Case 2: Vector contains leading constants. */
24230 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
24231 for (int i = 0; i < nelts_reqd; i++)
24232 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
24233 rev_builder.finalize ();
24235 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24236 (target, rev_builder, nelts, nelts_reqd))
24238 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24239 return true;
24242 /* Case 3: Vector contains trailing same element. */
24244 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24245 (target, builder, nelts_reqd))
24246 return true;
24248 /* Case 4: Vector contains leading same element. */
24250 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24251 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
24253 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24254 return true;
24257 /* Avoid recursing below 4 elements.
24258 ??? The threshold 4 may need fine-tuning. */
24260 if (nelts_reqd <= 4)
24261 return false;
24263 rtx_vector_builder v_even (mode, nelts, 1);
24264 rtx_vector_builder v_odd (mode, nelts, 1);
24266 for (int i = 0; i < nelts * 2; i += 2)
24268 v_even.quick_push (builder.elt (i));
24269 v_odd.quick_push (builder.elt (i + 1));
24272 v_even.finalize ();
24273 v_odd.finalize ();
24275 rtx tmp1 = gen_reg_rtx (mode);
24276 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
24277 nelts, nelts_reqd / 2);
24279 rtx tmp2 = gen_reg_rtx (mode);
24280 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
24281 nelts, nelts_reqd / 2);
24283 if (!did_even_p && !did_odd_p)
24284 return false;
24286 /* For whichever of v_even and v_odd didn't match any of the special
24287 cases, initialize it using INSR, then zip v_even and v_odd. */
24289 if (!did_even_p)
24290 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
24292 if (!did_odd_p)
24293 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
24295 rtvec v = gen_rtvec (2, tmp1, tmp2);
24296 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24297 return true;
24300 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24302 void
24303 aarch64_sve_expand_vector_init (rtx target, rtx vals)
24305 machine_mode mode = GET_MODE (target);
24306 int nelts = XVECLEN (vals, 0);
24308 rtx_vector_builder v (mode, nelts, 1);
24309 for (int i = 0; i < nelts; i++)
24310 v.quick_push (XVECEXP (vals, 0, i));
24311 v.finalize ();
24313 /* If neither sub-vector of v could be initialized specially,
24314 then use INSR to insert all elements from v into TARGET.
24315 ??? This might not be optimal for vectors with large
24316 initializers like 16-element or above.
24317 For nelts < 4, it probably isn't useful to handle specially. */
24319 if (nelts < 4
24320 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
24321 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
24324 /* Check whether VALUE is a vector constant in which every element
24325 is either a power of 2 or a negated power of 2. If so, return
24326 a constant vector of log2s, and flip CODE between PLUS and MINUS
24327 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24329 static rtx
24330 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
24332 if (!CONST_VECTOR_P (value))
24333 return NULL_RTX;
24335 rtx_vector_builder builder;
24336 if (!builder.new_unary_operation (GET_MODE (value), value, false))
24337 return NULL_RTX;
24339 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
24340 /* 1 if the result of the multiplication must be negated,
24341 0 if it mustn't, or -1 if we don't yet care. */
24342 int negate = -1;
24343 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
24344 for (unsigned int i = 0; i < encoded_nelts; ++i)
24346 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
24347 if (!CONST_SCALAR_INT_P (elt))
24348 return NULL_RTX;
24349 rtx_mode_t val (elt, int_mode);
24350 wide_int pow2 = wi::neg (val);
24351 if (val != pow2)
24353 /* It matters whether we negate or not. Make that choice,
24354 and make sure that it's consistent with previous elements. */
24355 if (negate == !wi::neg_p (val))
24356 return NULL_RTX;
24357 negate = wi::neg_p (val);
24358 if (!negate)
24359 pow2 = val;
24361 /* POW2 is now the value that we want to be a power of 2. */
24362 int shift = wi::exact_log2 (pow2);
24363 if (shift < 0)
24364 return NULL_RTX;
24365 builder.quick_push (gen_int_mode (shift, int_mode));
24367 if (negate == -1)
24368 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24369 code = PLUS;
24370 else if (negate == 1)
24371 code = code == PLUS ? MINUS : PLUS;
24372 return builder.build ();
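/* For example, the constant vector {4, 4, 4, 4} is converted to the shift
   amounts {2, 2, 2, 2} with CODE unchanged, while {-8, -8, -8, -8} gives
   {3, 3, 3, 3} and flips CODE between PLUS and MINUS.  */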
24375 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24376 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24377 operands array, in the same order as for fma_optab. Return true if
24378 the function emitted all the necessary instructions, false if the caller
24379 should generate the pattern normally with the new OPERANDS array. */
24381 bool
24382 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
24384 machine_mode mode = GET_MODE (operands[0]);
24385 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
24387 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
24388 NULL_RTX, true, OPTAB_DIRECT);
24389 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
24390 operands[3], product, operands[0], true,
24391 OPTAB_DIRECT);
24392 return true;
24394 operands[2] = force_reg (mode, operands[2]);
24395 return false;
24398 /* Likewise, but for a conditional pattern. */
24400 bool
24401 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
24403 machine_mode mode = GET_MODE (operands[0]);
24404 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
24406 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
24407 NULL_RTX, true, OPTAB_DIRECT);
24408 emit_insn (gen_cond (code, mode, operands[0], operands[1],
24409 operands[4], product, operands[5]));
24410 return true;
24412 operands[3] = force_reg (mode, operands[3]);
24413 return false;
24416 static unsigned HOST_WIDE_INT
24417 aarch64_shift_truncation_mask (machine_mode mode)
24419 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
24420 return 0;
24421 return GET_MODE_UNIT_BITSIZE (mode) - 1;
24424 /* Select a format to encode pointers in exception handling data. */
24426 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
24428 int type;
24429 switch (aarch64_cmodel)
24431 case AARCH64_CMODEL_TINY:
24432 case AARCH64_CMODEL_TINY_PIC:
24433 case AARCH64_CMODEL_SMALL:
24434 case AARCH64_CMODEL_SMALL_PIC:
24435 case AARCH64_CMODEL_SMALL_SPIC:
24436 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24437 for everything. */
24438 type = DW_EH_PE_sdata4;
24439 break;
24440 default:
24441 /* No assumptions here. 8-byte relocs required. */
24442 type = DW_EH_PE_sdata8;
24443 break;
24445 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
24448 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24450 static void
24451 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
24453 if (TREE_CODE (decl) == FUNCTION_DECL)
24455 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
24456 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
24458 fprintf (stream, "\t.variant_pcs\t");
24459 assemble_name (stream, name);
24460 fprintf (stream, "\n");
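/* For example, a function declared with the aarch64_vector_pcs attribute
   resolves to ARM_PCS_SIMD above, so the assembler sees a line of the form
   ".variant_pcs  <assembler name>".  */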
24465 /* The last .arch and .tune assembly strings that we printed. */
24466 static std::string aarch64_last_printed_arch_string;
24467 static std::string aarch64_last_printed_tune_string;
24469 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24470 by the function fndecl. */
24472 void
24473 aarch64_declare_function_name (FILE *stream, const char* name,
24474 tree fndecl)
24476 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
24478 struct cl_target_option *targ_options;
24479 if (target_parts)
24480 targ_options = TREE_TARGET_OPTION (target_parts);
24481 else
24482 targ_options = TREE_TARGET_OPTION (target_option_current_node);
24483 gcc_assert (targ_options);
24485 const struct processor *this_arch
24486 = aarch64_get_arch (targ_options->x_selected_arch);
24488 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
24489 std::string extension
24490 = aarch64_get_extension_string_for_isa_flags (isa_flags,
24491 this_arch->flags);
24492 /* Only update the assembler .arch string if it is distinct from the last
24493 such string we printed. */
24494 std::string to_print = this_arch->name + extension;
24495 if (to_print != aarch64_last_printed_arch_string)
24497 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
24498 aarch64_last_printed_arch_string = to_print;
24501 /* Print the cpu name we're tuning for in the comments; this might be
24502 useful to readers of the generated asm. Do it only when it changes
24503 from function to function and verbose assembly is requested. */
24504 const struct processor *this_tune
24505 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
24507 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
24509 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
24510 this_tune->name);
24511 aarch64_last_printed_tune_string = this_tune->name;
24514 aarch64_asm_output_variant_pcs (stream, fndecl, name);
24516 /* Don't forget the type directive for ELF. */
24517 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
24518 ASM_OUTPUT_FUNCTION_LABEL (stream, name, fndecl);
24520 cfun->machine->label_is_assembled = true;
24523 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24525 void
24526 aarch64_print_patchable_function_entry (FILE *file,
24527 unsigned HOST_WIDE_INT patch_area_size,
24528 bool record_p)
24530 if (!cfun->machine->label_is_assembled)
24532 /* Emit the patching area before the entry label, if any. */
24533 default_print_patchable_function_entry (file, patch_area_size,
24534 record_p);
24535 return;
24538 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
24539 GEN_INT (record_p));
24540 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
24542 if (!aarch_bti_enabled ()
24543 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
24545 /* Emit the patchable_area at the beginning of the function. */
24546 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
24547 INSN_ADDRESSES_NEW (insn, -1);
24548 return;
24551 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
24552 if (!insn
24553 || !INSN_P (insn)
24554 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
24555 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
24557 /* Emit a BTI_C. */
24558 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
24561 /* Emit the patchable_area after BTI_C. */
24562 insn = emit_insn_after (pa, insn);
24563 INSN_ADDRESSES_NEW (insn, -1);
24566 /* Output patchable area. */
24568 void
24569 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
24571 default_print_patchable_function_entry (asm_out_file, patch_area_size,
24572 record_p);
24575 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24577 void
24578 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
24580 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
24581 const char *value = IDENTIFIER_POINTER (target);
24582 aarch64_asm_output_variant_pcs (stream, decl, name);
24583 ASM_OUTPUT_DEF (stream, name, value);
24586 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24587 function symbol references. */
24589 void
24590 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
24592 default_elf_asm_output_external (stream, decl, name);
24593 aarch64_asm_output_variant_pcs (stream, decl, name);
24596 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24597 Used to output the .cfi_b_key_frame directive when signing the current
24598 function with the B key. */
24600 void
24601 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
24603 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
24604 && aarch64_ra_sign_key == AARCH64_KEY_B)
24605 asm_fprintf (f, "\t.cfi_b_key_frame\n");
24608 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24610 static void
24611 aarch64_start_file (void)
24613 struct cl_target_option *default_options
24614 = TREE_TARGET_OPTION (target_option_default_node);
24616 const struct processor *default_arch
24617 = aarch64_get_arch (default_options->x_selected_arch);
24618 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
24619 std::string extension
24620 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
24621 default_arch->flags);
24623 aarch64_last_printed_arch_string = default_arch->name + extension;
24624 aarch64_last_printed_tune_string = "";
24625 asm_fprintf (asm_out_file, "\t.arch %s\n",
24626 aarch64_last_printed_arch_string.c_str ());
24628 default_file_start ();
24631 /* Emit load exclusive. */
24633 static void
24634 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
24635 rtx mem, rtx model_rtx)
24637 if (mode == TImode)
24638 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
24639 gen_highpart (DImode, rval),
24640 mem, model_rtx));
24641 else
24642 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
24645 /* Emit store exclusive. */
24647 static void
24648 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
24649 rtx mem, rtx rval, rtx model_rtx)
24651 if (mode == TImode)
24652 emit_insn (gen_aarch64_store_exclusive_pair
24653 (bval, mem, operand_subword (rval, 0, 0, TImode),
24654 operand_subword (rval, 1, 0, TImode), model_rtx));
24655 else
24656 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
24659 /* Emit jump pattern INSN and mark the resulting jump as unlikely to be taken. */
24661 static void
24662 aarch64_emit_unlikely_jump (rtx insn)
24664 rtx_insn *jump = emit_jump_insn (insn);
24665 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
24668 /* We store the names of the various atomic helpers in a 5x5 array.
24669 Return the libcall function given MODE, MODEL and NAMES. */
24672 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
24673 const atomic_ool_names *names)
24675 memmodel model = memmodel_from_int (INTVAL (model_rtx));
24676 int mode_idx, model_idx;
24678 switch (mode)
24680 case E_QImode:
24681 mode_idx = 0;
24682 break;
24683 case E_HImode:
24684 mode_idx = 1;
24685 break;
24686 case E_SImode:
24687 mode_idx = 2;
24688 break;
24689 case E_DImode:
24690 mode_idx = 3;
24691 break;
24692 case E_TImode:
24693 mode_idx = 4;
24694 break;
24695 default:
24696 gcc_unreachable ();
24699 switch (model)
24701 case MEMMODEL_RELAXED:
24702 model_idx = 0;
24703 break;
24704 case MEMMODEL_CONSUME:
24705 case MEMMODEL_ACQUIRE:
24706 model_idx = 1;
24707 break;
24708 case MEMMODEL_RELEASE:
24709 model_idx = 2;
24710 break;
24711 case MEMMODEL_ACQ_REL:
24712 case MEMMODEL_SEQ_CST:
24713 model_idx = 3;
24714 break;
24715 case MEMMODEL_SYNC_ACQUIRE:
24716 case MEMMODEL_SYNC_RELEASE:
24717 case MEMMODEL_SYNC_SEQ_CST:
24718 model_idx = 4;
24719 break;
24720 default:
24721 gcc_unreachable ();
24724 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
24725 VISIBILITY_HIDDEN);
24728 #define DEF0(B, N) \
24729 { "__aarch64_" #B #N "_relax", \
24730 "__aarch64_" #B #N "_acq", \
24731 "__aarch64_" #B #N "_rel", \
24732 "__aarch64_" #B #N "_acq_rel", \
24733 "__aarch64_" #B #N "_sync" }
24735 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24736 { NULL, NULL, NULL, NULL }
24737 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24739 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
24740 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
24741 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
24742 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
24743 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
24744 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
24746 #undef DEF0
24747 #undef DEF4
24748 #undef DEF5
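/* For example, aarch64_atomic_ool_func for E_SImode and an ACQUIRE model
   indexes str[2][1]; with aarch64_ool_swp_names that selects
   "__aarch64_swp4_acq".  */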
24750 /* Expand a compare and swap pattern. */
24752 void
24753 aarch64_expand_compare_and_swap (rtx operands[])
24755 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
24756 machine_mode mode, r_mode;
24758 bval = operands[0];
24759 rval = operands[1];
24760 mem = operands[2];
24761 oldval = operands[3];
24762 newval = operands[4];
24763 is_weak = operands[5];
24764 mod_s = operands[6];
24765 mod_f = operands[7];
24766 mode = GET_MODE (mem);
24768 /* Normally the succ memory model must be stronger than fail, but in the
24769 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24770 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24771 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
24772 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
24773 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
24775 r_mode = mode;
24776 if (mode == QImode || mode == HImode)
24778 r_mode = SImode;
24779 rval = gen_reg_rtx (r_mode);
24782 if (TARGET_LSE)
24784 /* The CAS insn requires oldval and rval overlap, but we need to
24785 have a copy of oldval saved across the operation to tell if
24786 the operation is successful. */
24787 if (reg_overlap_mentioned_p (rval, oldval))
24788 rval = copy_to_mode_reg (r_mode, oldval);
24789 else
24790 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
24791 if (mode == TImode)
24792 newval = force_reg (mode, newval);
24794 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
24795 newval, mod_s));
24796 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24798 else if (TARGET_OUTLINE_ATOMICS)
24800 /* Oldval must satisfy compare afterward. */
24801 if (!aarch64_plus_operand (oldval, mode))
24802 oldval = force_reg (mode, oldval);
24803 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
24804 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
24805 oldval, mode, newval, mode,
24806 XEXP (mem, 0), Pmode);
24807 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24809 else
24811 /* The oldval predicate varies by mode. Test it and force to reg. */
24812 insn_code code = code_for_aarch64_compare_and_swap (mode);
24813 if (!insn_data[code].operand[2].predicate (oldval, mode))
24814 oldval = force_reg (mode, oldval);
24816 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
24817 is_weak, mod_s, mod_f));
24818 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
24821 if (r_mode != mode)
24822 rval = gen_lowpart (mode, rval);
24823 emit_move_insn (operands[1], rval);
24825 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
24826 emit_insn (gen_rtx_SET (bval, x));
24829 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
24830 sequence implementing an atomic operation. */
24832 static void
24833 aarch64_emit_post_barrier (enum memmodel model)
24835 const enum memmodel base_model = memmodel_base (model);
24837 if (is_mm_sync (model)
24838 && (base_model == MEMMODEL_ACQUIRE
24839 || base_model == MEMMODEL_ACQ_REL
24840 || base_model == MEMMODEL_SEQ_CST))
24842 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
24846 /* Split a compare and swap pattern. */
24848 void
24849 aarch64_split_compare_and_swap (rtx operands[])
24851 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24852 gcc_assert (epilogue_completed);
24854 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
24855 machine_mode mode;
24856 bool is_weak;
24857 rtx_code_label *label1, *label2;
24858 enum memmodel model;
24860 rval = operands[0];
24861 mem = operands[1];
24862 oldval = operands[2];
24863 newval = operands[3];
24864 model_rtx = operands[5];
24865 scratch = operands[7];
24866 mode = GET_MODE (mem);
24867 model = memmodel_from_int (INTVAL (model_rtx));
24868 is_weak = operands[4] != const0_rtx && mode != TImode;
24870 /* When OLDVAL is zero and we want the strong version we can emit a tighter
24871 loop:
24872 .label1:
24873 LD[A]XR rval, [mem]
24874 CBNZ rval, .label2
24875 ST[L]XR scratch, newval, [mem]
24876 CBNZ scratch, .label1
24877 .label2:
24878 CMP rval, 0. */
24879 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
24880 oldval == const0_rtx && mode != TImode);
24882 label1 = NULL;
24883 if (!is_weak)
24885 label1 = gen_label_rtx ();
24886 emit_label (label1);
24888 label2 = gen_label_rtx ();
24890 /* The initial load can be relaxed for a __sync operation since a final
24891 barrier will be emitted to stop code hoisting. */
24892 if (is_mm_sync (model))
24893 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
24894 else
24895 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
24897 if (strong_zero_p)
24898 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
24899 else
24901 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24902 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
24904 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24905 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
24906 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24908 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
24910 if (!is_weak)
24912 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
24913 aarch64_emit_unlikely_jump (x);
24915 else
24916 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24918 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
24919 store the returned value and loop if the STLXP fails. */
24920 if (mode == TImode)
24922 rtx_code_label *label3 = gen_label_rtx ();
24923 emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
24924 emit_barrier ();
24926 emit_label (label2);
24927 aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
24929 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
24930 aarch64_emit_unlikely_jump (x);
24932 label2 = label3;
24935 emit_label (label2);
24937 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
24938 to set the condition flags. If this is not used, it will be removed by
24939 later passes. */
24940 if (strong_zero_p)
24941 aarch64_gen_compare_reg (NE, rval, const0_rtx);
24943 /* Emit any final barrier needed for a __sync operation. */
24944 if (is_mm_sync (model))
24945 aarch64_emit_post_barrier (model);
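/* Editorial illustration (not part of the original source): with LSE
   atomics unavailable, a strong compare-and-swap such as

     __atomic_compare_exchange_n (&x, &expected, desired, false,
				  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);

   on a 32-bit X reaches this splitter and is expected to become an
   LDAXR / CMP / B.NE / STLXR / CBNZ loop, analogous to the strong-zero
   sequence sketched in the comment above but with a full comparison
   against EXPECTED. */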
24948 /* Split an atomic operation. */
24950 void
24951 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
24952 rtx value, rtx model_rtx, rtx cond)
24954 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24955 gcc_assert (epilogue_completed);
24957 machine_mode mode = GET_MODE (mem);
24958 machine_mode wmode = (mode == DImode ? DImode : SImode);
24959 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
24960 const bool is_sync = is_mm_sync (model);
24961 rtx_code_label *label;
24962 rtx x;
24964 /* Split the atomic operation into a sequence. */
24965 label = gen_label_rtx ();
24966 emit_label (label);
24968 if (new_out)
24969 new_out = gen_lowpart (wmode, new_out);
24970 if (old_out)
24971 old_out = gen_lowpart (wmode, old_out);
24972 else
24973 old_out = new_out;
24974 value = simplify_gen_subreg (wmode, value, mode, 0);
24976 /* The initial load can be relaxed for a __sync operation since a final
24977 barrier will be emitted to stop code hoisting. */
24978 if (is_sync)
24979 aarch64_emit_load_exclusive (mode, old_out, mem,
24980 GEN_INT (MEMMODEL_RELAXED));
24981 else
24982 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
24984 switch (code)
24986 case SET:
24987 new_out = value;
24988 break;
24990 case NOT:
24991 x = gen_rtx_AND (wmode, old_out, value);
24992 emit_insn (gen_rtx_SET (new_out, x));
24993 x = gen_rtx_NOT (wmode, new_out);
24994 emit_insn (gen_rtx_SET (new_out, x));
24995 break;
24997 case MINUS:
24998 if (CONST_INT_P (value))
25000 value = GEN_INT (-UINTVAL (value));
25001 code = PLUS;
25003 /* Fall through. */
25005 default:
25006 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
25007 emit_insn (gen_rtx_SET (new_out, x));
25008 break;
25011 aarch64_emit_store_exclusive (mode, cond, mem,
25012 gen_lowpart (mode, new_out), model_rtx);
25014 x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
25015 aarch64_emit_unlikely_jump (x);
25017 /* Emit any final barrier needed for a __sync operation. */
25018 if (is_sync)
25019 aarch64_emit_post_barrier (model);
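/* Editorial illustration (not part of the original source): without LSE
   atomics, a fetch-and-add such as

     __atomic_fetch_add (&counter, 1, __ATOMIC_RELAXED);

   on a 32-bit COUNTER is split by the code above into a loop of roughly

     .L1: ldxr  w0, [x1]
	  add   w2, w0, #1
	  stxr  w3, w2, [x1]
	  cbnz  w3, .L1

   with acquire/release forms of the exclusives chosen from MODEL_RTX. */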
25022 static void
25023 aarch64_init_libfuncs (void)
25025 /* Half-precision float operations. The compiler handles all operations
25026 with NULL libfuncs by converting to SFmode. */
25028 /* Conversions. */
25029 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
25030 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
25032 /* Arithmetic. */
25033 set_optab_libfunc (add_optab, HFmode, NULL);
25034 set_optab_libfunc (sdiv_optab, HFmode, NULL);
25035 set_optab_libfunc (smul_optab, HFmode, NULL);
25036 set_optab_libfunc (neg_optab, HFmode, NULL);
25037 set_optab_libfunc (sub_optab, HFmode, NULL);
25039 /* Comparisons. */
25040 set_optab_libfunc (eq_optab, HFmode, NULL);
25041 set_optab_libfunc (ne_optab, HFmode, NULL);
25042 set_optab_libfunc (lt_optab, HFmode, NULL);
25043 set_optab_libfunc (le_optab, HFmode, NULL);
25044 set_optab_libfunc (ge_optab, HFmode, NULL);
25045 set_optab_libfunc (gt_optab, HFmode, NULL);
25046 set_optab_libfunc (unord_optab, HFmode, NULL);
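/* Editorial illustration (not part of the original source): because the
   arithmetic and comparison optabs above are registered as NULL, an
   expression such as

     __fp16 f (__fp16 a, __fp16 b) { return a + b; }

   is expected (without native FP16 arithmetic) to be lowered to a call to
   __gnu_h2f_ieee on each operand, a single-precision add, and a call to
   __gnu_f2h_ieee on the result. */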
25049 /* Target hook for c_mode_for_suffix. */
25050 static machine_mode
25051 aarch64_c_mode_for_suffix (char suffix)
25053 if (suffix == 'q')
25054 return TFmode;
25056 return VOIDmode;
25059 /* We can only represent floating point constants which will fit in
25060 "quarter-precision" values. These values are characterised by
25061 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
25064 (-1)^s * (n/16) * 2^r
25066 Where:
25067 's' is the sign bit.
25068 'n' is an integer in the range 16 <= n <= 31.
25069 'r' is an integer in the range -3 <= r <= 4. */
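/* Editorial worked example (not part of the original source): 1.25 is
   representable as (-1)^0 * (20/16) * 2^0 (s = 0, n = 20, r = 0), and 0.5
   as (-1)^0 * (16/16) * 2^-1 (n = 16, r = -1).  The largest representable
   magnitude is 31/16 * 2^4 = 31.0 and the smallest non-zero magnitude is
   16/16 * 2^-3 = 0.125. */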
25071 /* Return true iff X can be represented by a quarter-precision
25072 floating point immediate operand. Note, we cannot represent 0.0. */
25073 bool
25074 aarch64_float_const_representable_p (rtx x)
25076 /* This represents our current view of how many bits
25077 make up the mantissa. */
25078 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
25079 int exponent;
25080 unsigned HOST_WIDE_INT mantissa, mask;
25081 REAL_VALUE_TYPE r, m;
25082 bool fail;
25084 x = unwrap_const_vec_duplicate (x);
25085 if (!CONST_DOUBLE_P (x))
25086 return false;
25088 if (GET_MODE (x) == VOIDmode
25089 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
25090 return false;
25092 r = *CONST_DOUBLE_REAL_VALUE (x);
25094 /* We cannot represent infinities, NaNs or +/-zero. We won't
25095 know if we have +zero until we analyse the mantissa, but we
25096 can reject the other invalid values. */
25097 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
25098 || REAL_VALUE_MINUS_ZERO (r))
25099 return false;
25101 /* For BFmode, only handle 0.0. */
25102 if (GET_MODE (x) == BFmode)
25103 return real_iszero (&r, false);
25105 /* Extract exponent. */
25106 r = real_value_abs (&r);
25107 exponent = REAL_EXP (&r);
25109 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
25110 highest (sign) bit, with a fixed binary point at bit point_pos.
25111 W holds the low part of the mantissa in its first element and the high part in its second.
25112 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
25113 bits for the mantissa, this can fail (low bits will be lost). */
25114 real_ldexp (&m, &r, point_pos - exponent);
25115 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
25117 /* If the low part of the mantissa has bits set we cannot represent
25118 the value. */
25119 if (w.ulow () != 0)
25120 return false;
25121 /* We have rejected the lower HOST_WIDE_INT, so update our
25122 understanding of how many bits lie in the mantissa and
25123 look only at the high HOST_WIDE_INT. */
25124 mantissa = w.elt (1);
25125 point_pos -= HOST_BITS_PER_WIDE_INT;
25127 /* We can only represent values with a mantissa of the form 1.xxxx. */
25128 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
25129 if ((mantissa & mask) != 0)
25130 return false;
25132 /* Having filtered unrepresentable values, we may now remove all
25133 but the highest 5 bits. */
25134 mantissa >>= point_pos - 5;
25136 /* We cannot represent the value 0.0, so reject it. This is handled
25137 elsewhere. */
25138 if (mantissa == 0)
25139 return false;
25141 /* Then, as bit 4 is always set, we can mask it off, leaving
25142 the mantissa in the range [0, 15]. */
25143 mantissa &= ~(1 << 4);
25144 gcc_assert (mantissa <= 15);
25146 /* GCC internally does not use IEEE754-like encoding (where normalized
25147 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
25148 Our mantissa values are shifted 4 places to the left relative to
25149 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
25150 by 5 places to correct for GCC's representation. */
25151 exponent = 5 - exponent;
25153 return (exponent >= 0 && exponent <= 7);
25156 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
25157 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
25158 output MOVI/MVNI, ORR or BIC immediate. */
25159 char*
25160 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
25161 enum simd_immediate_check which)
25163 bool is_valid;
25164 static char templ[40];
25165 const char *mnemonic;
25166 const char *shift_op;
25167 unsigned int lane_count = 0;
25168 char element_char;
25170 struct simd_immediate_info info;
25172 /* This will return true to show const_vector is legal for use as either
25173 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
25174 It will also update INFO to show how the immediate should be generated.
25175 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
25176 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
25177 gcc_assert (is_valid);
25179 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25180 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
25182 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25184 gcc_assert (info.insn == simd_immediate_info::MOV
25185 && info.u.mov.shift == 0);
25186 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25187 move immediate path. */
25188 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25189 info.u.mov.value = GEN_INT (0);
25190 else
25192 const unsigned int buf_size = 20;
25193 char float_buf[buf_size] = {'\0'};
25194 real_to_decimal_for_mode (float_buf,
25195 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25196 buf_size, buf_size, 1, info.elt_mode);
25198 if (lane_count == 1)
25199 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
25200 else
25201 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
25202 lane_count, element_char, float_buf);
25203 return templ;
25207 gcc_assert (CONST_INT_P (info.u.mov.value));
25209 if (which == AARCH64_CHECK_MOV)
25211 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
25212 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
25213 ? "msl" : "lsl");
25214 if (lane_count == 1)
25215 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
25216 mnemonic, UINTVAL (info.u.mov.value));
25217 else if (info.u.mov.shift)
25218 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25219 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
25220 element_char, UINTVAL (info.u.mov.value), shift_op,
25221 info.u.mov.shift);
25222 else
25223 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25224 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
25225 element_char, UINTVAL (info.u.mov.value));
25227 else
25229 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25230 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
25231 if (info.u.mov.shift)
25232 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25233 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
25234 element_char, UINTVAL (info.u.mov.value), "lsl",
25235 info.u.mov.shift);
25236 else
25237 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25238 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
25239 element_char, UINTVAL (info.u.mov.value));
25241 return templ;
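/* Editorial illustration (not part of the original source): for a V16QI
   constant with every byte equal to 0xab and WHICH == AARCH64_CHECK_MOV,
   the routine above is expected to return a template along the lines of
   "movi\t%0.16b, 0xab"; a vector of single-precision 1.0 values would
   instead take the FP path and produce "fmov\t%0.4s, 1.0e+0"-style
   output. */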
25244 char*
25245 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
25248 /* If a floating point number was passed and we desire to use it in an
25249 integer mode, do the conversion to integer. */
25250 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
25252 unsigned HOST_WIDE_INT ival;
25253 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
25254 gcc_unreachable ();
25255 immediate = gen_int_mode (ival, mode);
25258 machine_mode vmode;
25259 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we use
25260 a 128-bit vector mode. */
25261 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
25263 vmode = aarch64_simd_container_mode (mode, width);
25264 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
25265 return aarch64_output_simd_mov_immediate (v_op, width);
25268 /* Return the output string to use for moving immediate CONST_VECTOR
25269 into an SVE register. */
25271 char *
25272 aarch64_output_sve_mov_immediate (rtx const_vector)
25274 static char templ[40];
25275 struct simd_immediate_info info;
25276 char element_char;
25278 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
25279 gcc_assert (is_valid);
25281 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25283 machine_mode vec_mode = GET_MODE (const_vector);
25284 if (aarch64_sve_pred_mode_p (vec_mode))
25286 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
25287 if (info.insn == simd_immediate_info::MOV)
25289 gcc_assert (info.u.mov.value == const0_rtx);
25290 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
25292 else
25294 gcc_assert (info.insn == simd_immediate_info::PTRUE);
25295 unsigned int total_bytes;
25296 if (info.u.pattern == AARCH64_SV_ALL
25297 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
25298 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
25299 total_bytes / GET_MODE_SIZE (info.elt_mode));
25300 else
25301 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
25302 svpattern_token (info.u.pattern));
25304 return buf;
25307 if (info.insn == simd_immediate_info::INDEX)
25309 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
25310 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25311 element_char, INTVAL (info.u.index.base),
25312 INTVAL (info.u.index.step));
25313 return templ;
25316 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25318 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25319 info.u.mov.value = GEN_INT (0);
25320 else
25322 const int buf_size = 20;
25323 char float_buf[buf_size] = {};
25324 real_to_decimal_for_mode (float_buf,
25325 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25326 buf_size, buf_size, 1, info.elt_mode);
25328 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
25329 element_char, float_buf);
25330 return templ;
25334 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25335 element_char, INTVAL (info.u.mov.value));
25336 return templ;
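/* Editorial illustration (not part of the original source): a VNx4SI
   constant with every element equal to 3 is expected to come back from the
   routine above as "mov\t%0.s, #3", while the stepped constant
   { 0, 1, 2, ... } would be recognised as an INDEX immediate and produce
   "index\t%0.s, #0, #1". */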
25339 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25340 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25341 pattern. */
25343 char *
25344 aarch64_output_sve_ptrues (rtx const_unspec)
25346 static char templ[40];
25348 struct simd_immediate_info info;
25349 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
25350 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
25352 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25353 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
25354 svpattern_token (info.u.pattern));
25355 return templ;
25358 /* Split operands into moves from op[1] + op[2] into op[0]. */
25360 void
25361 aarch64_split_combinev16qi (rtx operands[3])
25363 machine_mode halfmode = GET_MODE (operands[1]);
25365 gcc_assert (halfmode == V16QImode);
25367 rtx destlo = simplify_gen_subreg (halfmode, operands[0],
25368 GET_MODE (operands[0]), 0);
25369 rtx desthi = simplify_gen_subreg (halfmode, operands[0],
25370 GET_MODE (operands[0]),
25371 GET_MODE_SIZE (halfmode));
25373 bool skiplo = rtx_equal_p (destlo, operands[1]);
25374 bool skiphi = rtx_equal_p (desthi, operands[2]);
25376 if (skiplo && skiphi)
25378 /* No-op move. Can't split to nothing; emit something. */
25379 emit_note (NOTE_INSN_DELETED);
25380 return;
25383 /* Special case of reversed high/low parts. */
25384 if (reg_overlap_mentioned_p (operands[2], destlo)
25385 && reg_overlap_mentioned_p (operands[1], desthi))
25387 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25388 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
25389 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25391 else if (!reg_overlap_mentioned_p (operands[2], destlo))
25393 /* Try to avoid unnecessary moves if part of the result
25394 is in the right place already. */
25395 if (!skiplo)
25396 emit_move_insn (destlo, operands[1]);
25397 if (!skiphi)
25398 emit_move_insn (desthi, operands[2]);
25400 else
25402 if (!skiphi)
25403 emit_move_insn (desthi, operands[2]);
25404 if (!skiplo)
25405 emit_move_insn (destlo, operands[1]);
25409 /* vec_perm support. */
25411 struct expand_vec_perm_d
25413 rtx target, op0, op1;
25414 vec_perm_indices perm;
25415 machine_mode vmode;
25416 machine_mode op_mode;
25417 unsigned int vec_flags;
25418 unsigned int op_vec_flags;
25419 bool one_vector_p;
25420 bool zero_op0_p, zero_op1_p;
25421 bool testing_p;
25424 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25426 /* Generate a variable permutation. */
25428 static void
25429 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
25431 machine_mode vmode = GET_MODE (target);
25432 bool one_vector_p = rtx_equal_p (op0, op1);
25434 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
25435 gcc_checking_assert (GET_MODE (op0) == vmode);
25436 gcc_checking_assert (GET_MODE (op1) == vmode);
25437 gcc_checking_assert (GET_MODE (sel) == vmode);
25438 gcc_checking_assert (TARGET_SIMD);
25440 if (one_vector_p)
25442 if (vmode == V8QImode)
25444 /* Expand the argument to a V16QI mode by duplicating it. */
25445 rtx pair = gen_reg_rtx (V16QImode);
25446 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
25447 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25449 else
25451 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
25454 else
25456 rtx pair;
25458 if (vmode == V8QImode)
25460 pair = gen_reg_rtx (V16QImode);
25461 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
25462 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25464 else
25466 pair = gen_reg_rtx (V2x16QImode);
25467 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
25468 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
25473 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25474 NELT is the number of elements in the vector. */
25476 void
25477 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
25478 unsigned int nelt)
25480 machine_mode vmode = GET_MODE (target);
25481 bool one_vector_p = rtx_equal_p (op0, op1);
25482 rtx mask;
25484 /* The TBL instruction does not use a modulo index, so we must take care
25485 of that ourselves. */
25486 mask = aarch64_simd_gen_const_vector_dup (vmode,
25487 one_vector_p ? nelt - 1 : 2 * nelt - 1);
25488 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
25490 /* For big-endian, we also need to reverse the index within the vector
25491 (but not which vector). */
25492 if (BYTES_BIG_ENDIAN)
25494 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25495 if (!one_vector_p)
25496 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
25497 sel = expand_simple_binop (vmode, XOR, sel, mask,
25498 NULL, 0, OPTAB_LIB_WIDEN);
25500 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
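/* Editorial worked example (not part of the original source): for a
   two-input V8QI permute (NELT == 8) the selector is ANDed with 15, so an
   out-of-range index such as 19 wraps to 3 and (before any big-endian
   index adjustment) selects element 3 of the first input, matching the
   vec_perm wrapping rule that TBL itself does not provide. */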
25503 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
25505 static void
25506 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
25508 emit_insn (gen_rtx_SET (target,
25509 gen_rtx_UNSPEC (GET_MODE (target),
25510 gen_rtvec (2, op0, op1), code)));
25513 /* Expand an SVE vec_perm with the given operands. */
25515 void
25516 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
25518 machine_mode data_mode = GET_MODE (target);
25519 machine_mode sel_mode = GET_MODE (sel);
25520 /* Enforced by the pattern condition. */
25521 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
25523 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25524 size of the two value vectors, i.e. the upper bits of the indices
25525 are effectively ignored. SVE TBL instead produces 0 for any
25526 out-of-range indices, so we need to modulo all the vec_perm indices
25527 to ensure they are all in range. */
25528 rtx sel_reg = force_reg (sel_mode, sel);
25530 /* Check if the sel only references the first values vector. */
25531 if (CONST_VECTOR_P (sel)
25532 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
25534 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
25535 return;
25538 /* Check if the two values vectors are the same. */
25539 if (rtx_equal_p (op0, op1))
25541 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
25542 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25543 NULL, 0, OPTAB_DIRECT);
25544 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
25545 return;
25548 /* Run TBL on each value vector and combine the results. */
25550 rtx res0 = gen_reg_rtx (data_mode);
25551 rtx res1 = gen_reg_rtx (data_mode);
25552 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
25553 if (!CONST_VECTOR_P (sel)
25554 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
25556 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
25557 2 * nunits - 1);
25558 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25559 NULL, 0, OPTAB_DIRECT);
25561 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
25562 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
25563 NULL, 0, OPTAB_DIRECT);
25564 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
25565 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
25566 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
25567 else
25568 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
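/* Editorial worked example (not part of the original source): with
   NUNITS == 4 and SEL == { 1, 4, 6, 3 }, the first TBL applied to OP0
   yields { op0[1], 0, 0, op0[3] } because indices 4 and 6 are out of
   range, the second TBL uses SEL - 4 == { -3, 0, 2, -1 } and yields
   { 0, op1[0], op1[2], 0 }, and the final OR (or UNSPEC_IORF) merges the
   two partial results. */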
25571 /* Recognize patterns suitable for the TRN instructions. */
25572 static bool
25573 aarch64_evpc_trn (struct expand_vec_perm_d *d)
25575 HOST_WIDE_INT odd;
25576 poly_uint64 nelt = d->perm.length ();
25577 rtx out, in0, in1;
25578 machine_mode vmode = d->vmode;
25580 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25581 return false;
25583 /* Note that these are little-endian tests.
25584 We correct for big-endian later. */
25585 if (!d->perm[0].is_constant (&odd)
25586 || (odd != 0 && odd != 1)
25587 || !d->perm.series_p (0, 2, odd, 2)
25588 || !d->perm.series_p (1, 2, nelt + odd, 2))
25589 return false;
25591 /* Success! */
25592 if (d->testing_p)
25593 return true;
25595 in0 = d->op0;
25596 in1 = d->op1;
25597 /* We don't need a big-endian lane correction for SVE; see the comment
25598 at the head of aarch64-sve.md for details. */
25599 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25601 std::swap (in0, in1);
25602 odd = !odd;
25604 out = d->target;
25606 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25607 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
25608 return true;
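/* Editorial worked example (not part of the original source): on
   little-endian, the V4SI permute { 0, 4, 2, 6 } satisfies the series
   checks above with ODD == 0 and is emitted as TRN1, while { 1, 5, 3, 7 }
   selects TRN2. */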
25611 /* Try to re-encode the PERM constant so it combines odd and even elements.
25612 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25613 We retry with this new constant with the full suite of patterns. */
25614 static bool
25615 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
25617 expand_vec_perm_d newd;
25619 if (d->vec_flags != VEC_ADVSIMD)
25620 return false;
25622 /* Get the new mode. Always twice the size of the inner
25623 and half the elements. */
25624 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
25625 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
25626 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
25627 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
25629 if (new_mode == word_mode)
25630 return false;
25632 vec_perm_indices newpermindices;
25634 if (!newpermindices.new_shrunk_vector (d->perm, 2))
25635 return false;
25637 newd.vmode = new_mode;
25638 newd.vec_flags = VEC_ADVSIMD;
25639 newd.op_mode = newd.vmode;
25640 newd.op_vec_flags = newd.vec_flags;
25641 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
25642 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
25643 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
25644 newd.testing_p = d->testing_p;
25645 newd.one_vector_p = d->one_vector_p;
25647 newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
25648 newpermindices.nelts_per_input ());
25649 return aarch64_expand_vec_perm_const_1 (&newd);
25652 /* Recognize patterns suitable for the UZP instructions. */
25653 static bool
25654 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
25656 HOST_WIDE_INT odd;
25657 rtx out, in0, in1;
25658 machine_mode vmode = d->vmode;
25660 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25661 return false;
25663 /* Note that these are little-endian tests.
25664 We correct for big-endian later. */
25665 if (!d->perm[0].is_constant (&odd)
25666 || (odd != 0 && odd != 1)
25667 || !d->perm.series_p (0, 1, odd, 2))
25668 return false;
25670 /* Success! */
25671 if (d->testing_p)
25672 return true;
25674 in0 = d->op0;
25675 in1 = d->op1;
25676 /* We don't need a big-endian lane correction for SVE; see the comment
25677 at the head of aarch64-sve.md for details. */
25678 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25680 std::swap (in0, in1);
25681 odd = !odd;
25683 out = d->target;
25685 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25686 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
25687 return true;
25690 /* Recognize patterns suitable for the ZIP instructions. */
25691 static bool
25692 aarch64_evpc_zip (struct expand_vec_perm_d *d)
25694 unsigned int high;
25695 poly_uint64 nelt = d->perm.length ();
25696 rtx out, in0, in1;
25697 machine_mode vmode = d->vmode;
25699 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25700 return false;
25702 /* Note that these are little-endian tests.
25703 We correct for big-endian later. */
25704 poly_uint64 first = d->perm[0];
25705 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
25706 || !d->perm.series_p (0, 2, first, 1)
25707 || !d->perm.series_p (1, 2, first + nelt, 1))
25708 return false;
25709 high = maybe_ne (first, 0U);
25711 /* Success! */
25712 if (d->testing_p)
25713 return true;
25715 in0 = d->op0;
25716 in1 = d->op1;
25717 /* We don't need a big-endian lane correction for SVE; see the comment
25718 at the head of aarch64-sve.md for details. */
25719 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25721 std::swap (in0, in1);
25722 high = !high;
25724 out = d->target;
25726 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25727 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
25728 return true;
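/* Editorial worked example (not part of the original source): on
   little-endian, the V4SI permute { 0, 4, 1, 5 } interleaves the low
   halves of the two inputs and is emitted as ZIP1; { 2, 6, 3, 7 } starts
   at NELT / 2 and selects ZIP2. */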
25731 /* Recognize patterns for the EXT insn. */
25733 static bool
25734 aarch64_evpc_ext (struct expand_vec_perm_d *d)
25736 HOST_WIDE_INT location;
25737 rtx offset;
25739 /* The first element always refers to the first vector.
25740 Check if the extracted indices are increasing by one. */
25741 if ((d->vec_flags & VEC_SVE_PRED)
25742 || !d->perm[0].is_constant (&location)
25743 || !d->perm.series_p (0, 1, location, 1))
25744 return false;
25746 /* Success! */
25747 if (d->testing_p)
25748 return true;
25750 /* The case where (location == 0) is a no-op for both big- and little-endian,
25751 and is removed by the mid-end at optimization levels -O1 and higher.
25753 We don't need a big-endian lane correction for SVE; see the comment
25754 at the head of aarch64-sve.md for details. */
25755 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
25757 /* After setup, we want the high elements of the first vector (stored
25758 at the LSB end of the register), and the low elements of the second
25759 vector (stored at the MSB end of the register). So swap. */
25760 std::swap (d->op0, d->op1);
25761 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25762 to_constant () is safe since this is restricted to Advanced SIMD
25763 vectors. */
25764 location = d->perm.length ().to_constant () - location;
25767 offset = GEN_INT (location);
25768 emit_set_insn (d->target,
25769 gen_rtx_UNSPEC (d->vmode,
25770 gen_rtvec (3, d->op0, d->op1, offset),
25771 UNSPEC_EXT));
25772 return true;
25775 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25776 within each 64-bit, 32-bit or 16-bit granule. */
25778 static bool
25779 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
25781 HOST_WIDE_INT diff;
25782 unsigned int i, size, unspec;
25783 machine_mode pred_mode;
25785 if ((d->vec_flags & VEC_SVE_PRED)
25786 || !d->one_vector_p
25787 || !d->perm[0].is_constant (&diff)
25788 || !diff)
25789 return false;
25791 if (d->vec_flags & VEC_SVE_DATA)
25792 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
25793 else
25794 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
25795 if (size == 64)
25797 unspec = UNSPEC_REV64;
25798 pred_mode = VNx2BImode;
25800 else if (size == 32)
25802 unspec = UNSPEC_REV32;
25803 pred_mode = VNx4BImode;
25805 else if (size == 16)
25807 unspec = UNSPEC_REV16;
25808 pred_mode = VNx8BImode;
25810 else
25811 return false;
25813 unsigned int step = diff + 1;
25814 for (i = 0; i < step; ++i)
25815 if (!d->perm.series_p (i, step, diff - i, step))
25816 return false;
25818 /* Success! */
25819 if (d->testing_p)
25820 return true;
25822 if (d->vec_flags & VEC_SVE_DATA)
25824 rtx pred = aarch64_ptrue_reg (pred_mode);
25825 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
25826 d->target, pred, d->op0));
25827 return true;
25829 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
25830 emit_set_insn (d->target, src);
25831 return true;
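/* Editorial worked example (not part of the original source): the
   single-input V4SI permute { 1, 0, 3, 2 } has DIFF == 1, giving a 64-bit
   granule, and so is emitted as REV64; the V8HI permute
   { 3, 2, 1, 0, 7, 6, 5, 4 } has DIFF == 3 and also maps to REV64. */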
25834 /* Recognize patterns for the REV insn, which reverses elements within
25835 a full vector. */
25837 static bool
25838 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
25840 poly_uint64 nelt = d->perm.length ();
25842 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
25843 return false;
25845 if (!d->perm.series_p (0, 1, nelt - 1, -1))
25846 return false;
25848 /* Success! */
25849 if (d->testing_p)
25850 return true;
25852 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
25853 emit_set_insn (d->target, src);
25854 return true;
25857 static bool
25858 aarch64_evpc_dup (struct expand_vec_perm_d *d)
25860 rtx out = d->target;
25861 rtx in0;
25862 HOST_WIDE_INT elt;
25863 machine_mode vmode = d->vmode;
25864 rtx lane;
25866 if ((d->vec_flags & VEC_SVE_PRED)
25867 || d->perm.encoding ().encoded_nelts () != 1
25868 || !d->perm[0].is_constant (&elt))
25869 return false;
25871 if ((d->vec_flags & VEC_SVE_DATA)
25872 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
25873 return false;
25875 /* Success! */
25876 if (d->testing_p)
25877 return true;
25879 /* The generic preparation in aarch64_expand_vec_perm_const_1
25880 swaps the operand order and the permute indices if it finds
25881 d->perm[0] to be in the second operand. Thus, we can always
25882 use d->op0 and need not do any extra arithmetic to get the
25883 correct lane number. */
25884 in0 = d->op0;
25885 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
25887 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
25888 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
25889 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
25890 return true;
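/* Editorial worked example (not part of the original source): a V4SI
   permute whose encoding is the single repeated element { 2, 2, 2, 2 }
   broadcasts lane 2 of the first input and is emitted as a vec_duplicate
   of that lane (a "dup v0.4s, v1.s[2]" on Advanced SIMD). */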
25893 static bool
25894 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
25896 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
25897 machine_mode vmode = d->vmode;
25899 /* Make sure that the indices are constant. */
25900 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
25901 for (unsigned int i = 0; i < encoded_nelts; ++i)
25902 if (!d->perm[i].is_constant ())
25903 return false;
25905 if (d->testing_p)
25906 return true;
25908 /* Generic code will try constant permutation twice: once with the
25909 original mode and again with the elements lowered to QImode.
25910 So wait and don't do the selector expansion ourselves. */
25911 if (vmode != V8QImode && vmode != V16QImode)
25912 return false;
25914 /* to_constant is safe since this routine is specific to Advanced SIMD
25915 vectors. */
25916 unsigned int nelt = d->perm.length ().to_constant ();
25918 /* If one register is the constant zero vector then we only need
25919 a one-register TBL, and we map any accesses to the zero vector to -1. We can't
25920 do this earlier since vec_perm_indices clamps elements to within range, so
25921 we can only do it during codegen. */
25922 if (d->zero_op0_p)
25923 d->op0 = d->op1;
25924 else if (d->zero_op1_p)
25925 d->op1 = d->op0;
25927 for (unsigned int i = 0; i < nelt; ++i)
25929 auto val = d->perm[i].to_constant ();
25931 /* If we're selecting from a 0 vector, we can just use an out of range
25932 index instead. */
25933 if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
25934 rperm[i] = constm1_rtx;
25935 else
25937 /* If we are remapping a zero register as the first parameter we need
25938 to adjust the indices of the non-zero register. */
25939 if (d->zero_op0_p)
25940 val = val % nelt;
25942 /* If big-endian and two vectors we end up with a weird mixed-endian
25943 mode on NEON. Reverse the index within each word but not the word
25944 itself. to_constant is safe because we checked is_constant
25945 above. */
25946 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
25950 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
25951 sel = force_reg (vmode, sel);
25953 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
25954 return true;
25957 /* Try to implement D using an SVE TBL instruction. */
25959 static bool
25960 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
25962 unsigned HOST_WIDE_INT nelt;
25964 /* Permuting two variable-length vectors could overflow the
25965 index range. */
25966 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
25967 return false;
25969 if (d->testing_p)
25970 return true;
25972 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
25973 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
25974 if (d->one_vector_p)
25975 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
25976 else
25977 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
25978 return true;
25981 /* Try to implement D using SVE dup instruction. */
25983 static bool
25984 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
25986 if (BYTES_BIG_ENDIAN
25987 || !d->one_vector_p
25988 || d->vec_flags != VEC_SVE_DATA
25989 || d->op_vec_flags != VEC_ADVSIMD
25990 || d->perm.encoding ().nelts_per_pattern () != 1
25991 || !known_eq (d->perm.encoding ().npatterns (),
25992 GET_MODE_NUNITS (d->op_mode))
25993 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
25994 return false;
25996 int npatterns = d->perm.encoding ().npatterns ();
25997 for (int i = 0; i < npatterns; i++)
25998 if (!known_eq (d->perm[i], i))
25999 return false;
26001 if (d->testing_p)
26002 return true;
26004 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
26005 return true;
26008 /* Try to implement D using SVE SEL instruction. */
26010 static bool
26011 aarch64_evpc_sel (struct expand_vec_perm_d *d)
26013 machine_mode vmode = d->vmode;
26014 int unit_size = GET_MODE_UNIT_SIZE (vmode);
26016 if (d->vec_flags != VEC_SVE_DATA
26017 || unit_size > 8)
26018 return false;
26020 int n_patterns = d->perm.encoding ().npatterns ();
26021 poly_int64 vec_len = d->perm.length ();
26023 for (int i = 0; i < n_patterns; ++i)
26024 if (!known_eq (d->perm[i], i)
26025 && !known_eq (d->perm[i], vec_len + i))
26026 return false;
26028 for (int i = n_patterns; i < n_patterns * 2; i++)
26029 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
26030 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
26031 return false;
26033 if (d->testing_p)
26034 return true;
26036 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
26038 /* Build a predicate that is true when op0 elements should be used. */
26039 rtx_vector_builder builder (pred_mode, n_patterns, 2);
26040 for (int i = 0; i < n_patterns * 2; i++)
26042 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
26043 : CONST0_RTX (BImode);
26044 builder.quick_push (elem);
26047 rtx const_vec = builder.build ();
26048 rtx pred = force_reg (pred_mode, const_vec);
26049 /* TARGET = PRED ? OP0 : OP1. */
26050 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
26051 return true;
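/* Editorial worked example (not part of the original source): for VNx4SI,
   a permute of the form { 0, N+1, 2, N+3, ... } (even elements from OP0,
   odd elements from OP1, where N is the number of elements) is matched
   above and becomes a predicated SEL whose governing predicate is the
   repeating pattern { 1, 0 }. */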
26054 /* Recognize patterns suitable for the INS instructions. */
26055 static bool
26056 aarch64_evpc_ins (struct expand_vec_perm_d *d)
26058 machine_mode mode = d->vmode;
26059 unsigned HOST_WIDE_INT nelt;
26061 if (d->vec_flags != VEC_ADVSIMD)
26062 return false;
26064 /* to_constant is safe since this routine is specific to Advanced SIMD
26065 vectors. */
26066 nelt = d->perm.length ().to_constant ();
26067 rtx insv = d->op0;
26069 HOST_WIDE_INT idx = -1;
26071 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26073 HOST_WIDE_INT elt;
26074 if (!d->perm[i].is_constant (&elt))
26075 return false;
26076 if (elt == (HOST_WIDE_INT) i)
26077 continue;
26078 if (idx != -1)
26080 idx = -1;
26081 break;
26083 idx = i;
26086 if (idx == -1)
26088 insv = d->op1;
26089 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26091 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
26092 continue;
26093 if (idx != -1)
26094 return false;
26095 idx = i;
26098 if (idx == -1)
26099 return false;
26102 if (d->testing_p)
26103 return true;
26105 gcc_assert (idx != -1);
26107 unsigned extractindex = d->perm[idx].to_constant ();
26108 rtx extractv = d->op0;
26109 if (extractindex >= nelt)
26111 extractv = d->op1;
26112 extractindex -= nelt;
26114 gcc_assert (extractindex < nelt);
26116 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
26117 expand_operand ops[5];
26118 create_output_operand (&ops[0], d->target, mode);
26119 create_input_operand (&ops[1], insv, mode);
26120 create_integer_operand (&ops[2], 1 << idx);
26121 create_input_operand (&ops[3], extractv, mode);
26122 create_integer_operand (&ops[4], extractindex);
26123 expand_insn (icode, 5, ops);
26125 return true;
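/* Editorial worked example (not part of the original source): the V4SI
   permute { 0, 1, 6, 3 } keeps elements 0, 1 and 3 of OP0 and replaces
   element 2 with element 6 - NELT == 2 of OP1, so it is expanded above as
   a single lane-to-lane INS into lane 2 of the result. */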
26128 static bool
26129 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
26131 gcc_assert (d->op_mode != E_VOIDmode);
26133 /* The pattern matching functions above are written to look for a small
26134 number to begin the sequence (0, 1, N/2). If we begin with an index
26135 from the second operand, we can swap the operands. */
26136 poly_int64 nelt = d->perm.length ();
26137 if (known_ge (d->perm[0], nelt))
26139 d->perm.rotate_inputs (1);
26140 std::swap (d->op0, d->op1);
26143 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
26144 || d->vec_flags == VEC_SVE_DATA
26145 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
26146 || d->vec_flags == VEC_SVE_PRED)
26147 && known_gt (nelt, 1))
26149 if (d->vmode == d->op_mode)
26151 if (aarch64_evpc_rev_local (d))
26152 return true;
26153 else if (aarch64_evpc_rev_global (d))
26154 return true;
26155 else if (aarch64_evpc_ext (d))
26156 return true;
26157 else if (aarch64_evpc_dup (d))
26158 return true;
26159 else if (aarch64_evpc_zip (d))
26160 return true;
26161 else if (aarch64_evpc_uzp (d))
26162 return true;
26163 else if (aarch64_evpc_trn (d))
26164 return true;
26165 else if (aarch64_evpc_sel (d))
26166 return true;
26167 else if (aarch64_evpc_ins (d))
26168 return true;
26169 else if (aarch64_evpc_reencode (d))
26170 return true;
26172 if (d->vec_flags == VEC_SVE_DATA)
26173 return aarch64_evpc_sve_tbl (d);
26174 else if (d->vec_flags == VEC_ADVSIMD)
26175 return aarch64_evpc_tbl (d);
26177 else
26179 if (aarch64_evpc_sve_dup (d))
26180 return true;
26183 return false;
26186 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26188 static bool
26189 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
26190 rtx target, rtx op0, rtx op1,
26191 const vec_perm_indices &sel)
26193 struct expand_vec_perm_d d;
26195 /* Check whether the mask can be applied to a single vector. */
26196 if (sel.ninputs () == 1
26197 || (op0 && rtx_equal_p (op0, op1)))
26198 d.one_vector_p = true;
26199 else if (sel.all_from_input_p (0))
26201 d.one_vector_p = true;
26202 op1 = op0;
26204 else if (sel.all_from_input_p (1))
26206 d.one_vector_p = true;
26207 op0 = op1;
26209 else
26210 d.one_vector_p = false;
26212 d.zero_op0_p = op0 == CONST0_RTX (op_mode);
26213 d.zero_op1_p = op1 == CONST0_RTX (op_mode);
26214 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
26215 sel.nelts_per_input ());
26216 d.vmode = vmode;
26217 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
26218 d.op_mode = op_mode;
26219 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
26220 d.target = target;
26221 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
26222 if (op0 == op1)
26223 d.op1 = d.op0;
26224 else
26225 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
26226 d.testing_p = !target;
26228 if (!d.testing_p)
26229 return aarch64_expand_vec_perm_const_1 (&d);
26231 rtx_insn *last = get_last_insn ();
26232 bool ret = aarch64_expand_vec_perm_const_1 (&d);
26233 gcc_assert (last == get_last_insn ());
26235 return ret;
26237 /* Generate a byte permute mask for a register of mode MODE,
26238 which has NUNITS units. */
26240 rtx
26241 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
26243 /* We have to reverse each vector because we don't have
26244 a permuted load that can reverse-load according to ABI rules. */
26245 rtx mask;
26246 rtvec v = rtvec_alloc (16);
26247 unsigned int i, j;
26248 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
26250 gcc_assert (BYTES_BIG_ENDIAN);
26251 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
26253 for (i = 0; i < nunits; i++)
26254 for (j = 0; j < usize; j++)
26255 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
26256 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
26257 return force_reg (V16QImode, mask);
26260 /* Expand an SVE integer comparison using the SVE equivalent of:
26262 (set TARGET (CODE OP0 OP1)). */
26264 void
26265 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
26267 machine_mode pred_mode = GET_MODE (target);
26268 machine_mode data_mode = GET_MODE (op0);
26269 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
26270 op0, op1);
26271 if (!rtx_equal_p (target, res))
26272 emit_move_insn (target, res);
26275 /* Return the UNSPEC_COND_* code for comparison CODE. */
26277 static unsigned int
26278 aarch64_unspec_cond_code (rtx_code code)
26280 switch (code)
26282 case NE:
26283 return UNSPEC_COND_FCMNE;
26284 case EQ:
26285 return UNSPEC_COND_FCMEQ;
26286 case LT:
26287 return UNSPEC_COND_FCMLT;
26288 case GT:
26289 return UNSPEC_COND_FCMGT;
26290 case LE:
26291 return UNSPEC_COND_FCMLE;
26292 case GE:
26293 return UNSPEC_COND_FCMGE;
26294 case UNORDERED:
26295 return UNSPEC_COND_FCMUO;
26296 default:
26297 gcc_unreachable ();
26301 /* Emit:
26303 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26305 where <X> is the operation associated with comparison CODE.
26306 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26308 static void
26309 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
26310 bool known_ptrue_p, rtx op0, rtx op1)
26312 rtx flag = gen_int_mode (known_ptrue_p, SImode);
26313 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
26314 gen_rtvec (4, pred, flag, op0, op1),
26315 aarch64_unspec_cond_code (code));
26316 emit_set_insn (target, unspec);
26319 /* Emit the SVE equivalent of:
26321 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26322 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26323 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26325 where <Xi> is the operation associated with comparison CODEi.
26326 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26328 static void
26329 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
26330 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
26332 machine_mode pred_mode = GET_MODE (pred);
26333 rtx tmp1 = gen_reg_rtx (pred_mode);
26334 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
26335 rtx tmp2 = gen_reg_rtx (pred_mode);
26336 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
26337 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
26340 /* Emit the SVE equivalent of:
26342 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26343 (set TARGET (not TMP))
26345 where <X> is the operation associated with comparison CODE.
26346 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26348 static void
26349 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
26350 bool known_ptrue_p, rtx op0, rtx op1)
26352 machine_mode pred_mode = GET_MODE (pred);
26353 rtx tmp = gen_reg_rtx (pred_mode);
26354 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
26355 aarch64_emit_unop (target, one_cmpl_optab, tmp);
26358 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26360 (set TARGET (CODE OP0 OP1))
26362 If CAN_INVERT_P is true, the caller can also handle inverted results;
26363 return true if the result is in fact inverted. */
26365 bool
26366 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
26367 rtx op0, rtx op1, bool can_invert_p)
26369 machine_mode pred_mode = GET_MODE (target);
26370 machine_mode data_mode = GET_MODE (op0);
26372 rtx ptrue = aarch64_ptrue_reg (pred_mode);
26373 switch (code)
26375 case UNORDERED:
26376 /* UNORDERED has no immediate form. */
26377 op1 = force_reg (data_mode, op1);
26378 /* fall through */
26379 case LT:
26380 case LE:
26381 case GT:
26382 case GE:
26383 case EQ:
26384 case NE:
26386 /* There is native support for the comparison. */
26387 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26388 return false;
26391 case LTGT:
26392 /* This is a trapping operation (LT or GT). */
26393 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
26394 return false;
26396 case UNEQ:
26397 if (!flag_trapping_math)
26399 /* This would trap for signaling NaNs. */
26400 op1 = force_reg (data_mode, op1);
26401 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
26402 ptrue, true, op0, op1);
26403 return false;
26405 /* fall through */
26406 case UNLT:
26407 case UNLE:
26408 case UNGT:
26409 case UNGE:
26410 if (flag_trapping_math)
26412 /* Work out which elements are ordered. */
26413 rtx ordered = gen_reg_rtx (pred_mode);
26414 op1 = force_reg (data_mode, op1);
26415 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
26416 ptrue, true, op0, op1);
26418 /* Test the opposite condition for the ordered elements,
26419 then invert the result. */
26420 if (code == UNEQ)
26421 code = NE;
26422 else
26423 code = reverse_condition_maybe_unordered (code);
26424 if (can_invert_p)
26426 aarch64_emit_sve_fp_cond (target, code,
26427 ordered, false, op0, op1);
26428 return true;
26430 aarch64_emit_sve_invert_fp_cond (target, code,
26431 ordered, false, op0, op1);
26432 return false;
26434 break;
26436 case ORDERED:
26437 /* ORDERED has no immediate form. */
26438 op1 = force_reg (data_mode, op1);
26439 break;
26441 default:
26442 gcc_unreachable ();
26445 /* There is native support for the inverse comparison. */
26446 code = reverse_condition_maybe_unordered (code);
26447 if (can_invert_p)
26449 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26450 return true;
26452 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
26453 return false;
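/* Editorial worked example (not part of the original source): with
   -ftrapping-math, an UNLT comparison is handled by the unordered cases
   above roughly as follows: ORDERED := ~(OP0 unordered OP1) under a PTRUE,
   then OP0 >= OP1 is tested only for the ORDERED elements; the complement
   of that predicate is UNLT (when CAN_INVERT_P, the uncomplemented
   predicate is returned instead and the caller is told the result is
   inverted).  Unordered lanes therefore satisfy UNLT, as required. */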
26456 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26457 of the data being selected and CMP_MODE is the mode of the values being
26458 compared. */
26460 void
26461 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
26462 rtx *ops)
26464 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
26465 rtx pred = gen_reg_rtx (pred_mode);
26466 if (FLOAT_MODE_P (cmp_mode))
26468 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
26469 ops[4], ops[5], true))
26470 std::swap (ops[1], ops[2]);
26472 else
26473 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
26475 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
26476 ops[1] = force_reg (data_mode, ops[1]);
26477 /* The "false" value can only be zero if the "true" value is a constant. */
26478 if (register_operand (ops[1], data_mode)
26479 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
26480 ops[2] = force_reg (data_mode, ops[2]);
26482 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
26483 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
26486 /* Return true if:
26488 (a) MODE1 and MODE2 use the same layout for bytes that are common
26489 to both modes;
26491 (b) subregs involving the two modes behave as the target-independent
26492 subreg rules require; and
26494 (c) there is at least one register that can hold both modes.
26496 Return false otherwise. */
26498 static bool
26499 aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
26501 unsigned int flags1 = aarch64_classify_vector_mode (mode1);
26502 unsigned int flags2 = aarch64_classify_vector_mode (mode2);
26504 bool sve1_p = (flags1 & VEC_ANY_SVE);
26505 bool sve2_p = (flags2 & VEC_ANY_SVE);
26507 bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
26508 bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
26510 bool pred1_p = (flags1 & VEC_SVE_PRED);
26511 bool pred2_p = (flags2 & VEC_SVE_PRED);
26513 bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
26514 | VEC_PARTIAL));
26515 bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
26516 | VEC_PARTIAL));
26518 /* Don't allow changes between predicate modes and other modes.
26519 Only predicate registers can hold predicate modes and only
26520 non-predicate registers can hold non-predicate modes, so any
26521 attempt to mix them would require a round trip through memory. */
26522 if (pred1_p != pred2_p)
26523 return false;
26525 /* The contents of partial SVE modes are distributed evenly across
26526 the register, whereas GCC expects them to be clustered together.
26527 We therefore need to be careful about mode changes involving them. */
26528 if (partial_sve1_p && partial_sve2_p)
26530 /* Reject changes between partial SVE modes that have different
26531 patterns of significant and insignificant bits. */
26532 if ((aarch64_sve_container_bits (mode1)
26533 != aarch64_sve_container_bits (mode2))
26534 || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26535 return false;
26537 else if (partial_sve1_p)
26539 /* The first lane of MODE1 is where GCC expects it, but anything
26540 bigger than that is not. */
26541 if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
26542 return false;
26544 else if (partial_sve2_p)
26546 /* Similarly in reverse. */
26547 if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
26548 return false;
26551 /* Don't allow changes between partial Advanced SIMD structure modes
26552 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26553 are the same size, but the former occupies one Q register while the
26554 latter occupies two D registers. */
26555 if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
26556 && maybe_gt (GET_MODE_SIZE (mode1), 8)
26557 && maybe_gt (GET_MODE_SIZE (mode2), 8))
26558 return false;
26560 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26562 /* Don't allow changes between SVE modes and other modes that might
26563 be bigger than 128 bits. In particular, OImode, CImode and XImode
26564 divide into 128-bit quantities while SVE modes divide into
26565 BITS_PER_SVE_VECTOR quantities. */
26566 if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
26567 return false;
26568 if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
26569 return false;
26572 if (BYTES_BIG_ENDIAN)
26574 /* Don't allow changes between SVE data modes and non-SVE modes.
26575 See the comment at the head of aarch64-sve.md for details. */
26576 if (sve1_p != sve2_p)
26577 return false;
26579 /* Don't allow changes in element size: lane 0 of the new vector
26580 would not then be lane 0 of the old vector. See the comment
26581 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26582 description.
26584 In the worst case, this forces a register to be spilled in
26585 one mode and reloaded in the other, which handles the
26586 endianness correctly. */
26587 if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26588 return false;
26590 return true;
26593 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
26594 to aarch64_modes_compatible_p. However, due to issues with register
26595 allocation it is preferable to avoid tying integer scalar and FP
26596 scalar modes. Executing integer operations in general registers is
26597 better than treating them as scalar vector operations. This reduces
26598 latency and avoids redundant int<->FP moves. So tie modes if they
26599 are either the same class, or one of them is a vector mode. */
26601 static bool
26602 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
26604 if (aarch64_modes_compatible_p (mode1, mode2))
26606 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
26607 return true;
26608 if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
26609 return true;
26611 return false;
26614 /* Return a new RTX holding the result of moving POINTER forward by
26615 AMOUNT bytes. */
26617 static rtx
26618 aarch64_move_pointer (rtx pointer, poly_int64 amount)
26620 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
26622 return adjust_automodify_address (pointer, GET_MODE (pointer),
26623 next, amount);
26626 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
26627 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
26628 rather than memcpy. Return true iff we succeeded. */
26629 bool
26630 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
26632 if (!TARGET_MOPS)
26633 return false;
26635 /* All three registers are changed by the instruction, so each one
26636 must be a fresh pseudo. */
26637 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26638 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
26639 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26640 rtx src_mem = replace_equiv_address (operands[1], src_addr);
26641 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
26642 if (is_memmove)
26643 emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
26644 else
26645 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
26646 return true;
26649 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26650 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26651 if this is a memmove rather than memcpy. Return true if we succeed,
26652 otherwise return false, indicating that a libcall should be emitted. */
26653 bool
26654 aarch64_expand_cpymem (rtx *operands, bool is_memmove)
26656 int mode_bytes;
26657 rtx dst = operands[0];
26658 rtx src = operands[1];
26659 unsigned align = UINTVAL (operands[3]);
26660 rtx base;
26661 machine_mode mode = BLKmode, next_mode;
26663 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26664 if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
26665 return aarch64_expand_cpymem_mops (operands, is_memmove);
26667 unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
26669 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26670 unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
26671 unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
26672 : aarch64_mops_memcpy_size_threshold;
26674 /* Reduce the maximum size with -Os. */
26675 if (optimize_function_for_size_p (cfun))
26676 max_copy_size /= 4;
26678 /* Large copies use MOPS when available or a library call. */
26679 if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
26680 return aarch64_expand_cpymem_mops (operands, is_memmove);
26682 /* Default to 32-byte LDP/STP on large copies; small copies or targets
26683 without SIMD support fall back to 16-byte chunks.
26684 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26685 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26686 whether that would improve performance. */
26687 bool use_qregs = size > 24 && TARGET_SIMD;
26689 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26690 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26692 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
26693 src = adjust_automodify_address (src, VOIDmode, base, 0);
26695 auto_vec<std::pair<rtx, rtx>, 16> ops;
26696 int offset = 0;
26698 while (size > 0)
26700 /* Find the largest mode in which to do the copy without over-reading
26701 or over-writing. */
26702 opt_scalar_int_mode mode_iter;
26703 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26704 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
26705 mode = mode_iter.require ();
26707 gcc_assert (mode != BLKmode);
26709 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
26711 /* Prefer Q-register accesses. */
26712 if (mode_bytes == 16 && use_qregs)
26713 mode = V4SImode;
26715 rtx reg = gen_reg_rtx (mode);
26716 rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
26717 rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
26718 ops.safe_push ({ load, store });
26719 size -= mode_bytes;
26720 offset += mode_bytes;
26722 /* Emit trailing copies using overlapping unaligned accesses
26723 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26724 if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
26726 next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
26727 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26728 gcc_assert (n_bytes <= mode_bytes);
26729 offset -= n_bytes - size;
26730 size = n_bytes;
26734 	  /* Memcpy interleaves loads with stores; memmove emits all loads first.  */
26735 	  int nops = ops.length ();
26736 int inc = is_memmove || nops <= 8 ? nops : 6;
26738 for (int i = 0; i < nops; i += inc)
26740 int m = MIN (nops, i + inc);
26741 /* Emit loads. */
26742 for (int j = i; j < m; j++)
26743 emit_insn (ops[j].first);
26744 /* Emit stores. */
26745 for (int j = i; j < m; j++)
26746 emit_insn (ops[j].second);
26748 return true;
26751 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
26752 as for the setmem pattern. Return true iff we succeed. */
26753 static bool
26754 aarch64_expand_setmem_mops (rtx *operands)
26756 if (!TARGET_MOPS)
26757 return false;
26759 /* The first two registers are changed by the instruction, so both
26760 	     of them must be fresh pseudos.  */
26761 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26762 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26763 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
26764 rtx val = operands[2];
26765 if (val != CONST0_RTX (QImode))
26766 val = force_reg (QImode, val);
26767 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
26768 return true;
26771 /* Expand setmem, as if from a __builtin_memset. Return true if
26772 we succeed, otherwise return false. */
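/* Illustrative example (schematic only; register numbers are arbitrary and
   the actual code generation depends on tuning and the MOPS threshold
   handled below): a constant 35-byte memset of value C is expanded as a
   byte broadcast followed by two 16-byte stores at offsets 0 and 16 and a
   4-byte store at offset 31 that overlaps the second chunk by one byte,
   roughly:

       dup  v0.16b, wC
       str  q0, [x0]
       str  q0, [x0, 16]
       str  s0, [x0, 31]  */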
26774 bool
26775 aarch64_expand_setmem (rtx *operands)
26777 int mode_bytes;
26778 unsigned HOST_WIDE_INT len;
26779 rtx dst = operands[0];
26780 rtx val = operands[2], src;
26781 unsigned align = UINTVAL (operands[3]);
26782 rtx base;
26783 machine_mode mode = BLKmode, next_mode;
26785 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26786 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
26787 || (STRICT_ALIGNMENT && align < 16))
26788 return aarch64_expand_setmem_mops (operands);
26790 /* Set inline limits for memset. MOPS has a separate threshold. */
26791 unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
26792 unsigned mops_threshold = aarch64_mops_memset_size_threshold;
26794 len = UINTVAL (operands[1]);
26796 /* Large memset uses MOPS when available or a library call. */
26797 if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
26798 return aarch64_expand_setmem_mops (operands);
26800 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26801 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26803 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26804 val = expand_vector_broadcast (V16QImode, val);
26805 val = force_reg (V16QImode, val);
26807 int offset = 0;
26808 while (len > 0)
26810 /* Find the largest mode in which to do the copy without
26811 	 over-writing.  */
26812 opt_scalar_int_mode mode_iter;
26813 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26814 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
26815 mode = mode_iter.require ();
26817 gcc_assert (mode != BLKmode);
26819 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
26821 src = val;
26823 /* Prefer Q-register accesses. */
26824 if (mode_bytes == 16)
26825 mode = V16QImode;
26826 else
26827 src = lowpart_subreg (mode, src, GET_MODE (val));
26829 emit_move_insn (adjust_address (dst, mode, offset), src);
26830 len -= mode_bytes;
26831 offset += mode_bytes;
26833 /* Emit trailing writes using overlapping unaligned accesses
26834 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26835 if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
26837 next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
26838 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26839 gcc_assert (n_bytes <= mode_bytes);
26840 offset -= n_bytes - len;
26841 len = n_bytes;
26845 return true;
26849 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
26850 SImode stores. Handle the case when the constant has identical
26851 bottom and top halves. This is beneficial when the two stores can be
26852 merged into an STP and we avoid synthesising potentially expensive
26853 immediates twice. Return true if such a split is possible. */
26855 bool
26856 aarch64_split_dimode_const_store (rtx dst, rtx src)
26858 rtx lo = gen_lowpart (SImode, src);
26859 rtx hi = gen_highpart_mode (SImode, DImode, src);
26861 if (!rtx_equal_p (lo, hi))
26862 return false;
26864 unsigned int orig_cost
26865 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
26866 unsigned int lo_cost
26867 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
26869 /* We want to transform:
26870 MOV x1, 49370
26871 MOVK x1, 0x140, lsl 16
26872 MOVK x1, 0xc0da, lsl 32
26873 MOVK x1, 0x140, lsl 48
26874 STR x1, [x0]
26875 into:
26876 MOV w1, 49370
26877 MOVK w1, 0x140, lsl 16
26878 STP w1, w1, [x0]
26879 So we want to perform this when we save at least one instruction. */
26880 if (orig_cost <= lo_cost)
26881 return false;
26883 rtx mem_lo = adjust_address (dst, SImode, 0);
26884 if (!aarch64_mem_pair_operand (mem_lo, SImode))
26885 return false;
26887 rtx tmp_reg = gen_reg_rtx (SImode);
26888 aarch64_expand_mov_immediate (tmp_reg, lo);
26889 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
26890   /* Don't emit an explicit store pair as this may not always be profitable.
26891 Let the sched-fusion logic decide whether to merge them. */
26892 emit_move_insn (mem_lo, tmp_reg);
26893 emit_move_insn (mem_hi, tmp_reg);
26895 return true;
26898 /* Generate RTL for a conditional branch with rtx comparison CODE in
26899 mode CC_MODE. The destination of the unlikely conditional branch
26900 is LABEL_REF. */
26902 void
26903 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
26904 rtx label_ref)
26906 rtx x;
26907 x = gen_rtx_fmt_ee (code, VOIDmode,
26908 gen_rtx_REG (cc_mode, CC_REGNUM),
26909 const0_rtx);
26911 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
26912 gen_rtx_LABEL_REF (VOIDmode, label_ref),
26913 pc_rtx);
26914 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
26917 /* Generate DImode scratch registers for 128-bit (TImode) addition.
26919    OP1 represents TImode source operand 1
26920    OP2 represents TImode source operand 2
26921 LOW_DEST represents the low half (DImode) of TImode operand 0
26922 LOW_IN1 represents the low half (DImode) of TImode operand 1
26923 LOW_IN2 represents the low half (DImode) of TImode operand 2
26924 HIGH_DEST represents the high half (DImode) of TImode operand 0
26925 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26926 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26928 void
26929 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26930 rtx *low_in1, rtx *low_in2,
26931 rtx *high_dest, rtx *high_in1,
26932 rtx *high_in2)
26934 *low_dest = gen_reg_rtx (DImode);
26935 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
26936 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
26937 *high_dest = gen_reg_rtx (DImode);
26938 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
26939 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
26942 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
26944    OP1 represents TImode source operand 1
26945    OP2 represents TImode source operand 2
26946 LOW_DEST represents the low half (DImode) of TImode operand 0
26947 LOW_IN1 represents the low half (DImode) of TImode operand 1
26948 LOW_IN2 represents the low half (DImode) of TImode operand 2
26949 HIGH_DEST represents the high half (DImode) of TImode operand 0
26950 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26951 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26954 void
26955 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26956 rtx *low_in1, rtx *low_in2,
26957 rtx *high_dest, rtx *high_in1,
26958 rtx *high_in2)
26960 *low_dest = gen_reg_rtx (DImode);
26961 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
26962 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
26963 *high_dest = gen_reg_rtx (DImode);
26965 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
26966 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
26969 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
26971 OP0 represents the TImode destination operand 0
26972 LOW_DEST represents the low half (DImode) of TImode operand 0
26973 LOW_IN1 represents the low half (DImode) of TImode operand 1
26974 LOW_IN2 represents the low half (DImode) of TImode operand 2
26975 HIGH_DEST represents the high half (DImode) of TImode operand 0
26976 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26977 HIGH_IN2 represents the high half (DImode) of TImode operand 2
26978 UNSIGNED_P is true if the operation is being performed on unsigned
26979 values. */
26980 void
26981 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
26982 rtx low_in2, rtx high_dest, rtx high_in1,
26983 rtx high_in2, bool unsigned_p)
26985 if (low_in2 == const0_rtx)
26987 low_dest = low_in1;
26988 high_in2 = force_reg (DImode, high_in2);
26989 if (unsigned_p)
26990 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
26991 else
26992 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
26994 else
26996 if (aarch64_plus_immediate (low_in2, DImode))
26997 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
26998 GEN_INT (-UINTVAL (low_in2))));
26999 else
27001 low_in2 = force_reg (DImode, low_in2);
27002 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
27004 high_in2 = force_reg (DImode, high_in2);
27006 if (unsigned_p)
27007 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
27008 else
27009 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
27012 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
27013 emit_move_insn (gen_highpart (DImode, op0), high_dest);
27017 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
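/* As an illustrative summary (assuming the default ASan shadow scale of 8,
   i.e. ASAN_SHADOW_SHIFT == 3): the sanitizer computes
   shadow = (addr >> 3) + offset, so the offsets returned below give
   + (1 << 36) for LP64 and + (1 << 29) for ILP32.  */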
27019 static unsigned HOST_WIDE_INT
27020 aarch64_asan_shadow_offset (void)
27022 if (TARGET_ILP32)
27023 return (HOST_WIDE_INT_1 << 29);
27024 else
27025 return (HOST_WIDE_INT_1 << 36);
27028 static rtx
27029 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
27030 rtx_code code, tree treeop0, tree treeop1)
27032 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27033 rtx op0, op1;
27034 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27035 insn_code icode;
27036 struct expand_operand ops[4];
27038 start_sequence ();
27039 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27041 op_mode = GET_MODE (op0);
27042 if (op_mode == VOIDmode)
27043 op_mode = GET_MODE (op1);
27045 switch (op_mode)
27047 case E_QImode:
27048 case E_HImode:
27049 case E_SImode:
27050 cmp_mode = SImode;
27051 icode = CODE_FOR_cmpsi;
27052 break;
27054 case E_DImode:
27055 cmp_mode = DImode;
27056 icode = CODE_FOR_cmpdi;
27057 break;
27059 case E_SFmode:
27060 cmp_mode = SFmode;
27061 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
27062 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
27063 break;
27065 case E_DFmode:
27066 cmp_mode = DFmode;
27067 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
27068 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
27069 break;
27071 default:
27072 end_sequence ();
27073 return NULL_RTX;
27076 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
27077 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
27078 if (!op0 || !op1)
27080 end_sequence ();
27081 return NULL_RTX;
27083 *prep_seq = get_insns ();
27084 end_sequence ();
27086 create_fixed_operand (&ops[0], op0);
27087 create_fixed_operand (&ops[1], op1);
27089 start_sequence ();
27090 if (!maybe_expand_insn (icode, 2, ops))
27092 end_sequence ();
27093 return NULL_RTX;
27095 *gen_seq = get_insns ();
27096 end_sequence ();
27098 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
27099 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
27102 static rtx
27103 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27104 rtx_code cmp_code, tree treeop0, tree treeop1,
27105 rtx_code bit_code)
27107 rtx op0, op1, target;
27108 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27109 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27110 insn_code icode;
27111 struct expand_operand ops[6];
27112 int aarch64_cond;
27114 push_to_sequence (*prep_seq);
27115 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27117 op_mode = GET_MODE (op0);
27118 if (op_mode == VOIDmode)
27119 op_mode = GET_MODE (op1);
27121 switch (op_mode)
27123 case E_QImode:
27124 case E_HImode:
27125 case E_SImode:
27126 cmp_mode = SImode;
27127 break;
27129 case E_DImode:
27130 cmp_mode = DImode;
27131 break;
27133 case E_SFmode:
27134 cmp_mode = SFmode;
27135 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27136 break;
27138 case E_DFmode:
27139 cmp_mode = DFmode;
27140 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27141 break;
27143 default:
27144 end_sequence ();
27145 return NULL_RTX;
27148 icode = code_for_ccmp (cc_mode, cmp_mode);
27150 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27151 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27152 if (!op0 || !op1)
27154 end_sequence ();
27155 return NULL_RTX;
27157 *prep_seq = get_insns ();
27158 end_sequence ();
27160 target = gen_rtx_REG (cc_mode, CC_REGNUM);
27161 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
27163 if (bit_code != AND)
27165 /* Treat the ccmp patterns as canonical and use them where possible,
27166 but fall back to ccmp_rev patterns if there's no other option. */
27167 rtx_code prev_code = GET_CODE (prev);
27168 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
27169 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
27170 && !(prev_code == EQ
27171 || prev_code == NE
27172 || prev_code == ORDERED
27173 || prev_code == UNORDERED))
27174 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
27175 else
27177 rtx_code code = reverse_condition (prev_code);
27178 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
27180 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
27183 create_fixed_operand (&ops[0], XEXP (prev, 0));
27184 create_fixed_operand (&ops[1], target);
27185 create_fixed_operand (&ops[2], op0);
27186 create_fixed_operand (&ops[3], op1);
27187 create_fixed_operand (&ops[4], prev);
27188 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
27190 push_to_sequence (*gen_seq);
27191 if (!maybe_expand_insn (icode, 6, ops))
27193 end_sequence ();
27194 return NULL_RTX;
27197 *gen_seq = get_insns ();
27198 end_sequence ();
27200 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
27203 #undef TARGET_GEN_CCMP_FIRST
27204 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27206 #undef TARGET_GEN_CCMP_NEXT
27207 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27209 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27210 instruction fusion of some sort. */
27212 static bool
27213 aarch64_macro_fusion_p (void)
27215 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
27219 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27220 should be kept together during scheduling. */
27222 static bool
27223 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
27225 rtx set_dest;
27226 rtx prev_set = single_set (prev);
27227 rtx curr_set = single_set (curr);
27228 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27229 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
27231 if (!aarch64_macro_fusion_p ())
27232 return false;
27234 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
27236 /* We are trying to match:
27237 prev (mov) == (set (reg r0) (const_int imm16))
27238 curr (movk) == (set (zero_extract (reg r0)
27239 (const_int 16)
27240 (const_int 16))
27241 (const_int imm16_1)) */
27243 set_dest = SET_DEST (curr_set);
27245 if (GET_CODE (set_dest) == ZERO_EXTRACT
27246 && CONST_INT_P (SET_SRC (curr_set))
27247 && CONST_INT_P (SET_SRC (prev_set))
27248 && CONST_INT_P (XEXP (set_dest, 2))
27249 && INTVAL (XEXP (set_dest, 2)) == 16
27250 && REG_P (XEXP (set_dest, 0))
27251 && REG_P (SET_DEST (prev_set))
27252 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
27254 return true;
27258 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
27261 /* We're trying to match:
27262 prev (adrp) == (set (reg r1)
27263 (high (symbol_ref ("SYM"))))
27264 curr (add) == (set (reg r0)
27265 (lo_sum (reg r1)
27266 (symbol_ref ("SYM"))))
27267 Note that r0 need not necessarily be the same as r1, especially
27268 during pre-regalloc scheduling. */
27270 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27271 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27273 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
27274 && REG_P (XEXP (SET_SRC (curr_set), 0))
27275 && REGNO (XEXP (SET_SRC (curr_set), 0))
27276 == REGNO (SET_DEST (prev_set))
27277 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
27278 XEXP (SET_SRC (curr_set), 1)))
27279 return true;
27283 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
27286 /* We're trying to match:
27287 prev (movk) == (set (zero_extract (reg r0)
27288 (const_int 16)
27289 (const_int 32))
27290 (const_int imm16_1))
27291 curr (movk) == (set (zero_extract (reg r0)
27292 (const_int 16)
27293 (const_int 48))
27294 (const_int imm16_2)) */
27296 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
27297 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
27298 && REG_P (XEXP (SET_DEST (prev_set), 0))
27299 && REG_P (XEXP (SET_DEST (curr_set), 0))
27300 && REGNO (XEXP (SET_DEST (prev_set), 0))
27301 == REGNO (XEXP (SET_DEST (curr_set), 0))
27302 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
27303 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
27304 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
27305 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
27306 && CONST_INT_P (SET_SRC (prev_set))
27307 && CONST_INT_P (SET_SRC (curr_set)))
27308 return true;
27311 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
27313 /* We're trying to match:
27314 prev (adrp) == (set (reg r0)
27315 (high (symbol_ref ("SYM"))))
27316 curr (ldr) == (set (reg r1)
27317 (mem (lo_sum (reg r0)
27318 (symbol_ref ("SYM")))))
27320 curr (ldr) == (set (reg r1)
27321 (zero_extend (mem
27322 (lo_sum (reg r0)
27323 (symbol_ref ("SYM")))))) */
27324 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27325 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27327 rtx curr_src = SET_SRC (curr_set);
27329 if (GET_CODE (curr_src) == ZERO_EXTEND)
27330 curr_src = XEXP (curr_src, 0);
27332 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
27333 && REG_P (XEXP (XEXP (curr_src, 0), 0))
27334 && REGNO (XEXP (XEXP (curr_src, 0), 0))
27335 == REGNO (SET_DEST (prev_set))
27336 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
27337 XEXP (SET_SRC (prev_set), 0)))
27338 return true;
27342 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27343 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
27344 && prev_set && curr_set && any_condjump_p (curr)
27345 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27346 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27347 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27348 return true;
27350 /* Fuse flag-setting ALU instructions and conditional branch. */
27351 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
27352 && any_condjump_p (curr))
27354 unsigned int condreg1, condreg2;
27355 rtx cc_reg_1;
27356 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
27357 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
27359 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
27360 && prev
27361 && modified_in_p (cc_reg_1, prev))
27363 enum attr_type prev_type = get_attr_type (prev);
27365 	  /* FIXME: this misses some instructions that are considered simple
27366 	     arithmetic on ThunderX; simple shifts are missed here too.  */
27367 if (prev_type == TYPE_ALUS_SREG
27368 || prev_type == TYPE_ALUS_IMM
27369 || prev_type == TYPE_LOGICS_REG
27370 || prev_type == TYPE_LOGICS_IMM)
27371 return true;
27375 /* Fuse ALU instructions and CBZ/CBNZ. */
27376 if (prev_set
27377 && curr_set
27378 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
27379 && any_condjump_p (curr))
27381 /* We're trying to match:
27382 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27383 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27384 (const_int 0))
27385 (label_ref ("SYM"))
27386 (pc)) */
27387 if (SET_DEST (curr_set) == (pc_rtx)
27388 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27389 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
27390 && REG_P (SET_DEST (prev_set))
27391 && REGNO (SET_DEST (prev_set))
27392 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
27394 /* Fuse ALU operations followed by conditional branch instruction. */
27395 switch (get_attr_type (prev))
27397 case TYPE_ALU_IMM:
27398 case TYPE_ALU_SREG:
27399 case TYPE_ADC_REG:
27400 case TYPE_ADC_IMM:
27401 case TYPE_ADCS_REG:
27402 case TYPE_ADCS_IMM:
27403 case TYPE_LOGIC_REG:
27404 case TYPE_LOGIC_IMM:
27405 case TYPE_CSEL:
27406 case TYPE_ADR:
27407 case TYPE_MOV_IMM:
27408 case TYPE_SHIFT_REG:
27409 case TYPE_SHIFT_IMM:
27410 case TYPE_BFM:
27411 case TYPE_RBIT:
27412 case TYPE_REV:
27413 case TYPE_EXTEND:
27414 return true;
27416 default:;
27421 /* Fuse A+B+1 and A-B-1 */
27422 if (simple_sets_p
27423 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
27425 /* We're trying to match:
27426 prev == (set (r0) (plus (r0) (r1)))
27427 curr == (set (r0) (plus (r0) (const_int 1)))
27429 prev == (set (r0) (minus (r0) (r1)))
27430 curr == (set (r0) (plus (r0) (const_int -1))) */
27432 rtx prev_src = SET_SRC (prev_set);
27433 rtx curr_src = SET_SRC (curr_set);
27435 int polarity = 1;
27436 if (GET_CODE (prev_src) == MINUS)
27437 polarity = -1;
27439 if (GET_CODE (curr_src) == PLUS
27440 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
27441 && CONST_INT_P (XEXP (curr_src, 1))
27442 && INTVAL (XEXP (curr_src, 1)) == polarity
27443 && REG_P (XEXP (curr_src, 0))
27444 && REG_P (SET_DEST (prev_set))
27445 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
27446 return true;
27449 return false;
27452 /* Return true iff the instruction fusion described by OP is enabled. */
27454 bool
27455 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
27457 return (aarch64_tune_params.fusible_ops & op) != 0;
27460 /* If MEM is in the form of [base+offset], extract the two parts of the
27461    address and store them in BASE and OFFSET; otherwise return false
27462    after clearing BASE and OFFSET.  */
27464 bool
27465 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
27467 rtx addr;
27469 gcc_assert (MEM_P (mem));
27471 addr = XEXP (mem, 0);
27473 if (REG_P (addr))
27475 *base = addr;
27476 *offset = const0_rtx;
27477 return true;
27480 if (GET_CODE (addr) == PLUS
27481 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
27483 *base = XEXP (addr, 0);
27484 *offset = XEXP (addr, 1);
27485 return true;
27488 *base = NULL_RTX;
27489 *offset = NULL_RTX;
27491 return false;
27494 /* Types for scheduling fusion. */
27495 enum sched_fusion_type
27497 SCHED_FUSION_NONE = 0,
27498 SCHED_FUSION_LD_SIGN_EXTEND,
27499 SCHED_FUSION_LD_ZERO_EXTEND,
27500 SCHED_FUSION_LD,
27501 SCHED_FUSION_ST,
27502 SCHED_FUSION_NUM
27505 /* If INSN is a load or store whose address is in the form [base+offset],
27506    extract the two parts and store them in BASE and OFFSET.  Return the
27507    scheduling fusion type of this INSN.  */
27509 static enum sched_fusion_type
27510 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
27512 rtx x, dest, src;
27513 enum sched_fusion_type fusion = SCHED_FUSION_LD;
27515 gcc_assert (INSN_P (insn));
27516 x = PATTERN (insn);
27517 if (GET_CODE (x) != SET)
27518 return SCHED_FUSION_NONE;
27520 src = SET_SRC (x);
27521 dest = SET_DEST (x);
27523 machine_mode dest_mode = GET_MODE (dest);
27525 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
27526 return SCHED_FUSION_NONE;
27528 if (GET_CODE (src) == SIGN_EXTEND)
27530 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
27531 src = XEXP (src, 0);
27532 if (!MEM_P (src) || GET_MODE (src) != SImode)
27533 return SCHED_FUSION_NONE;
27535 else if (GET_CODE (src) == ZERO_EXTEND)
27537 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
27538 src = XEXP (src, 0);
27539 if (!MEM_P (src) || GET_MODE (src) != SImode)
27540 return SCHED_FUSION_NONE;
27543 if (MEM_P (src) && REG_P (dest))
27544 extract_base_offset_in_addr (src, base, offset);
27545 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
27547 fusion = SCHED_FUSION_ST;
27548 extract_base_offset_in_addr (dest, base, offset);
27550 else
27551 return SCHED_FUSION_NONE;
27553 if (*base == NULL_RTX || *offset == NULL_RTX)
27554 fusion = SCHED_FUSION_NONE;
27556 return fusion;
27559 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
27561    Currently we only support fusing ldr and str instructions, so FUSION_PRI
27562    and PRI are only calculated for these instructions.  For other instructions,
27563    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
27564    types of instruction fusion can be added by returning different priorities.
27566 It's important that irrelevant instructions get the largest FUSION_PRI. */
27568 static void
27569 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
27570 int *fusion_pri, int *pri)
27572 int tmp, off_val;
27573 rtx base, offset;
27574 enum sched_fusion_type fusion;
27576 gcc_assert (INSN_P (insn));
27578 tmp = max_pri - 1;
27579 fusion = fusion_load_store (insn, &base, &offset);
27580 if (fusion == SCHED_FUSION_NONE)
27582 *pri = tmp;
27583 *fusion_pri = tmp;
27584 return;
27587 /* Set FUSION_PRI according to fusion type and base register. */
27588 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
27590 /* Calculate PRI. */
27591 tmp /= 2;
27593   /* The INSN with the smaller offset goes first.  */
27594 off_val = (int)(INTVAL (offset));
27595 if (off_val >= 0)
27596 tmp -= (off_val & 0xfffff);
27597 else
27598 tmp += ((- off_val) & 0xfffff);
27600 *pri = tmp;
27601 return;
27604 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27605 Adjust priority of sha1h instructions so they are scheduled before
27606 other SHA1 instructions. */
27608 static int
27609 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
27611 rtx x = PATTERN (insn);
27613 if (GET_CODE (x) == SET)
27615 x = SET_SRC (x);
27617 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
27618 return priority + 10;
27621 return priority;
27624 /* If REVERSED is null, return true if memory reference *MEM2 comes
27625 immediately after memory reference *MEM1. Do not change the references
27626 in this case.
27628 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27629 if they are, try to make them use constant offsets from the same base
27630 register. Return true on success. When returning true, set *REVERSED
27631 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27632 static bool
27633 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
27635 if (reversed)
27636 *reversed = false;
27638 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
27639 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
27640 return false;
27642 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
27643 return false;
27645 auto size1 = MEM_SIZE (*mem1);
27646 auto size2 = MEM_SIZE (*mem2);
27648 rtx base1, base2, offset1, offset2;
27649 extract_base_offset_in_addr (*mem1, &base1, &offset1);
27650 extract_base_offset_in_addr (*mem2, &base2, &offset2);
27652 /* Make sure at least one memory is in base+offset form. */
27653 if (!(base1 && offset1) && !(base2 && offset2))
27654 return false;
27656 /* If both mems already use the same base register, just check the
27657 offsets. */
27658 if (base1 && base2 && rtx_equal_p (base1, base2))
27660 if (!offset1 || !offset2)
27661 return false;
27663 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
27664 return true;
27666 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
27668 *reversed = true;
27669 return true;
27672 return false;
27675 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27676 guarantee that the values are consecutive. */
27677 if (MEM_EXPR (*mem1)
27678 && MEM_EXPR (*mem2)
27679 && MEM_OFFSET_KNOWN_P (*mem1)
27680 && MEM_OFFSET_KNOWN_P (*mem2))
27682 poly_int64 expr_offset1;
27683 poly_int64 expr_offset2;
27684 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
27685 &expr_offset1);
27686 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
27687 &expr_offset2);
27688 if (!expr_base1
27689 || !expr_base2
27690 || !DECL_P (expr_base1)
27691 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
27692 return false;
27694 expr_offset1 += MEM_OFFSET (*mem1);
27695 expr_offset2 += MEM_OFFSET (*mem2);
27697 if (known_eq (expr_offset1 + size1, expr_offset2))
27699 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
27700 *reversed = true;
27701 else
27702 return false;
27704 if (reversed)
27706 if (base2)
27708 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
27709 expr_offset1 - expr_offset2);
27710 *mem1 = replace_equiv_address_nv (*mem1, addr1);
27712 else
27714 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
27715 expr_offset2 - expr_offset1);
27716 *mem2 = replace_equiv_address_nv (*mem2, addr2);
27719 return true;
27722 return false;
27725 /* Test if MODE is suitable for a single transfer register in an ldp or stp
27726 instruction. */
27728 bool
27729 aarch64_ldpstp_operand_mode_p (machine_mode mode)
27731 if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
27732 || hard_regno_nregs (V0_REGNUM, mode) > 1)
27733 return false;
27735 const auto size = GET_MODE_SIZE (mode);
27736 return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
27739 /* Return true if MEM1 and MEM2 can be combined into a single access
27740 of mode MODE, with the combined access having the same address as MEM1. */
27742 bool
27743 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
27745 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
27746 return false;
27747 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
27750 /* Return true if MEM agrees with the ldp-stp policy model.
27751 Otherwise, false. */
27753 bool
27754 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
27756 auto policy = (load
27757 ? aarch64_tune_params.ldp_policy_model
27758 : aarch64_tune_params.stp_policy_model);
27760 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
27761 if (policy == AARCH64_LDP_STP_POLICY_NEVER)
27762 return false;
27764 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
27765 do not emit the load pair unless the alignment is checked to be
27766 at least double the alignment of the type. */
27767 if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
27768 && !optimize_function_for_size_p (cfun)
27769 && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
27770 return false;
27772 return true;
27775 /* Given OPERANDS of consecutive load/store, check if we can merge
27776 them into ldp/stp. LOAD is true if they are load instructions. */
27778 bool
27779 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load)
27781 enum reg_class rclass_1, rclass_2;
27782 rtx mem_1, mem_2, reg_1, reg_2;
27784 if (load)
27786 mem_1 = operands[1];
27787 mem_2 = operands[3];
27788 reg_1 = operands[0];
27789 reg_2 = operands[2];
27790 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
27791 if (REGNO (reg_1) == REGNO (reg_2))
27792 return false;
27793 if (reg_overlap_mentioned_p (reg_1, mem_2))
27794 return false;
27796 else
27798 mem_1 = operands[0];
27799 mem_2 = operands[2];
27800 reg_1 = operands[1];
27801 reg_2 = operands[3];
27804 /* The mems cannot be volatile. */
27805 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
27806 return false;
27808 /* Check if the addresses are in the form of [base+offset]. */
27809 bool reversed = false;
27810 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
27811 return false;
27813 /* The operands must be of the same size. */
27814 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
27815 GET_MODE_SIZE (GET_MODE (mem_2))));
27817 /* The lower memory access must be a mem-pair operand. */
27818 rtx lower_mem = reversed ? mem_2 : mem_1;
27819 machine_mode lower_mem_mode = GET_MODE (lower_mem);
27820 if (!aarch64_mem_pair_operand (lower_mem, lower_mem_mode))
27821 return false;
27823 /* Check if lower_mem is ok with the ldp-stp policy model. */
27824 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem, load,
27825 lower_mem_mode))
27826 return false;
27828 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
27829 rclass_1 = FP_REGS;
27830 else
27831 rclass_1 = GENERAL_REGS;
27833 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
27834 rclass_2 = FP_REGS;
27835 else
27836 rclass_2 = GENERAL_REGS;
27838   /* Check if the registers are of the same class.  */
27839 if (rclass_1 != rclass_2)
27840 return false;
27842 return true;
27845 /* Given OPERANDS of consecutive load/store that can be merged,
27846 swap them if they are not in ascending order. */
27847 void
27848 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
27850 int mem_op = load ? 1 : 0;
27851 bool reversed = false;
27852 if (!aarch64_check_consecutive_mems (operands + mem_op,
27853 operands + mem_op + 2, &reversed))
27854 gcc_unreachable ();
27856 if (reversed)
27858 /* Irrespective of whether this is a load or a store,
27859 we do the same swap. */
27860 std::swap (operands[0], operands[2]);
27861 std::swap (operands[1], operands[3]);
27865 /* Helper function used for generation of load/store pair instructions, called
27866 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
27867 operands as matched by the peepholes in that file. LOAD_P is true if we're
27868 generating a load pair, otherwise we're generating a store pair. CODE is
27869 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
27870 standard load/store pair. */
27872 void
27873 aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
27875 aarch64_swap_ldrstr_operands (operands, load_p);
27877 if (load_p)
27878 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
27879 operands[1], code));
27880 else
27882 gcc_assert (code == UNKNOWN);
27883 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
27884 operands[3]));
27888 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
27889 comparison between the two. */
27891 aarch64_host_wide_int_compare (const void *x, const void *y)
27893 return wi::cmps (* ((const HOST_WIDE_INT *) x),
27894 * ((const HOST_WIDE_INT *) y));
27897 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
27898 other pointing to a REG rtx containing an offset, compare the offsets
27899 of the two pairs.
27901 Return:
27903 1 iff offset (X) > offset (Y)
27904 0 iff offset (X) == offset (Y)
27905 -1 iff offset (X) < offset (Y) */
27907 aarch64_ldrstr_offset_compare (const void *x, const void *y)
27909 const rtx * operands_1 = (const rtx *) x;
27910 const rtx * operands_2 = (const rtx *) y;
27911 rtx mem_1, mem_2, base, offset_1, offset_2;
27913 if (MEM_P (operands_1[0]))
27914 mem_1 = operands_1[0];
27915 else
27916 mem_1 = operands_1[1];
27918 if (MEM_P (operands_2[0]))
27919 mem_2 = operands_2[0];
27920 else
27921 mem_2 = operands_2[1];
27923 /* Extract the offsets. */
27924 extract_base_offset_in_addr (mem_1, &base, &offset_1);
27925 extract_base_offset_in_addr (mem_2, &base, &offset_2);
27927 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
27929 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
27932 /* Given OPERANDS of consecutive load/store, check if we can merge
27933 them into ldp/stp by adjusting the offset. LOAD is true if they
27934 are load instructions. MODE is the mode of memory operands.
27936 Given below consecutive stores:
27938 str w1, [xb, 0x100]
27939 str w1, [xb, 0x104]
27940 str w1, [xb, 0x108]
27941 str w1, [xb, 0x10c]
27943 Though the offsets are out of the range supported by stp, we can
27944 still pair them after adjusting the offset, like:
27946 add scratch, xb, 0x100
27947 stp w1, w1, [scratch]
27948 stp w1, w1, [scratch, 0x8]
27950 The peephole patterns detecting this opportunity should guarantee
27951    the scratch register is available.  */
27953 bool
27954 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
27955 machine_mode mode)
27957 const int num_insns = 4;
27958 enum reg_class rclass;
27959 HOST_WIDE_INT offvals[num_insns], msize;
27960 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
27962 if (load)
27964 for (int i = 0; i < num_insns; i++)
27966 reg[i] = operands[2 * i];
27967 mem[i] = operands[2 * i + 1];
27969 gcc_assert (REG_P (reg[i]));
27972 /* Do not attempt to merge the loads if the loads clobber each other. */
27973 for (int i = 0; i < 8; i += 2)
27974 for (int j = i + 2; j < 8; j += 2)
27975 if (reg_overlap_mentioned_p (operands[i], operands[j]))
27976 return false;
27978 else
27979 for (int i = 0; i < num_insns; i++)
27981 mem[i] = operands[2 * i];
27982 reg[i] = operands[2 * i + 1];
27985 /* Skip if memory operand is by itself valid for ldp/stp. */
27986 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
27987 return false;
27989 for (int i = 0; i < num_insns; i++)
27991 /* The mems cannot be volatile. */
27992 if (MEM_VOLATILE_P (mem[i]))
27993 return false;
27995 /* Check if the addresses are in the form of [base+offset]. */
27996 extract_base_offset_in_addr (mem[i], base + i, offset + i);
27997 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
27998 return false;
28001   /* Check if the registers are of the same class.  */
28002 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
28003 ? FP_REGS : GENERAL_REGS;
28005 for (int i = 1; i < num_insns; i++)
28006 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
28008 if (rclass != FP_REGS)
28009 return false;
28011 else
28013 if (rclass != GENERAL_REGS)
28014 return false;
28017 /* Only the last register in the order in which they occur
28018 may be clobbered by the load. */
28019 if (rclass == GENERAL_REGS && load)
28020 for (int i = 0; i < num_insns - 1; i++)
28021 if (reg_mentioned_p (reg[i], mem[i]))
28022 return false;
28024   /* Check if the bases are the same.  */
28025 for (int i = 0; i < num_insns - 1; i++)
28026 if (!rtx_equal_p (base[i], base[i + 1]))
28027 return false;
28029 for (int i = 0; i < num_insns; i++)
28030 offvals[i] = INTVAL (offset[i]);
28032 msize = GET_MODE_SIZE (mode).to_constant ();
28034 /* Check if the offsets can be put in the right order to do a ldp/stp. */
28035 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
28036 aarch64_host_wide_int_compare);
28038 if (!(offvals[1] == offvals[0] + msize
28039 && offvals[3] == offvals[2] + msize))
28040 return false;
28042 /* Check that offsets are within range of each other. The ldp/stp
28043      instructions have 7-bit immediate offsets, so use 0x80.  */
28044 if (offvals[2] - offvals[0] >= msize * 0x80)
28045 return false;
28047 /* The offsets must be aligned with respect to each other. */
28048 if (offvals[0] % msize != offvals[2] % msize)
28049 return false;
28051 /* Check if mem[0] is ok with the ldp-stp policy model. */
28052 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem[0], load, mode))
28053 return false;
28055 return true;
28058 /* Given OPERANDS of consecutive load/store, this function pairs them
28059 into LDP/STP after adjusting the offset. It depends on the fact
28060 that the operands can be sorted so the offsets are correct for STP.
28061 MODE is the mode of memory operands. CODE is the rtl operator
28062    which should be applied to all memory operands; it is SIGN_EXTEND,
28063 ZERO_EXTEND or UNKNOWN. */
28065 bool
28066 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
28067 machine_mode mode, RTX_CODE code)
28069 rtx base, offset_1, offset_2;
28070 rtx mem_1, mem_2;
28071 rtx temp_operands[8];
28072 HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
28073 stp_off_upper_limit, stp_off_lower_limit, msize;
28075 /* We make changes on a copy as we may still bail out. */
28076 for (int i = 0; i < 8; i ++)
28077 temp_operands[i] = operands[i];
28079 /* Sort the operands. Note for cases as below:
28080 [base + 0x310] = A
28081 [base + 0x320] = B
28082 [base + 0x330] = C
28083 [base + 0x320] = D
28084      We need a stable sort, otherwise wrong data may be stored to offset 0x320.
28085      Also note that the dead store in the above case should be optimized away,
28086      but there are no guarantees here.  */
28087   gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
28088 aarch64_ldrstr_offset_compare);
28090 /* Copy the memory operands so that if we have to bail for some
28091 reason the original addresses are unchanged. */
28092 if (load)
28094 mem_1 = copy_rtx (temp_operands[1]);
28095 mem_2 = copy_rtx (temp_operands[5]);
28097 else
28099 mem_1 = copy_rtx (temp_operands[0]);
28100 mem_2 = copy_rtx (temp_operands[4]);
28101 gcc_assert (code == UNKNOWN);
28104 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28105 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28106 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
28107 && offset_2 != NULL_RTX);
28109 /* Adjust offset so it can fit in LDP/STP instruction. */
28110   msize = GET_MODE_SIZE (mode).to_constant ();
28111 stp_off_upper_limit = msize * (0x40 - 1);
28112 stp_off_lower_limit = - msize * 0x40;
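  /* For example, with 4-byte accesses the limits computed above give the
     LDP/STP signed immediate range [-256, 252], i.e. the 7-bit signed
     immediate scaled by the access size (worked numbers for illustration).  */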
28114 off_val_1 = INTVAL (offset_1);
28115 off_val_2 = INTVAL (offset_2);
28117   /* The base offset is optimally halfway between the two STP/LDP offsets.  */
28118 if (msize <= 4)
28119 base_off = (off_val_1 + off_val_2) / 2;
28120 else
28121     /* However, due to issues with negative LDP/STP offset generation for
28122        larger modes (DF, DD, DI and vector modes), we must not use addresses
28123        more negative than what 9 signed unadjusted bits can store.  This
28124        provides the most range in this case.  */
28125 base_off = off_val_1;
28127 /* Adjust the base so that it is aligned with the addresses but still
28128 optimal. */
28129 if (base_off % msize != off_val_1 % msize)
28130 /* Fix the offset, bearing in mind we want to make it bigger not
28131 smaller. */
28132 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28133 else if (msize <= 4)
28134 /* The negative range of LDP/STP is one larger than the positive range. */
28135 base_off += msize;
28137 /* Check if base offset is too big or too small. We can attempt to resolve
28138 this issue by setting it to the maximum value and seeing if the offsets
28139 still fit. */
28140 if (base_off >= 0x1000)
28142 base_off = 0x1000 - 1;
28143 /* We must still make sure that the base offset is aligned with respect
28144 to the address. But it may not be made any bigger. */
28145 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28148 /* Likewise for the case where the base is too small. */
28149 if (base_off <= -0x1000)
28151 base_off = -0x1000 + 1;
28152 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28155 /* Offset of the first STP/LDP. */
28156 new_off_1 = off_val_1 - base_off;
28158 /* Offset of the second STP/LDP. */
28159 new_off_2 = off_val_2 - base_off;
28161 /* The offsets must be within the range of the LDP/STP instructions. */
28162 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
28163 || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
28164 return false;
28166 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
28167 new_off_1), true);
28168 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
28169 new_off_2), true);
28171 if (!aarch64_mem_pair_operand (mem_1, mode)
28172 || !aarch64_mem_pair_operand (mem_2, mode))
28173 return false;
28175 if (load)
28177 operands[0] = temp_operands[0];
28178 operands[1] = mem_1;
28179 operands[2] = temp_operands[2];
28180 operands[4] = temp_operands[4];
28181 operands[5] = mem_2;
28182 operands[6] = temp_operands[6];
28184 else
28186 operands[0] = mem_1;
28187 operands[1] = temp_operands[1];
28188 operands[3] = temp_operands[3];
28189 operands[4] = mem_2;
28190 operands[5] = temp_operands[5];
28191 operands[7] = temp_operands[7];
28194 /* Emit adjusting instruction. */
28195 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
28196 /* Emit ldp/stp instructions. */
28197 if (load)
28199 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28200 operands[1], code));
28201 emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
28202 operands[5], code));
28204 else
28206 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28207 operands[3]));
28208 emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
28209 operands[7]));
28211 return true;
28214 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28215 it isn't worth branching around empty masked ops (including masked
28216 stores). */
28218 static bool
28219 aarch64_empty_mask_is_expensive (unsigned)
28221 return false;
28224 /* Return 1 if pseudo register should be created and used to hold
28225 GOT address for PIC code. */
28227 bool
28228 aarch64_use_pseudo_pic_reg (void)
28230 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
28233 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28235 static int
28236 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
28238 switch (XINT (x, 1))
28240 case UNSPEC_GOTSMALLPIC:
28241 case UNSPEC_GOTSMALLPIC28K:
28242 case UNSPEC_GOTTINYPIC:
28243 return 0;
28244 default:
28245 break;
28248 return default_unspec_may_trap_p (x, flags);
28252 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28253 return the log2 of that value. Otherwise return -1. */
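/* For example (illustrative worked cases): 4.0 yields 2 and 1.0 yields 0,
   while 3.0, -2.0, NaN and infinity all yield -1.  */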
28256 aarch64_fpconst_pow_of_2 (rtx x)
28258 const REAL_VALUE_TYPE *r;
28260 if (!CONST_DOUBLE_P (x))
28261 return -1;
28263 r = CONST_DOUBLE_REAL_VALUE (x);
28265 if (REAL_VALUE_NEGATIVE (*r)
28266 || REAL_VALUE_ISNAN (*r)
28267 || REAL_VALUE_ISINF (*r)
28268 || !real_isinteger (r, DFmode))
28269 return -1;
28271 return exact_log2 (real_to_integer (r));
28274 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28275    power of 2 (i.e. 1/2^n), return the number of float bits; e.g. for
28276    x == 1/2^n return n.  Otherwise return -1.  */
28279 aarch64_fpconst_pow2_recip (rtx x)
28281 REAL_VALUE_TYPE r0;
28283 if (!CONST_DOUBLE_P (x))
28284 return -1;
28286 r0 = *CONST_DOUBLE_REAL_VALUE (x);
28287 if (exact_real_inverse (DFmode, &r0)
28288 && !REAL_VALUE_NEGATIVE (r0))
28290 int ret = exact_log2 (real_to_integer (&r0));
28291 if (ret >= 1 && ret <= 32)
28292 return ret;
28294 return -1;
28297 /* If X is a vector of equal CONST_DOUBLE values and that value is
28298 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28301 aarch64_vec_fpconst_pow_of_2 (rtx x)
28303 int nelts;
28304 if (!CONST_VECTOR_P (x)
28305 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
28306 return -1;
28308 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
28309 return -1;
28311 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
28312 if (firstval <= 0)
28313 return -1;
28315 for (int i = 1; i < nelts; i++)
28316 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
28317 return -1;
28319 return firstval;
28322 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28323 to float.
28325 __fp16 always promotes through this hook.
28326 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28327 through the generic excess precision logic rather than here. */
28329 static tree
28330 aarch64_promoted_type (const_tree t)
28332 if (SCALAR_FLOAT_TYPE_P (t)
28333 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
28334 return float_type_node;
28336 return NULL_TREE;
28339 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28341 static bool
28342 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
28343 optimization_type opt_type)
28345 switch (op)
28347 case rsqrt_optab:
28348 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
28350 default:
28351 return true;
28355 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28357 static unsigned int
28358 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
28359 int *offset)
28361 /* Polynomial invariant 1 == (VG / 2) - 1. */
28362 gcc_assert (i == 1);
28363 *factor = 2;
28364 *offset = 1;
28365 return AARCH64_DWARF_VG;
28368 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28369 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28371 static bool
28372 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
28374 return ((mode == HFmode || mode == BFmode)
28375 ? true
28376 : default_libgcc_floating_mode_supported_p (mode));
28379 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28380 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28382 static bool
28383 aarch64_scalar_mode_supported_p (scalar_mode mode)
28385 if (DECIMAL_FLOAT_MODE_P (mode))
28386 return default_decimal_float_supported_p ();
28388 return ((mode == HFmode || mode == BFmode)
28389 ? true
28390 : default_scalar_mode_supported_p (mode));
28393 /* Set the value of FLT_EVAL_METHOD.
28394 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28396 0: evaluate all operations and constants, whose semantic type has at
28397 most the range and precision of type float, to the range and
28398 precision of float; evaluate all other operations and constants to
28399 the range and precision of the semantic type;
28401 N, where _FloatN is a supported interchange floating type
28402 evaluate all operations and constants, whose semantic type has at
28403 most the range and precision of _FloatN type, to the range and
28404 precision of the _FloatN type; evaluate all other operations and
28405 constants to the range and precision of the semantic type;
28407 If we have the ARMv8.2-A extensions then we support _Float16 in native
28408 precision, so we should set this to 16. Otherwise, we support the type,
28409 but want to evaluate expressions in float precision, so set this to
28410 0. */
28412 static enum flt_eval_method
28413 aarch64_excess_precision (enum excess_precision_type type)
28415 switch (type)
28417 case EXCESS_PRECISION_TYPE_FAST:
28418 case EXCESS_PRECISION_TYPE_STANDARD:
28419 /* We can calculate either in 16-bit range and precision or
28420 32-bit range and precision. Make that decision based on whether
28421 we have native support for the ARMv8.2-A 16-bit floating-point
28422 instructions or not. */
28423 return (TARGET_FP_F16INST
28424 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28425 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
28426 case EXCESS_PRECISION_TYPE_IMPLICIT:
28427 case EXCESS_PRECISION_TYPE_FLOAT16:
28428 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
28429 default:
28430 gcc_unreachable ();
28432 return FLT_EVAL_METHOD_UNPREDICTABLE;
28435 /* Implement TARGET_C_BITINT_TYPE_INFO.
28436 Return true if _BitInt(N) is supported and fill its details into *INFO. */
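/* For example (illustrative), on little-endian AArch64 _BitInt(200) gets
   limb_mode == DImode internally but abi_limb_mode == TImode, so for ABI
   purposes it is treated as unsigned __int128[2] per the AAPCS64 rule
   described below.  */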
28437 bool
28438 aarch64_bitint_type_info (int n, struct bitint_info *info)
28440 if (TARGET_BIG_END)
28441 return false;
28443 if (n <= 8)
28444 info->limb_mode = QImode;
28445 else if (n <= 16)
28446 info->limb_mode = HImode;
28447 else if (n <= 32)
28448 info->limb_mode = SImode;
28449 else if (n <= 64)
28450 info->limb_mode = DImode;
28451 else if (n <= 128)
28452 info->limb_mode = TImode;
28453 else
28454 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28455 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28456      able to use libgcc's implementation to support large _BitInts, we need
28457 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28458 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28459 be TImode to ensure we are ABI compliant. */
28460 info->limb_mode = DImode;
28462 if (n > 128)
28463 info->abi_limb_mode = TImode;
28464 else
28465 info->abi_limb_mode = info->limb_mode;
28466 info->big_endian = TARGET_BIG_END;
28467 info->extended = false;
28468 return true;
28471 /* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. Return TFmode for
28472    TI_LONG_DOUBLE_TYPE, which is the long double type, and go with the
28473    default for the others.  */
28475 static machine_mode
28476 aarch64_c_mode_for_floating_type (enum tree_index ti)
28478 if (ti == TI_LONG_DOUBLE_TYPE)
28479 return TFmode;
28480 return default_mode_for_floating_type (ti);
28483 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28484 scheduled for speculative execution. Reject the long-running division
28485 and square-root instructions. */
28487 static bool
28488 aarch64_sched_can_speculate_insn (rtx_insn *insn)
28490 switch (get_attr_type (insn))
28492 case TYPE_SDIV:
28493 case TYPE_UDIV:
28494 case TYPE_FDIVS:
28495 case TYPE_FDIVD:
28496 case TYPE_FSQRTS:
28497 case TYPE_FSQRTD:
28498 case TYPE_NEON_FP_SQRT_S:
28499 case TYPE_NEON_FP_SQRT_D:
28500 case TYPE_NEON_FP_SQRT_S_Q:
28501 case TYPE_NEON_FP_SQRT_D_Q:
28502 case TYPE_NEON_FP_DIV_S:
28503 case TYPE_NEON_FP_DIV_D:
28504 case TYPE_NEON_FP_DIV_S_Q:
28505 case TYPE_NEON_FP_DIV_D_Q:
28506 return false;
28507 default:
28508 return true;
28512 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28514 static int
28515 aarch64_compute_pressure_classes (reg_class *classes)
28517 int i = 0;
28518 classes[i++] = GENERAL_REGS;
28519 classes[i++] = FP_REGS;
28520 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28521 registers need to go in PR_LO_REGS at some point during their
28522 lifetime. Splitting it into two halves has the effect of making
28523 all predicates count against PR_LO_REGS, so that we try whenever
28524 possible to restrict the number of live predicates to 8. This
28525 greatly reduces the amount of spilling in certain loops. */
28526 classes[i++] = PR_LO_REGS;
28527 classes[i++] = PR_HI_REGS;
28528 return i;
28531 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28533 static bool
28534 aarch64_can_change_mode_class (machine_mode from,
28535 machine_mode to, reg_class_t)
28537 return aarch64_modes_compatible_p (from, to);
28540 /* Implement TARGET_EARLY_REMAT_MODES. */
28542 static void
28543 aarch64_select_early_remat_modes (sbitmap modes)
28545 /* SVE values are not normally live across a call, so it should be
28546 worth doing early rematerialization even in VL-specific mode. */
28547 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
28548 if (aarch64_sve_mode_p ((machine_mode) i))
28549 bitmap_set_bit (modes, i);
28552 /* Override the default target speculation_safe_value. */
28553 static rtx
28554 aarch64_speculation_safe_value (machine_mode mode,
28555 rtx result, rtx val, rtx failval)
28557 /* Maybe we should warn if falling back to hard barriers. They are
28558      likely to be noticeably more expensive than the alternative below.  */
28559 if (!aarch64_track_speculation)
28560 return default_speculation_safe_value (mode, result, val, failval);
28562 if (!REG_P (val))
28563 val = copy_to_mode_reg (mode, val);
28565 if (!aarch64_reg_or_zero (failval, mode))
28566 failval = copy_to_mode_reg (mode, failval);
28568 emit_insn (gen_despeculate_copy (mode, result, val, failval));
28569 return result;
28572 /* Implement TARGET_ESTIMATED_POLY_VALUE.
28573 Look into the tuning structure for an estimate.
28574 KIND specifies the type of requested estimate: min, max or likely.
28575 For cores with a known SVE width all three estimates are the same.
28576 For generic SVE tuning we want to distinguish the maximum estimate from
28577 the minimum and likely ones.
28578 The likely estimate is the same as the minimum in that case to give a
28579 conservative behavior of auto-vectorizing with SVE when it is a win
28580 even for 128-bit SVE.
28581 When SVE width information is available VAL.coeffs[1] is multiplied by
28582 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
28584 static HOST_WIDE_INT
28585 aarch64_estimated_poly_value (poly_int64 val,
28586 poly_value_estimate_kind kind
28587 = POLY_VALUE_LIKELY)
28589 unsigned int width_source = aarch64_tune_params.sve_width;
28591 /* If there is no core-specific information then the minimum and likely
28592 values are based on 128-bit vectors and the maximum is based on
28593 the architectural maximum of 2048 bits. */
28594 if (width_source == SVE_SCALABLE)
28595 switch (kind)
28597 case POLY_VALUE_MIN:
28598 case POLY_VALUE_LIKELY:
28599 return val.coeffs[0];
28600 case POLY_VALUE_MAX:
28601 return val.coeffs[0] + val.coeffs[1] * 15;
28604 /* Allow sve_width to be a bitmask of different VL, treating the lowest
28605 as likely. This could be made more general if future -mtune options
28606 need it to be. */
28607 if (kind == POLY_VALUE_MAX)
28608 width_source = 1 << floor_log2 (width_source);
28609 else
28610 width_source = least_bit_hwi (width_source);
28612 /* If the core provides width information, use that. */
28613 HOST_WIDE_INT over_128 = width_source - 128;
28614 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
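/* For example, for a poly_int64 of 2 + 2x (coeffs[0] == 2, coeffs[1] == 2):
   - generic SVE tuning (SVE_SCALABLE): the min/likely estimate is 2 and the
     max estimate is 2 + 2 * 15 == 32;
   - a core tuned for 256-bit SVE: all three estimates are
     2 + 2 * (256 - 128) / 128 == 4.  */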
28618 /* Return true for types that could be supported as SIMD return or
28619 argument types. */
28621 static bool
28622 supported_simd_type (tree t)
28624 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
28626 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
28627 return s == 1 || s == 2 || s == 4 || s == 8;
28629 return false;
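/* For example, int, double and void * qualify (1, 2, 4 or 8 bytes), whereas
   __int128 and long double (16 bytes each) and aggregate types do not.  */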
28632 /* Determine the lane size for the clone argument/return type. This follows
28633 the LS(P) rule in the VFABIA64. */
28635 static unsigned
28636 lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
28638 gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
28640 /* For non map-to-vector arguments that are pointers we use the type that
28641 the pointer points to. */
28642 if (POINTER_TYPE_P (type))
28643 switch (clone_arg_type)
28645 default:
28646 break;
28647 case SIMD_CLONE_ARG_TYPE_UNIFORM:
28648 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
28649 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
28650 type = TREE_TYPE (type);
28651 break;
28654 /* For types (or, for non map-to-vector pointers, the types they point to)
28655 that are integers or floating point, we use their size if it is 1, 2, 4 or 8 bytes. */
28657 if (INTEGRAL_TYPE_P (type)
28658 || SCALAR_FLOAT_TYPE_P (type))
28659 switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
28661 default:
28662 break;
28663 case 1:
28664 case 2:
28665 case 4:
28666 case 8:
28667 return TYPE_PRECISION (type);
28669 /* For any other type we use the size of uintptr_t. For map-to-vector types
28670 that are pointers, using the size of uintptr_t is the same as using the
28671 size of their own type, since all pointers are the same size as uintptr_t. */
28672 return POINTER_SIZE;
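/* As an illustration: a vector double argument gives a lane size of 64 bits;
   a uniform or linear int * argument uses the pointed-to int and gives
   32 bits; a vector (map-to-vector) pointer argument falls through to
   POINTER_SIZE.  */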
28676 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28678 static int
28679 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
28680 struct cgraph_simd_clone *clonei,
28681 tree base_type ATTRIBUTE_UNUSED,
28682 int num, bool explicit_p)
28684 tree t, ret_type;
28685 unsigned int nds_elt_bits;
28686 unsigned HOST_WIDE_INT const_simdlen;
28688 if (!TARGET_SIMD)
28689 return 0;
28691 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
28692 constant simdlens here. */
28693 if (maybe_ne (clonei->simdlen, 0U)
28694 && clonei->simdlen.is_constant (&const_simdlen)
28695 && (const_simdlen < 2
28696 || const_simdlen > 1024
28697 || (const_simdlen & (const_simdlen - 1)) != 0))
28699 if (explicit_p)
28700 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28701 "unsupported simdlen %wd", const_simdlen);
28702 return 0;
28705 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
28706 /* According to AArch64's Vector ABI the type that determines the simdlen is
28707 the narrowest of the return and argument types, so we ignore base_type for AArch64. */
28708 if (TREE_CODE (ret_type) != VOID_TYPE
28709 && !supported_simd_type (ret_type))
28711 if (!explicit_p)
28713 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
28714 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28715 "GCC does not currently support return type %qT "
28716 "for simd", ret_type);
28717 else
28718 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28719 "unsupported return type %qT for simd",
28720 ret_type);
28721 return 0;
28724 auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
28726 /* We are looking for the NDS type here according to the VFABIA64. */
28727 if (TREE_CODE (ret_type) != VOID_TYPE)
28729 nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
28730 vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
28732 else
28733 nds_elt_bits = POINTER_SIZE;
28735 int i;
28736 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
28737 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
28738 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
28739 t && t != void_list_node; t = TREE_CHAIN (t), i++)
28741 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
28742 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
28743 && !supported_simd_type (arg_type))
28745 if (!explicit_p)
28747 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
28748 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28749 "GCC does not currently support argument type %qT "
28750 "for simd", arg_type);
28751 else
28752 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28753 "unsupported argument type %qT for simd",
28754 arg_type);
28755 return 0;
28757 unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
28758 if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
28759 vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
28760 if (nds_elt_bits > lane_bits)
28761 nds_elt_bits = lane_bits;
28764 clonei->vecsize_mangle = 'n';
28765 clonei->mask_mode = VOIDmode;
28766 poly_uint64 simdlen;
28767 auto_vec<poly_uint64> simdlens (2);
28768 /* Keep track of the possible simdlens the clones of this function can have,
28769 and check them later to see if we support them. */
28770 if (known_eq (clonei->simdlen, 0U))
28772 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28773 if (maybe_ne (simdlen, 1U))
28774 simdlens.safe_push (simdlen);
28775 simdlens.safe_push (simdlen * 2);
28777 else
28778 simdlens.safe_push (clonei->simdlen);
28780 clonei->vecsize_int = 0;
28781 clonei->vecsize_float = 0;
28783 /* We currently do not support generating simdclones where vector arguments
28784 do not fit into a single vector register, i.e. vector types that are more
28785 than 128 bits wide. This is because of how we currently represent such
28786 types in ACLE, where we use a struct to allow us to pass them as arguments
28787 and return values.
28788 Hence we have to check whether the simdlens available for this
28789 simdclone would cause a vector type to be wider than 128 bits, and reject
28790 such a clone. */
28791 unsigned j = 0;
28792 while (j < simdlens.length ())
28794 bool remove_simdlen = false;
28795 for (auto elt : vec_elts)
28796 if (known_gt (simdlens[j] * elt.second, 128U))
28798 /* Don't issue a warning for every simdclone when there is no
28799 specific simdlen clause. */
28800 if (explicit_p && maybe_ne (clonei->simdlen, 0U))
28801 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28802 "GCC does not currently support simdlen %wd for "
28803 "type %qT",
28804 constant_lower_bound (simdlens[j]), elt.first);
28805 remove_simdlen = true;
28806 break;
28808 if (remove_simdlen)
28809 simdlens.ordered_remove (j);
28810 else
28811 j++;
28815 int count = simdlens.length ();
28816 if (count == 0)
28818 if (explicit_p && known_eq (clonei->simdlen, 0U))
28820 /* Warn the user if we can't generate any simdclone. */
28821 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28822 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28823 "GCC does not currently support a simdclone with simdlens"
28824 " %wd and %wd for these types.",
28825 constant_lower_bound (simdlen),
28826 constant_lower_bound (simdlen*2));
28828 return 0;
28831 gcc_assert (num < count);
28832 clonei->simdlen = simdlens[num];
28833 return count;
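/* For example, for a declare-simd function float f (float) with no explicit
   simdlen, the NDS is 32 bits, so the candidate simdlens are 64/32 == 2 and
   2 * 2 == 4; both keep every vector within 128 bits, so two Advanced SIMD
   clones are advertised.  For double f (double) the NDS is 64 bits, the
   simdlen-1 candidate is dropped and only the simdlen-2 clone remains.  */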
28836 /* Implement TARGET_SIMD_CLONE_ADJUST. */
28838 static void
28839 aarch64_simd_clone_adjust (struct cgraph_node *node)
28841 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
28842 use the correct ABI. */
28844 tree t = TREE_TYPE (node->decl);
28845 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
28846 TYPE_ATTRIBUTES (t));
28849 /* Implement TARGET_SIMD_CLONE_USABLE. */
28851 static int
28852 aarch64_simd_clone_usable (struct cgraph_node *node)
28854 switch (node->simdclone->vecsize_mangle)
28856 case 'n':
28857 if (!TARGET_SIMD)
28858 return -1;
28859 return 0;
28860 default:
28861 gcc_unreachable ();
28865 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
28867 static int
28868 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
28870 auto check_attr = [&](const char *ns, const char *name) {
28871 tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
28872 tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
28873 if (!attr1 && !attr2)
28874 return true;
28876 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
28879 if (!check_attr ("gnu", "aarch64_vector_pcs"))
28880 return 0;
28881 if (!check_attr ("gnu", "Advanced SIMD type"))
28882 return 0;
28883 if (!check_attr ("gnu", "SVE type"))
28884 return 0;
28885 if (!check_attr ("gnu", "SVE sizeless type"))
28886 return 0;
28887 if (!check_attr ("arm", "streaming"))
28888 return 0;
28889 if (!check_attr ("arm", "streaming_compatible"))
28890 return 0;
28891 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
28892 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
28893 return 0;
28894 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
28895 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
28896 return 0;
28897 return 1;
28900 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
28902 static tree
28903 aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
28905 tree old_attrs = DECL_ATTRIBUTES (olddecl);
28906 tree old_new = lookup_attribute ("arm", "new", old_attrs);
28908 tree new_attrs = DECL_ATTRIBUTES (newdecl);
28909 tree new_new = lookup_attribute ("arm", "new", new_attrs);
28911 if (DECL_INITIAL (olddecl) && new_new)
28913 error ("cannot apply attribute %qs to %q+D after the function"
28914 " has been defined", "new", newdecl);
28915 inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
28916 newdecl);
28918 else
28920 if (old_new && new_new)
28922 old_attrs = remove_attribute ("arm", "new", old_attrs);
28923 TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
28924 TREE_VALUE (old_new));
28926 if (new_new)
28927 aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
28930 return merge_attributes (old_attrs, new_attrs);
28933 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
28935 static const char *
28936 aarch64_get_multilib_abi_name (void)
28938 if (TARGET_BIG_END)
28939 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
28940 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
28943 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
28944 global-variable-based guard, use the default; otherwise
28945 return a null tree. */
28946 static tree
28947 aarch64_stack_protect_guard (void)
28949 if (aarch64_stack_protector_guard == SSP_GLOBAL)
28950 return default_stack_protect_guard ();
28952 return NULL_TREE;
28955 /* Return the diagnostic message string if the binary operation OP is
28956 not permitted on TYPE1 and TYPE2, NULL otherwise. */
28958 static const char *
28959 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
28960 const_tree type2)
28962 if (VECTOR_TYPE_P (type1)
28963 && VECTOR_TYPE_P (type2)
28964 && !TYPE_INDIVISIBLE_P (type1)
28965 && !TYPE_INDIVISIBLE_P (type2)
28966 && (aarch64_sve::builtin_type_p (type1)
28967 != aarch64_sve::builtin_type_p (type2)))
28968 return N_("cannot combine GNU and SVE vectors in a binary operation");
28970 /* Operation allowed. */
28971 return NULL;
28974 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
28975 compiler that we automatically ignore the top byte of our pointers, which
28976 allows using -fsanitize=hwaddress. */
28977 bool
28978 aarch64_can_tag_addresses ()
28980 return !TARGET_ILP32;
28983 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
28984 section at the end if needed. */
28985 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
28986 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
28987 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
28988 void
28989 aarch64_file_end_indicate_exec_stack ()
28991 file_end_indicate_exec_stack ();
28993 unsigned feature_1_and = 0;
28994 if (aarch_bti_enabled ())
28995 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
28997 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
28998 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
29000 if (feature_1_and)
29002 /* Generate .note.gnu.property section. */
29003 switch_to_section (get_section (".note.gnu.property",
29004 SECTION_NOTYPE, NULL));
29006 /* PT_NOTE header: namesz, descsz, type.
29007 namesz = 4 ("GNU\0")
29008 descsz = 16 (Size of the program property array)
29009 [(12 + padding) * Number of array elements]
29010 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
29011 assemble_align (POINTER_SIZE);
29012 assemble_integer (GEN_INT (4), 4, 32, 1);
29013 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
29014 assemble_integer (GEN_INT (5), 4, 32, 1);
29016 /* PT_NOTE name. */
29017 assemble_string ("GNU", 4);
29019 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
29020 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
29021 datasz = 4
29022 data = feature_1_and. */
29023 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
29024 assemble_integer (GEN_INT (4), 4, 32, 1);
29025 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
29027 /* Pad the size of the note to the required alignment. */
29028 assemble_align (POINTER_SIZE);
29031 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
29032 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
29033 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
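/* When both BTI and return-address signing are enabled, the note emitted
   above looks roughly like this on a 64-bit little-endian ELF target:

	.section	.note.gnu.property
	.align	3
	.word	4		// namesz ("GNU\0")
	.word	16		// descsz (12 rounded up to 16)
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.align	3  */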
29035 /* Helper function for straight line speculation.
29036 Return what barrier should be emitted for straight line speculation
29037 mitigation.
29038 When not mitigating against straight line speculation this function returns
29039 an empty string.
29040 When mitigating against straight line speculation, use:
29041 * SB when the v8.5-A SB extension is enabled.
29042 * DSB+ISB otherwise. */
29043 const char *
29044 aarch64_sls_barrier (int mitigation_required)
29046 return mitigation_required
29047 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
29048 : "";
29051 static GTY (()) tree aarch64_sls_shared_thunks[30];
29052 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
29053 const char *indirect_symbol_names[30] = {
29054 "__call_indirect_x0",
29055 "__call_indirect_x1",
29056 "__call_indirect_x2",
29057 "__call_indirect_x3",
29058 "__call_indirect_x4",
29059 "__call_indirect_x5",
29060 "__call_indirect_x6",
29061 "__call_indirect_x7",
29062 "__call_indirect_x8",
29063 "__call_indirect_x9",
29064 "__call_indirect_x10",
29065 "__call_indirect_x11",
29066 "__call_indirect_x12",
29067 "__call_indirect_x13",
29068 "__call_indirect_x14",
29069 "__call_indirect_x15",
29070 "", /* "__call_indirect_x16", */
29071 "", /* "__call_indirect_x17", */
29072 "__call_indirect_x18",
29073 "__call_indirect_x19",
29074 "__call_indirect_x20",
29075 "__call_indirect_x21",
29076 "__call_indirect_x22",
29077 "__call_indirect_x23",
29078 "__call_indirect_x24",
29079 "__call_indirect_x25",
29080 "__call_indirect_x26",
29081 "__call_indirect_x27",
29082 "__call_indirect_x28",
29083 "__call_indirect_x29",
29086 /* Function to create a BLR thunk. This thunk is used to mitigate straight
29087 line speculation. Instead of a simple BLR that can be speculated past,
29088 we emit a BL to this thunk, and this thunk contains a BR to the relevant
29089 register. These thunks have the relevant speculation barriers put after
29090 their indirect branch so that speculation is blocked.
29092 We use such a thunk so the speculation barriers are kept off the
29093 architecturally executed path in order to reduce the performance overhead.
29095 When optimizing for size we use stubs shared by the linked object.
29096 When optimizing for performance we emit stubs for each function in the hope
29097 that the branch predictor can better train on jumps specific for a given
29098 function. */
29100 aarch64_sls_create_blr_label (int regnum)
29102 gcc_assert (STUB_REGNUM_P (regnum));
29103 if (optimize_function_for_size_p (cfun))
29105 /* For the thunks shared between different functions in this compilation
29106 unit we use a named symbol -- this is just for users to more easily
29107 understand the generated assembly. */
29108 aarch64_sls_shared_thunks_needed = true;
29109 const char *thunk_name = indirect_symbol_names[regnum];
29110 if (aarch64_sls_shared_thunks[regnum] == NULL)
29112 /* Build a decl representing this function stub and record it for
29113 later. We build a decl here so we can use the GCC machinery for
29114 handling sections automatically (through `get_named_section` and
29115 `make_decl_one_only`). That saves us a lot of trouble handling
29116 the specifics of different output file formats. */
29117 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
29118 get_identifier (thunk_name),
29119 build_function_type_list (void_type_node,
29120 NULL_TREE));
29121 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
29122 NULL_TREE, void_type_node);
29123 TREE_PUBLIC (decl) = 1;
29124 TREE_STATIC (decl) = 1;
29125 DECL_IGNORED_P (decl) = 1;
29126 DECL_ARTIFICIAL (decl) = 1;
29127 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29128 resolve_unique_section (decl, 0, false);
29129 aarch64_sls_shared_thunks[regnum] = decl;
29132 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
29135 if (cfun->machine->call_via[regnum] == NULL)
29136 cfun->machine->call_via[regnum]
29137 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
29138 return cfun->machine->call_via[regnum];
29141 /* Helper function for aarch64_sls_emit_blr_function_thunks and
29142 aarch64_sls_emit_shared_blr_thunks below. */
29143 static void
29144 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
29146 /* Save in x16 and branch to that function so this transformation does
29147 not prevent jumping to `BTI c` instructions. */
29148 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
29149 asm_fprintf (out_file, "\tbr\tx16\n");
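/* A shared stub for x1, as emitted by aarch64_sls_emit_shared_blr_thunks
   below, therefore looks like:

	__call_indirect_x1:
		mov	x16, x1
		br	x16
		dsb	sy
		isb  */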
29152 /* Emit all BLR stubs for this particular function.
29153 Here we emit all the BLR stubs needed for the current function. Since we
29154 emit these stubs in a consecutive block we know there will be no speculation
29155 gadgets between each stub, and hence we only emit a speculation barrier at
29156 the end of the stub sequences.
29158 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29159 void
29160 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
29162 if (! aarch64_harden_sls_blr_p ())
29163 return;
29165 bool any_functions_emitted = false;
29166 /* We must save and restore the current function section since this assembly
29167 is emitted at the end of the function. This means it can be emitted *just
29168 after* the cold section of a function. That cold part would be emitted in
29169 a different section. That switch would trigger a `.cfi_endproc` directive
29170 to be emitted in the original section and a `.cfi_startproc` directive to
29171 be emitted in the new section. Switching to the original section without
29172 restoring would mean that the `.cfi_endproc` emitted as a function ends
29173 would happen in a different section -- leaving an unmatched
29174 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29175 in the standard text section. */
29176 section *save_text_section = in_section;
29177 switch_to_section (function_section (current_function_decl));
29178 for (int regnum = 0; regnum < 30; ++regnum)
29180 rtx specu_label = cfun->machine->call_via[regnum];
29181 if (specu_label == NULL)
29182 continue;
29184 targetm.asm_out.print_operand (out_file, specu_label, 0);
29185 asm_fprintf (out_file, ":\n");
29186 aarch64_sls_emit_function_stub (out_file, regnum);
29187 any_functions_emitted = true;
29189 if (any_functions_emitted)
29190 /* Can use the SB if need be here, since this stub will only be used
29191 by the current function, and hence for the current target. */
29192 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
29193 switch_to_section (save_text_section);
29196 /* Emit shared BLR stubs for the current compilation unit.
29197 Over the course of compiling this unit we may have converted some BLR
29198 instructions to a BL to a shared stub function. This is where we emit those
29199 stub functions.
29200 This function is for the stubs shared between different functions in this
29201 compilation unit. We share when optimizing for size instead of speed.
29203 This function is called through the TARGET_ASM_FILE_END hook. */
29204 void
29205 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
29207 if (! aarch64_sls_shared_thunks_needed)
29208 return;
29210 for (int regnum = 0; regnum < 30; ++regnum)
29212 tree decl = aarch64_sls_shared_thunks[regnum];
29213 if (!decl)
29214 continue;
29216 const char *name = indirect_symbol_names[regnum];
29217 switch_to_section (get_named_section (decl, NULL, 0));
29218 ASM_OUTPUT_ALIGN (out_file, 2);
29219 targetm.asm_out.globalize_label (out_file, name);
29220 /* This only emits a directive if the compiler is configured for an
29221 assembler that can handle visibility directives. */
29222 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
29223 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
29224 ASM_OUTPUT_LABEL (out_file, name);
29225 aarch64_sls_emit_function_stub (out_file, regnum);
29226 /* Use the most conservative target to ensure it can always be used by any
29227 function in the translation unit. */
29228 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
29229 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
29233 /* Implement TARGET_ASM_FILE_END. */
29234 void
29235 aarch64_asm_file_end ()
29237 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
29238 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29239 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29240 for FreeBSD) still gets called. */
29241 #ifdef TARGET_ASM_FILE_END
29242 TARGET_ASM_FILE_END ();
29243 #endif
29246 const char *
29247 aarch64_indirect_call_asm (rtx addr)
29249 gcc_assert (REG_P (addr));
29250 if (aarch64_harden_sls_blr_p ())
29252 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
29253 output_asm_insn ("bl\t%0", &stub_label);
29255 else
29256 output_asm_insn ("blr\t%0", &addr);
29257 return "";
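/* So, when hardening against SLS, an indirect call through x1 is emitted as
   "bl __call_indirect_x1" (or a BL to a per-function local label when
   optimizing for speed) instead of "blr x1".  */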
29260 /* Emit the assembly instruction to load the thread pointer into DEST.
29261 Select between different tpidr_elN registers depending on -mtp= setting. */
29263 const char *
29264 aarch64_output_load_tp (rtx dest)
29266 const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29267 "tpidr_el3", "tpidrro_el0"};
29268 char buffer[64];
29269 snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
29270 tpidrs[aarch64_tpidr_register]);
29271 output_asm_insn (buffer, &dest);
29272 return "";
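/* For example, with the usual default selection of tpidr_el0 this prints
   "mrs	<dest>, tpidr_el0"; the last table entry selects the read-only
   tpidrro_el0 instead.  */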
29275 /* Set up the value of REG_ALLOC_ORDER from scratch.
29277 It was previously good practice to put call-clobbered registers ahead
29278 of call-preserved registers, but that isn't necessary these days.
29279 IRA's model of register save/restore costs is much more sophisticated
29280 than the model that a simple ordering could provide. We leave
29281 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29282 of IRA's model.
29284 However, it is still useful to list registers that are members of
29285 multiple classes after registers that are members of fewer classes.
29286 For example, we have:
29288 - FP_LO8_REGS: v0-v7
29289 - FP_LO_REGS: v0-v15
29290 - FP_REGS: v0-v31
29292 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29293 we run the risk of starving other (lower-priority) pseudos that
29294 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29295 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29296 Allocating downwards rather than upwards avoids this problem, at least
29297 in code that has reasonable register pressure.
29299 The situation for predicate registers is similar. */
29301 void
29302 aarch64_adjust_reg_alloc_order ()
29304 for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
29305 if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
29306 reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
29307 else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
29308 reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
29309 else
29310 reg_alloc_order[i] = i;
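/* The effect is that reg_alloc_order lists v31, v30, ..., v0 and
   p15, p14, ..., p0, so pseudos that can live in any FP or predicate
   register are steered away from the registers that the more restricted
   classes (FP_LO8_REGS, FP_LO_REGS, PR_LO_REGS) need.  */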
29313 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29314 of vector mode MODE to select half the elements of that vector.
29315 Allow any combination of indices except duplicates (or out of range of
29316 the mode units). */
29318 bool
29319 aarch64_parallel_select_half_p (machine_mode mode, rtx par)
29321 int nunits = XVECLEN (par, 0);
29322 if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
29323 return false;
29324 int mode_nunits = nunits * 2;
29325 /* Put all the elements of PAR into a hash_set and use its
29326 uniqueness guarantees to check that we don't try to insert the same
29327 element twice. */
29328 hash_set<rtx> parset;
29329 for (int i = 0; i < nunits; ++i)
29331 rtx elt = XVECEXP (par, 0, i);
29332 if (!CONST_INT_P (elt)
29333 || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
29334 || parset.add (elt))
29335 return false;
29337 return true;
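/* For V4SImode, for example, a PARALLEL of two distinct in-range indices
   such as [0 2] or [3 1] is accepted, whereas [0 0] (duplicate), [0 4]
   (out of range) or a three-element PARALLEL (not half the vector) is
   rejected.  */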
29340 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29341 contain any common elements. */
29343 bool
29344 aarch64_pars_overlap_p (rtx par1, rtx par2)
29346 int len1 = XVECLEN (par1, 0);
29347 int len2 = XVECLEN (par2, 0);
29348 hash_set<rtx> parset;
29349 for (int i = 0; i < len1; ++i)
29350 parset.add (XVECEXP (par1, 0, i));
29351 for (int i = 0; i < len2; ++i)
29352 if (parset.contains (XVECEXP (par2, 0, i)))
29353 return true;
29354 return false;
29357 /* Implement OPTIMIZE_MODE_SWITCHING. */
29359 bool
29360 aarch64_optimize_mode_switching (aarch64_mode_entity entity)
29362 bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
29363 || (aarch64_cfun_has_new_state ("za")
29364 && df_regs_ever_live_p (ZA_REGNUM))
29365 || (aarch64_cfun_has_new_state ("zt0")
29366 && df_regs_ever_live_p (ZT0_REGNUM)));
29368 if (have_sme_state && nonlocal_goto_handler_labels)
29370 static bool reported;
29371 if (!reported)
29373 sorry ("non-local gotos in functions with SME state");
29374 reported = true;
29378 switch (entity)
29380 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29381 case aarch64_mode_entity::LOCAL_SME_STATE:
29382 return have_sme_state && !nonlocal_goto_handler_labels;
29384 gcc_unreachable ();
29387 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29389 static void
29390 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
29391 aarch64_tristate_mode prev_mode)
29393 if (mode == aarch64_tristate_mode::YES)
29395 gcc_assert (prev_mode == aarch64_tristate_mode::NO);
29396 aarch64_init_tpidr2_block ();
29398 else
29399 gcc_unreachable ();
29402 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29404 static void
29405 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
29406 aarch64_local_sme_state prev_mode)
29408 /* Back-propagation should ensure that we're always starting from
29409 a known mode. */
29410 gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
29412 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29414 /* Commit any uncommitted lazy save. This leaves ZA either active
29415 and zero (lazy save case) or off (normal case).
29417 The sequence is:
29419 mrs <temp>, tpidr2_el0
29420 cbz <temp>, no_save
29421 bl __arm_tpidr2_save
29422 msr tpidr2_el0, xzr
29423 zero { za } // Only if ZA is live
29424 zero { zt0 } // Only if ZT0 is live
29425 no_save: */
29426 auto tmp_reg = gen_reg_rtx (DImode);
29427 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
29428 auto label = gen_label_rtx ();
29429 rtx branch = aarch64_gen_compare_zero_and_branch (EQ, tmp_reg, label);
29430 auto jump = emit_jump_insn (branch);
29431 JUMP_LABEL (jump) = label;
29432 emit_insn (gen_aarch64_tpidr2_save ());
29433 emit_insn (gen_aarch64_clear_tpidr2 ());
29434 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29435 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29437 if (aarch64_cfun_has_state ("za"))
29438 emit_insn (gen_aarch64_initial_zero_za ());
29439 if (aarch64_cfun_has_state ("zt0"))
29440 emit_insn (gen_aarch64_sme_zero_zt0 ());
29442 emit_label (label);
29445 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29446 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29448 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29450 /* Make ZA active after being inactive.
29452 First handle the case in which the lazy save we set up was
29453 committed by a callee. If the function's source-level ZA state
29454 is live then we must conditionally restore it from the lazy
29455 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29456 if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
29457 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29458 else
29459 emit_insn (gen_aarch64_smstart_za ());
29461 /* Now handle the case in which the lazy save was not committed.
29462 In that case, ZA still contains the current function's ZA state,
29463 and we just need to cancel the lazy save. */
29464 emit_insn (gen_aarch64_clear_tpidr2 ());
29466 /* Restore the ZT0 state, if we have some. */
29467 if (aarch64_cfun_has_state ("zt0"))
29468 aarch64_restore_zt0 (true);
29470 return;
29473 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
29475 /* Retrieve the current function's ZA state from the lazy save
29476 buffer. */
29477 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29479 /* Restore the ZT0 state, if we have some. */
29480 if (aarch64_cfun_has_state ("zt0"))
29481 aarch64_restore_zt0 (true);
29482 return;
29485 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
29486 || prev_mode == aarch64_local_sme_state::OFF)
29488 /* INACTIVE_CALLER means that we are enabling ZA for the first
29489 time in this function. The code above means that ZA is either
29490 active and zero (if we committed a lazy save) or off. Handle
29491 the latter case by forcing ZA on.
29493 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29494 to force it to 1.
29496 Both cases leave ZA zeroed. */
29497 emit_insn (gen_aarch64_smstart_za ());
29499 /* Restore the ZT0 state, if we have some. */
29500 if (prev_mode == aarch64_local_sme_state::OFF
29501 && aarch64_cfun_has_state ("zt0"))
29502 aarch64_restore_zt0 (true);
29503 return;
29506 if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29507 || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
29508 /* A simple change in liveness, such as in a CFG structure where
29509 ZA is only conditionally defined. No code is needed. */
29510 return;
29512 gcc_unreachable ();
29515 if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29517 if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29518 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29519 || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29521 /* Save the ZT0 state, if we have some. */
29522 if (aarch64_cfun_has_state ("zt0"))
29523 aarch64_save_zt0 ();
29525 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29526 case of setting up a lazy save buffer before a call.
29527 A transition from INACTIVE_CALLER is similar, except that
29528 the contents of ZA are known to be zero.
29530 A transition from ACTIVE_DEAD means that ZA is live at the
29531 point of the transition, but is dead on at least one incoming
29532 edge. (That is, ZA is only conditionally initialized.)
29533 For efficiency, we want to set up a lazy save even for
29534 dead contents, since forcing ZA off would make later code
29535 restore ZA from the lazy save buffer. */
29536 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29537 return;
29540 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
29541 || prev_mode == aarch64_local_sme_state::OFF)
29542 /* We're simply discarding the information about which inactive
29543 state applies. */
29544 return;
29546 gcc_unreachable ();
29549 if (mode == aarch64_local_sme_state::INACTIVE_CALLER
29550 || mode == aarch64_local_sme_state::OFF)
29552 /* Save the ZT0 state, if we have some. */
29553 if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29554 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
29555 && mode == aarch64_local_sme_state::OFF
29556 && aarch64_cfun_has_state ("zt0"))
29557 aarch64_save_zt0 ();
29559 /* The transition to INACTIVE_CALLER is used before returning from
29560 new("za") functions. Any state in ZA belongs to the current
29561 function rather than a caller, but that state is no longer
29562 needed. Clear any pending lazy save and turn ZA off.
29564 The transition to OFF is used before calling a private-ZA function.
29565 We committed any incoming lazy save above, so at this point any
29566 contents in ZA belong to the current function. */
29567 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29568 emit_insn (gen_aarch64_clear_tpidr2 ());
29570 if (prev_mode != aarch64_local_sme_state::OFF
29571 && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
29572 emit_insn (gen_aarch64_smstop_za ());
29574 return;
29577 if (mode == aarch64_local_sme_state::SAVED_LOCAL)
29579 /* This is a transition to an exception handler. */
29580 gcc_assert (prev_mode == aarch64_local_sme_state::OFF
29581 || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
29582 return;
29585 gcc_unreachable ();
29588 /* Implement TARGET_MODE_EMIT. */
29590 static void
29591 aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
29593 if (mode == prev_mode)
29594 return;
29596 start_sequence ();
29597 switch (aarch64_mode_entity (entity))
29599 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29600 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
29601 aarch64_tristate_mode (prev_mode));
29602 break;
29604 case aarch64_mode_entity::LOCAL_SME_STATE:
29605 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
29606 aarch64_local_sme_state (prev_mode));
29607 break;
29609 rtx_insn *seq = get_insns ();
29610 end_sequence ();
29612 /* Get the set of clobbered registers that are currently live. */
29613 HARD_REG_SET clobbers = {};
29614 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
29616 if (!NONDEBUG_INSN_P (insn))
29617 continue;
29618 vec_rtx_properties properties;
29619 properties.add_insn (insn, false);
29620 for (rtx_obj_reference ref : properties.refs ())
29621 if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
29622 SET_HARD_REG_BIT (clobbers, ref.regno);
29624 clobbers &= live;
29626 /* Emit instructions to save clobbered registers to pseudos. Queue
29627 instructions to restore the registers afterwards.
29629 This should only be needed in rare situations. */
29630 auto_vec<rtx, 33> after;
29631 for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
29632 if (TEST_HARD_REG_BIT (clobbers, regno))
29634 rtx hard_reg = gen_rtx_REG (DImode, regno);
29635 rtx pseudo_reg = gen_reg_rtx (DImode);
29636 emit_move_insn (pseudo_reg, hard_reg);
29637 after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
29639 if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
29641 rtx pseudo_reg = gen_reg_rtx (DImode);
29642 emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
29643 after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
29646 /* Emit the transition instructions themselves. */
29647 emit_insn (seq);
29649 /* Restore the clobbered registers. */
29650 for (auto *insn : after)
29651 emit_insn (insn);
29654 /* Return true if INSN references the SME state represented by hard register
29655 REGNO. */
29657 static bool
29658 aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
29660 df_ref ref;
29661 FOR_EACH_INSN_DEF (ref, insn)
29662 if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
29663 && DF_REF_REGNO (ref) == regno)
29664 return true;
29665 FOR_EACH_INSN_USE (ref, insn)
29666 if (DF_REF_REGNO (ref) == regno)
29667 return true;
29668 return false;
29671 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29673 static aarch64_local_sme_state
29674 aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
29676 if (!CALL_P (insn)
29677 && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29679 static bool reported;
29680 if (!reported)
29682 sorry ("catching non-call exceptions in functions with SME state");
29683 reported = true;
29685 /* Aim for graceful error recovery by picking the value that is
29686 least likely to generate an ICE. */
29687 return aarch64_local_sme_state::INACTIVE_LOCAL;
29690 /* A non-local goto is equivalent to a return. We disallow non-local
29691 receivers in functions with SME state, so we know that the target
29692 expects ZA to be dormant or off. */
29693 if (JUMP_P (insn)
29694 && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
29695 return aarch64_local_sme_state::INACTIVE_CALLER;
29697 /* start_private_za_call and end_private_za_call bracket a sequence
29698 that calls a private-ZA function. Force ZA to be turned off if the
29699 function doesn't have any live ZA state, otherwise require ZA to be
29700 inactive. */
29701 auto icode = recog_memoized (insn);
29702 if (icode == CODE_FOR_aarch64_start_private_za_call
29703 || icode == CODE_FOR_aarch64_end_private_za_call)
29704 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29705 ? aarch64_local_sme_state::INACTIVE_LOCAL
29706 : aarch64_local_sme_state::OFF);
29708 /* Force ZA to contain the current function's ZA state if INSN wants
29709 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29710 are both controlled by PSTATE.ZA. */
29711 if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
29712 || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
29713 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29714 ? aarch64_local_sme_state::ACTIVE_LIVE
29715 : aarch64_local_sme_state::ACTIVE_DEAD);
29717 return aarch64_local_sme_state::ANY;
29720 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29722 static aarch64_tristate_mode
29723 aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
29725 /* We need to set up a lazy save buffer no later than the first
29726 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29727 if (aarch64_mode_needed_local_sme_state (insn, live)
29728 == aarch64_local_sme_state::INACTIVE_LOCAL)
29729 return aarch64_tristate_mode::YES;
29731 /* Also make sure that the lazy save buffer is set up before the first
29732 insn that throws internally. The exception handler will sometimes
29733 load from it. */
29734 if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29735 return aarch64_tristate_mode::YES;
29737 return aarch64_tristate_mode::MAYBE;
29740 /* Implement TARGET_MODE_NEEDED. */
29742 static int
29743 aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
29745 switch (aarch64_mode_entity (entity))
29747 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29748 return int (aarch64_mode_needed_za_save_buffer (insn, live));
29750 case aarch64_mode_entity::LOCAL_SME_STATE:
29751 return int (aarch64_mode_needed_local_sme_state (insn, live));
29753 gcc_unreachable ();
29756 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29758 static aarch64_local_sme_state
29759 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
29760 HARD_REG_SET live)
29762 /* Note places where ZA dies, so that we can try to avoid saving and
29763 restoring state that isn't needed. */
29764 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29765 && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
29766 return aarch64_local_sme_state::ACTIVE_DEAD;
29768 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29769 function. */
29770 if (mode == aarch64_local_sme_state::ACTIVE_DEAD
29771 && TEST_HARD_REG_BIT (live, ZA_REGNUM))
29772 return aarch64_local_sme_state::ACTIVE_LIVE;
29774 return mode;
29777 /* Implement TARGET_MODE_AFTER. */
29779 static int
29780 aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
29782 switch (aarch64_mode_entity (entity))
29784 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29785 return mode;
29787 case aarch64_mode_entity::LOCAL_SME_STATE:
29788 return int (aarch64_mode_after_local_sme_state
29789 (aarch64_local_sme_state (mode), live));
29791 gcc_unreachable ();
29794 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
29796 static aarch64_local_sme_state
29797 aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
29798 aarch64_local_sme_state mode2)
29800 /* Perform a symmetrical check for two values. */
29801 auto is_pair = [&](aarch64_local_sme_state val1,
29802 aarch64_local_sme_state val2)
29804 return ((mode1 == val1 && mode2 == val2)
29805 || (mode1 == val2 && mode2 == val1));
29808 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
29809 to a caller. OFF is one of the options. */
29810 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
29811 aarch64_local_sme_state::OFF))
29812 return aarch64_local_sme_state::INACTIVE_CALLER;
29814 /* Similarly for dormant contents belonging to the current function. */
29815 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
29816 aarch64_local_sme_state::OFF))
29817 return aarch64_local_sme_state::INACTIVE_LOCAL;
29819 /* Treat a conditionally-initialized value as a fully-initialized value. */
29820 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
29821 aarch64_local_sme_state::ACTIVE_DEAD))
29822 return aarch64_local_sme_state::ACTIVE_LIVE;
29824 return aarch64_local_sme_state::ANY;
29827 /* Implement TARGET_MODE_CONFLUENCE. */
29829 static int
29830 aarch64_mode_confluence (int entity, int mode1, int mode2)
29832 gcc_assert (mode1 != mode2);
29833 switch (aarch64_mode_entity (entity))
29835 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29836 return int (aarch64_tristate_mode::MAYBE);
29838 case aarch64_mode_entity::LOCAL_SME_STATE:
29839 return int (aarch64_local_sme_confluence
29840 (aarch64_local_sme_state (mode1),
29841 aarch64_local_sme_state (mode2)));
29843 gcc_unreachable ();
29846 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
29847 NO throughout, or makes one transition from NO to YES. */
29849 static aarch64_tristate_mode
29850 aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
29851 aarch64_tristate_mode mode2)
29853 /* Keep bringing the transition forward until it starts from NO. */
29854 if (mode1 == aarch64_tristate_mode::MAYBE
29855 && mode2 == aarch64_tristate_mode::YES)
29856 return mode2;
29858 return aarch64_tristate_mode::MAYBE;
29861 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
29863 static aarch64_local_sme_state
29864 aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
29865 aarch64_local_sme_state mode2)
29867 /* We always need to know what the current state is when transitioning
29868 to a new state. Force any location with indeterminate starting state
29869 to be active. */
29870 if (mode1 == aarch64_local_sme_state::ANY)
29871 switch (mode2)
29873 case aarch64_local_sme_state::INACTIVE_CALLER:
29874 case aarch64_local_sme_state::OFF:
29875 case aarch64_local_sme_state::ACTIVE_DEAD:
29876 /* The current function's ZA state is not live. */
29877 return aarch64_local_sme_state::ACTIVE_DEAD;
29879 case aarch64_local_sme_state::INACTIVE_LOCAL:
29880 case aarch64_local_sme_state::ACTIVE_LIVE:
29881 /* The current function's ZA state is live. */
29882 return aarch64_local_sme_state::ACTIVE_LIVE;
29884 case aarch64_local_sme_state::SAVED_LOCAL:
29885 /* This is a transition to an exception handler. Since we don't
29886 support non-call exceptions for SME functions, the source of
29887 the transition must be known. We'll assert later if that's
29888 not the case. */
29889 return aarch64_local_sme_state::ANY;
29891 case aarch64_local_sme_state::ANY:
29892 return aarch64_local_sme_state::ANY;
29895 return aarch64_local_sme_state::ANY;
29898 /* Implement TARGET_MODE_BACKPROP. */
29900 static int
29901 aarch64_mode_backprop (int entity, int mode1, int mode2)
29903 switch (aarch64_mode_entity (entity))
29905 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29906 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
29907 aarch64_tristate_mode (mode2)));
29909 case aarch64_mode_entity::LOCAL_SME_STATE:
29910 return int (aarch64_local_sme_backprop
29911 (aarch64_local_sme_state (mode1),
29912 aarch64_local_sme_state (mode2)));
29914 gcc_unreachable ();
29917 /* Implement TARGET_MODE_ENTRY. */
29919 static int
29920 aarch64_mode_entry (int entity)
29922 switch (aarch64_mode_entity (entity))
29924 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29925 return int (aarch64_tristate_mode::NO);
29927 case aarch64_mode_entity::LOCAL_SME_STATE:
29928 return int (aarch64_cfun_shared_flags ("za") != 0
29929 ? aarch64_local_sme_state::ACTIVE_LIVE
29930 : aarch64_cfun_incoming_pstate_za () != 0
29931 ? aarch64_local_sme_state::ACTIVE_DEAD
29932 : aarch64_local_sme_state::INACTIVE_CALLER);
29934 gcc_unreachable ();
29937 /* Implement TARGET_MODE_EXIT. */
29939 static int
29940 aarch64_mode_exit (int entity)
29942 switch (aarch64_mode_entity (entity))
29944 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29945 return int (aarch64_tristate_mode::MAYBE);
29947 case aarch64_mode_entity::LOCAL_SME_STATE:
29948 return int (aarch64_cfun_shared_flags ("za") != 0
29949 ? aarch64_local_sme_state::ACTIVE_LIVE
29950 : aarch64_cfun_incoming_pstate_za () != 0
29951 ? aarch64_local_sme_state::ACTIVE_DEAD
29952 : aarch64_local_sme_state::INACTIVE_CALLER);
29954 gcc_unreachable ();
29957 /* Implement TARGET_MODE_EH_HANDLER. */
29959 static int
29960 aarch64_mode_eh_handler (int entity)
29962 switch (aarch64_mode_entity (entity))
29964 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29965 /* Require a lazy save buffer to be allocated before the first
29966 insn that can throw. */
29967 return int (aarch64_tristate_mode::YES);
29969 case aarch64_mode_entity::LOCAL_SME_STATE:
29970 return int (aarch64_local_sme_state::SAVED_LOCAL);
29972 gcc_unreachable ();
29975 /* Implement TARGET_MODE_PRIORITY. */
29977 static int
29978 aarch64_mode_priority (int, int n)
29980 return n;
29983 /* Implement TARGET_MD_ASM_ADJUST. */
29985 static rtx_insn *
29986 aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
29987 vec<machine_mode> &input_modes,
29988 vec<const char *> &constraints,
29989 vec<rtx> &uses, vec<rtx> &clobbers,
29990 HARD_REG_SET &clobbered_regs, location_t loc)
29992 rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
29993 uses, clobbers, clobbered_regs, loc);
29995 /* "za" in the clobber list of a function with ZA state is defined to
29996 mean that the asm can read from and write to ZA. We can model the
29997 read using a USE, but unfortunately, it's not possible to model the
29998 write directly. Use a separate insn to model the effect.
30000 We must ensure that ZA is active on entry, which is enforced by using
30001 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
30003 The same thing applies to ZT0. */
30004 if (TARGET_ZA)
30005 for (unsigned int i = clobbers.length (); i-- > 0; )
30007 rtx x = clobbers[i];
30008 if (REG_P (x)
30009 && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
30011 auto id = cfun->machine->next_asm_update_za_id++;
30013 start_sequence ();
30014 if (seq)
30015 emit_insn (seq);
30016 rtx id_rtx = gen_int_mode (id, SImode);
30017 emit_insn (REGNO (x) == ZA_REGNUM
30018 ? gen_aarch64_asm_update_za (id_rtx)
30019 : gen_aarch64_asm_update_zt0 (id_rtx));
30020 seq = get_insns ();
30021 end_sequence ();
30023 auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
30024 uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
30025 uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
30027 clobbers.ordered_remove (i);
30028 CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
30031 return seq;
30034 /* BB is the target of an exception or nonlocal goto edge, which means
30035 that PSTATE.SM is known to be 0 on entry. Put it into the state that
30036 the current function requires. */
30038 static bool
30039 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb)
30041 if (TARGET_NON_STREAMING)
30042 return false;
30044 start_sequence ();
30045 rtx_insn *guard_label = nullptr;
30046 if (TARGET_STREAMING_COMPATIBLE)
30047 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30048 AARCH64_FL_SM_OFF);
30049 aarch64_sme_mode_switch_regs args_switch;
30050 args_switch.add_call_preserved_regs (df_get_live_in (bb));
30051 args_switch.emit_prologue ();
30052 aarch64_switch_pstate_sm (AARCH64_FL_SM_OFF, AARCH64_FL_SM_ON);
30053 args_switch.emit_epilogue ();
30054 if (guard_label)
30055 emit_label (guard_label);
30056 auto seq = get_insns ();
30057 end_sequence ();
30059 emit_insn_after (seq, bb_note (bb));
30060 return true;
30063 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
30064 so arrange to make it so. */
30066 static bool
30067 aarch64_switch_pstate_sm_for_jump (rtx_insn *jump)
30069 if (TARGET_NON_STREAMING)
30070 return false;
30072 start_sequence ();
30073 rtx_insn *guard_label = nullptr;
30074 if (TARGET_STREAMING_COMPATIBLE)
30075 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30076 AARCH64_FL_SM_OFF);
30077 aarch64_switch_pstate_sm (AARCH64_FL_SM_ON, AARCH64_FL_SM_OFF);
30078 if (guard_label)
30079 emit_label (guard_label);
30080 auto seq = get_insns ();
30081 end_sequence ();
30083 emit_insn_before (seq, jump);
30084 return true;
30087 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30088 to switch to the new mode and the instructions needed to restore the
30089 original mode. Return true if something changed. */
30090 static bool
30091 aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
30093 /* Mode switches for sibling calls are handled via the epilogue. */
30094 if (SIBLING_CALL_P (call))
30095 return false;
30097 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
30098 if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
30099 return false;
30101 /* Switch mode before the call, preserving any argument registers
30102 across the switch. */
30103 start_sequence ();
30104 rtx_insn *args_guard_label = nullptr;
30105 if (TARGET_STREAMING_COMPATIBLE)
30106 args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30107 callee_isa_mode);
30108 aarch64_sme_mode_switch_regs args_switch;
30109 args_switch.add_call_args (call);
30110 args_switch.emit_prologue ();
30111 aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
30112 args_switch.emit_epilogue ();
30113 if (args_guard_label)
30114 emit_label (args_guard_label);
30115 auto args_seq = get_insns ();
30116 end_sequence ();
30117 emit_insn_before (args_seq, call);
30119 if (find_reg_note (call, REG_NORETURN, NULL_RTX))
30120 return true;
30122 /* Switch mode after the call, preserving any return registers across
30123 the switch. */
30124 start_sequence ();
30125 rtx_insn *return_guard_label = nullptr;
30126 if (TARGET_STREAMING_COMPATIBLE)
30127 return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30128 callee_isa_mode);
30129 aarch64_sme_mode_switch_regs return_switch;
30130 return_switch.add_call_result (call);
30131 return_switch.emit_prologue ();
30132 aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
30133 return_switch.emit_epilogue ();
30134 if (return_guard_label)
30135 emit_label (return_guard_label);
30136 auto result_seq = get_insns ();
30137 end_sequence ();
30138 emit_insn_after (result_seq, call);
30139 return true;
30142 namespace {
30144 const pass_data pass_data_switch_pstate_sm =
30146 RTL_PASS, // type
30147 "smstarts", // name
30148 OPTGROUP_NONE, // optinfo_flags
30149 TV_NONE, // tv_id
30150 0, // properties_required
30151 0, // properties_provided
30152 0, // properties_destroyed
30153 0, // todo_flags_start
30154 TODO_df_finish, // todo_flags_finish
30157 class pass_switch_pstate_sm : public rtl_opt_pass
30159 public:
30160 pass_switch_pstate_sm (gcc::context *ctxt)
30161 : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
30164 // opt_pass methods:
30165 bool gate (function *) override final;
30166 unsigned int execute (function *) override final;
30169 bool
30170 pass_switch_pstate_sm::gate (function *fn)
30172 return (aarch64_fndecl_pstate_sm (fn->decl) != AARCH64_FL_SM_OFF
30173 || cfun->machine->call_switches_pstate_sm);
30176 /* Emit any instructions needed to switch PSTATE.SM. */
30177 unsigned int
30178 pass_switch_pstate_sm::execute (function *fn)
30180 basic_block bb;
30182 auto_sbitmap blocks (last_basic_block_for_fn (cfun));
30183 bitmap_clear (blocks);
30184 FOR_EACH_BB_FN (bb, fn)
30186 if (has_abnormal_call_or_eh_pred_edge_p (bb)
30187 && aarch64_switch_pstate_sm_for_landing_pad (bb))
30188 bitmap_set_bit (blocks, bb->index);
30190 if (cfun->machine->call_switches_pstate_sm)
30192 rtx_insn *insn;
30193 FOR_BB_INSNS (bb, insn)
30194 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
30195 if (aarch64_switch_pstate_sm_for_call (call))
30196 bitmap_set_bit (blocks, bb->index);
30199 auto end = BB_END (bb);
30200 if (JUMP_P (end)
30201 && find_reg_note (end, REG_NON_LOCAL_GOTO, NULL_RTX)
30202 && aarch64_switch_pstate_sm_for_jump (end))
30203 bitmap_set_bit (blocks, bb->index);
30205 find_many_sub_basic_blocks (blocks);
30206 clear_aux_for_blocks ();
30207 return 0;
30212 rtl_opt_pass *
30213 make_pass_switch_pstate_sm (gcc::context *ctxt)
30215 return new pass_switch_pstate_sm (ctxt);
30218 /* Parse an implementation-defined system register name of
30219 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30220 Return true if REGNAME matches the above pattern, false
30221 otherwise. */
30222 bool
30223 aarch64_is_implem_def_reg (const char *regname)
30225 unsigned pos = 0;
30226 unsigned name_len = strlen (regname);
30227 if (name_len < 12 || name_len > 14)
30228 return false;
30230 auto cterm_valid_p = [&]()
30232 bool leading_zero_p = false;
30233 unsigned i = 0;
30234 char n[3] = {0};
30236 if (regname[pos] != 'c')
30237 return false;
30238 pos++;
30239 while (regname[pos] != '_')
30241 if (leading_zero_p)
30242 return false;
30243 if (i == 0 && regname[pos] == '0')
30244 leading_zero_p = true;
30245 if (i > 2)
30246 return false;
30247 if (!ISDIGIT (regname[pos]))
30248 return false;
30249 n[i++] = regname[pos++];
30251 if (atoi (n) > 15)
30252 return false;
30253 return true;
30256 if (regname[pos] != 's')
30257 return false;
30258 pos++;
30259 if (regname[pos] < '0' || regname[pos] > '3')
30260 return false;
30261 pos++;
30262 if (regname[pos++] != '_')
30263 return false;
30264 if (regname[pos] < '0' || regname[pos] > '7')
30265 return false;
30266 pos++;
30267 if (regname[pos++] != '_')
30268 return false;
30269 if (!cterm_valid_p ())
30270 return false;
30271 if (regname[pos++] != '_')
30272 return false;
30273 if (!cterm_valid_p ())
30274 return false;
30275 if (regname[pos++] != '_')
30276 return false;
30277 if (regname[pos] < '0' || regname[pos] > '7')
30278 return false;
30279 return true;
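/* For example, "s3_0_c13_c0_2" is accepted, while "s4_0_c13_c0_2" (op0 out
   of range), "s3_0_c16_c0_2" (CRn greater than 15) and "s3_0_c013_c0_2"
   (leading zero) are all rejected.  */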
30282 /* Return true if REGNAME matches either a known permitted system
30283 register name, or a generic sysreg specification. For use in
30284 back-end predicate `aarch64_sysreg_string'. */
30285 bool
30286 aarch64_valid_sysreg_name_p (const char *regname)
30288 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30289 if (sysreg == NULL)
30290 return aarch64_is_implem_def_reg (regname);
30291 if (sysreg->arch_reqs)
30292 return (aarch64_isa_flags & sysreg->arch_reqs);
30293 return true;
30296 /* Return the generic sysreg specification for a valid system register
30297 name, otherwise NULL. WRITE_P is true iff the register is being
30298 written to. IS128OP indicates the requested system register should
30299 be checked for a 128-bit implementation. */
30300 const char *
30301 aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
30302 {
30303   const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30304   if (sysreg == NULL)
30305     {
30306       if (aarch64_is_implem_def_reg (regname))
30307         return regname;
30308       else
30309         return NULL;
30310     }
30311   if (is128op && !(sysreg->properties & F_REG_128))
30312     return NULL;
30313   if ((write_p && (sysreg->properties & F_REG_READ))
30314       || (!write_p && (sysreg->properties & F_REG_WRITE)))
30315     return NULL;
30316   if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
30317     return NULL;
30318   return sysreg->encoding;
30319 }
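/* A sketch of how the checks above combine, assuming F_REG_READ and
   F_REG_WRITE mark read-only and write-only registers respectively:

     - IS128OP requested but the register lacks F_REG_128        -> NULL
     - write access (WRITE_P) to an F_REG_READ register          -> NULL
     - read access (!WRITE_P) of an F_REG_WRITE register         -> NULL
     - architecture requirements missing from aarch64_isa_flags  -> NULL
     - otherwise the table encoding is returned, or REGNAME itself when it
       is an implementation-defined name accepted by
       aarch64_is_implem_def_reg.  */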
30321 /* Target-specific selftests. */
30323 #if CHECKING_P
30325 namespace selftest {
30327 /* Selftest for the RTL loader.
30328 Verify that the RTL loader copes with a dump from
30329 print_rtx_function. This is essentially just a test that class
30330 function_reader can handle a real dump, but it also verifies
30331 that lookup_reg_by_dump_name correctly handles hard regs.
30332 The presence of hard reg names in the dump means that the test is
30333 target-specific, hence it is in this file. */
30335 static void
30336 aarch64_test_loading_full_dump ()
30337 {
30338   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
30340   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
30342   rtx_insn *insn_1 = get_insn_by_uid (1);
30343   ASSERT_EQ (NOTE, GET_CODE (insn_1));
30345   rtx_insn *insn_15 = get_insn_by_uid (15);
30346   ASSERT_EQ (INSN, GET_CODE (insn_15));
30347   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
30349   /* Verify crtl->return_rtx. */
30350   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
30351   ASSERT_EQ (0, REGNO (crtl->return_rtx));
30352   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
30353 }
30355 /* Test the fractional_cost class. */
30357 static void
30358 aarch64_test_fractional_cost ()
30359 {
30360 using cf = fractional_cost;
30362 ASSERT_EQ (cf (0, 20), 0);
30364 ASSERT_EQ (cf (4, 2), 2);
30365 ASSERT_EQ (3, cf (9, 3));
30367 ASSERT_NE (cf (5, 2), 2);
30368 ASSERT_NE (3, cf (8, 3));
30370 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30371 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30372 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30374 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30375 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30376 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30377 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30378 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30379 ASSERT_EQ (3 - cf (10, 3), 0);
30381 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30382 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30384 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30385 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30386 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30387 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30388 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30389 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30390 ASSERT_TRUE (cf (239, 240) <= 1);
30391 ASSERT_TRUE (cf (240, 240) <= 1);
30392 ASSERT_FALSE (cf (241, 240) <= 1);
30393 ASSERT_FALSE (2 <= cf (207, 104));
30394 ASSERT_TRUE (2 <= cf (208, 104));
30395 ASSERT_TRUE (2 <= cf (209, 104));
30397 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30398 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30399 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30400 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30401 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30402 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30403 ASSERT_TRUE (cf (239, 240) < 1);
30404 ASSERT_FALSE (cf (240, 240) < 1);
30405 ASSERT_FALSE (cf (241, 240) < 1);
30406 ASSERT_FALSE (2 < cf (207, 104));
30407 ASSERT_FALSE (2 < cf (208, 104));
30408 ASSERT_TRUE (2 < cf (209, 104));
30410 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30411 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30412 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30413 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30414 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30415 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30416 ASSERT_FALSE (cf (239, 240) >= 1);
30417 ASSERT_TRUE (cf (240, 240) >= 1);
30418 ASSERT_TRUE (cf (241, 240) >= 1);
30419 ASSERT_TRUE (2 >= cf (207, 104));
30420 ASSERT_TRUE (2 >= cf (208, 104));
30421 ASSERT_FALSE (2 >= cf (209, 104));
30423 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30424 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30425 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30426 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30427 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30428 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30429 ASSERT_FALSE (cf (239, 240) > 1);
30430 ASSERT_FALSE (cf (240, 240) > 1);
30431 ASSERT_TRUE (cf (241, 240) > 1);
30432 ASSERT_TRUE (2 > cf (207, 104));
30433 ASSERT_FALSE (2 > cf (208, 104));
30434 ASSERT_FALSE (2 > cf (209, 104));
30436 ASSERT_EQ (cf (1, 2).ceil (), 1);
30437 ASSERT_EQ (cf (11, 7).ceil (), 2);
30438 ASSERT_EQ (cf (20, 1).ceil (), 20);
30439 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30440 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30441 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30442 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30443 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30445 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
30446 }
30448 /* Check whether our system register data, as imported from
30449    `aarch64-sys-reg.def', has any duplicate entries.  */
30450 static void
30451 aarch64_test_sysreg_encoding_clashes (void)
30452 {
30453   using dup_instances_t = hash_map<nofree_string_hash,
30454                                    std::vector<const sysreg_t*>>;
30456   dup_instances_t duplicate_instances;
30458   /* Every time an encoding is found to occur more than once, we add it
30459      to a "clash-analysis queue", which is then used to extract the
30460      necessary information from our hash map when deciding whether
30461      repeated encodings are valid.  */
30463   /* 1) Collect recurrence information.  */
30464   for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
30465     {
30466       const sysreg_t *reg = aarch64_sysregs + i;
30468       std::vector<const sysreg_t*> *tmp
30469         = &duplicate_instances.get_or_insert (reg->encoding);
30471       tmp->push_back (reg);
30472     }
30474   /* 2) Carry out analysis on collected data.  */
30475   for (auto instance : duplicate_instances)
30476     {
30477       unsigned nrep = instance.second.size ();
30478       if (nrep > 1)
30479         for (unsigned i = 0; i < nrep; i++)
30480           for (unsigned j = i + 1; j < nrep; j++)
30481             {
30482               const sysreg_t *a = instance.second[i];
30483               const sysreg_t *b = instance.second[j];
30484               ASSERT_TRUE ((a->properties != b->properties)
30485                            || (a->arch_reqs != b->arch_reqs));
30486             }
30487     }
30488 }
30490 /* Run all target-specific selftests. */
30492 static void
30493 aarch64_run_selftests (void)
30494 {
30495   aarch64_test_loading_full_dump ();
30496   aarch64_test_fractional_cost ();
30497   aarch64_test_sysreg_encoding_clashes ();
30498 }
30500 } // namespace selftest
30502 #endif /* #if CHECKING_P */
30504 #undef TARGET_STACK_PROTECT_GUARD
30505 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30507 #undef TARGET_ADDRESS_COST
30508 #define TARGET_ADDRESS_COST aarch64_address_cost
30510 /* This hook determines whether unnamed bitfields affect the alignment
30511    of the containing structure.  The hook returns true if the structure
30512    should inherit the alignment requirements of an unnamed bitfield's
30513    type.  */
30514 #undef TARGET_ALIGN_ANON_BITFIELD
30515 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
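/* For illustration (a sketch assuming the AAPCS64 rule that a bit-field's
   declared type contributes to the alignment of its enclosing aggregate):
   with this hook returning true, a type such as

     struct s { char c; long long : 0; };

   picks up the 8-byte alignment of the unnamed bit-field's type rather
   than the 1-byte alignment of its only named member.  */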
30517 #undef TARGET_ASM_ALIGNED_DI_OP
30518 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30520 #undef TARGET_ASM_ALIGNED_HI_OP
30521 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30523 #undef TARGET_ASM_ALIGNED_SI_OP
30524 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30526 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30527 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30528 hook_bool_const_tree_hwi_hwi_const_tree_true
30530 #undef TARGET_ASM_FILE_START
30531 #define TARGET_ASM_FILE_START aarch64_start_file
30533 #undef TARGET_ASM_OUTPUT_MI_THUNK
30534 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30536 #undef TARGET_ASM_SELECT_RTX_SECTION
30537 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30539 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30540 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30542 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30543 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30545 #undef TARGET_BUILD_BUILTIN_VA_LIST
30546 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30548 #undef TARGET_CALLEE_COPIES
30549 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30551 #undef TARGET_FRAME_POINTER_REQUIRED
30552 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30554 #undef TARGET_CAN_ELIMINATE
30555 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30557 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30558 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30559 aarch64_function_attribute_inlinable_p
30561 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30562 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30564 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30565 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30567 #undef TARGET_CAN_INLINE_P
30568 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30570 #undef TARGET_CANNOT_FORCE_CONST_MEM
30571 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30573 #undef TARGET_CASE_VALUES_THRESHOLD
30574 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30576 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30577 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30579 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30580 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30582 /* Only the least significant bit is used for initialization guard
30583 variables. */
30584 #undef TARGET_CXX_GUARD_MASK_BIT
30585 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
30587 #undef TARGET_C_MODE_FOR_SUFFIX
30588 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30590 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30591 #undef TARGET_DEFAULT_TARGET_FLAGS
30592 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30593 #endif
30595 #undef TARGET_CLASS_MAX_NREGS
30596 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30598 #undef TARGET_BUILTIN_DECL
30599 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30601 #undef TARGET_BUILTIN_RECIPROCAL
30602 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30604 #undef TARGET_C_EXCESS_PRECISION
30605 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30607 #undef TARGET_C_BITINT_TYPE_INFO
30608 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
30610 #undef TARGET_C_MODE_FOR_FLOATING_TYPE
30611 #define TARGET_C_MODE_FOR_FLOATING_TYPE aarch64_c_mode_for_floating_type
30613 #undef TARGET_EXPAND_BUILTIN
30614 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30616 #undef TARGET_EXPAND_BUILTIN_VA_START
30617 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30619 #undef TARGET_FOLD_BUILTIN
30620 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30622 #undef TARGET_FUNCTION_ARG
30623 #define TARGET_FUNCTION_ARG aarch64_function_arg
30625 #undef TARGET_FUNCTION_ARG_ADVANCE
30626 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30628 #undef TARGET_FUNCTION_ARG_BOUNDARY
30629 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30631 #undef TARGET_FUNCTION_ARG_PADDING
30632 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30634 #undef TARGET_GET_RAW_RESULT_MODE
30635 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30636 #undef TARGET_GET_RAW_ARG_MODE
30637 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30639 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30640 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30642 #undef TARGET_FUNCTION_VALUE
30643 #define TARGET_FUNCTION_VALUE aarch64_function_value
30645 #undef TARGET_FUNCTION_VALUE_REGNO_P
30646 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30648 #undef TARGET_START_CALL_ARGS
30649 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30651 #undef TARGET_END_CALL_ARGS
30652 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30654 #undef TARGET_GIMPLE_FOLD_BUILTIN
30655 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30657 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30658 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30660 #undef TARGET_INIT_BUILTINS
30661 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30663 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30664 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30665 aarch64_ira_change_pseudo_allocno_class
30667 #undef TARGET_LEGITIMATE_ADDRESS_P
30668 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30670 #undef TARGET_LEGITIMATE_CONSTANT_P
30671 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30673 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30674 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30675 aarch64_legitimize_address_displacement
30677 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30678 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30680 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30681 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30682 aarch64_libgcc_floating_mode_supported_p
30684 #undef TARGET_MANGLE_TYPE
30685 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30687 #undef TARGET_INVALID_BINARY_OP
30688 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30690 #undef TARGET_VERIFY_TYPE_CONTEXT
30691 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30693 #undef TARGET_MEMORY_MOVE_COST
30694 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30696 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30697 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30699 #undef TARGET_MUST_PASS_IN_STACK
30700 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30702 /* This target hook should return true if accesses to volatile bitfields
30703 should use the narrowest mode possible. It should return false if these
30704 accesses should use the bitfield container type. */
30705 #undef TARGET_NARROW_VOLATILE_BITFIELD
30706 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
30708 #undef TARGET_OPTION_OVERRIDE
30709 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30711 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30712 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30713 aarch64_override_options_after_change
30715 #undef TARGET_OFFLOAD_OPTIONS
30716 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30718 #undef TARGET_OPTION_RESTORE
30719 #define TARGET_OPTION_RESTORE aarch64_option_restore
30721 #undef TARGET_OPTION_PRINT
30722 #define TARGET_OPTION_PRINT aarch64_option_print
30724 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30725 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30727 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30728 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30729 aarch64_option_valid_version_attribute_p
30731 #undef TARGET_SET_CURRENT_FUNCTION
30732 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30734 #undef TARGET_PASS_BY_REFERENCE
30735 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30737 #undef TARGET_PREFERRED_RELOAD_CLASS
30738 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30740 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30741 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30743 #undef TARGET_DWARF_FRAME_REG_MODE
30744 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30746 #undef TARGET_PROMOTED_TYPE
30747 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30749 #undef TARGET_SECONDARY_RELOAD
30750 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30752 #undef TARGET_SECONDARY_MEMORY_NEEDED
30753 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30755 #undef TARGET_SHIFT_TRUNCATION_MASK
30756 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30758 #undef TARGET_SETUP_INCOMING_VARARGS
30759 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30761 #undef TARGET_STRUCT_VALUE_RTX
30762 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30764 #undef TARGET_REGISTER_MOVE_COST
30765 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30767 #undef TARGET_RETURN_IN_MEMORY
30768 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30770 #undef TARGET_RETURN_IN_MSB
30771 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30773 #undef TARGET_RTX_COSTS
30774 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30776 #undef TARGET_INSN_COST
30777 #define TARGET_INSN_COST aarch64_insn_cost
30779 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30780 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30782 #undef TARGET_SCHED_ISSUE_RATE
30783 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
30785 #undef TARGET_SCHED_VARIABLE_ISSUE
30786 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
30788 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
30789 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
30790 aarch64_sched_first_cycle_multipass_dfa_lookahead
30792 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
30793 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
30794 aarch64_first_cycle_multipass_dfa_lookahead_guard
30796 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
30797 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
30798 aarch64_get_separate_components
30800 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
30801 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
30802 aarch64_components_for_bb
30804 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
30805 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
30806 aarch64_disqualify_components
30808 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
30809 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
30810 aarch64_emit_prologue_components
30812 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
30813 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
30814 aarch64_emit_epilogue_components
30816 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
30817 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
30818 aarch64_set_handled_components
30820 #undef TARGET_TRAMPOLINE_INIT
30821 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
30823 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
30824 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
30826 #undef TARGET_VECTOR_MODE_SUPPORTED_P
30827 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
30829 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
30830 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
30832 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
30833 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
30835 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
30836 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
30837 aarch64_builtin_support_vector_misalignment
30839 #undef TARGET_ARRAY_MODE
30840 #define TARGET_ARRAY_MODE aarch64_array_mode
30842 #undef TARGET_ARRAY_MODE_SUPPORTED_P
30843 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
30845 #undef TARGET_VECTORIZE_CREATE_COSTS
30846 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
30848 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
30849 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
30850 aarch64_builtin_vectorization_cost
30852 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
30853 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
30855 #undef TARGET_VECTORIZE_BUILTINS
30856 #define TARGET_VECTORIZE_BUILTINS
30858 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
30859 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
30860 aarch64_autovectorize_vector_modes
30862 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
30863 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
30864 aarch64_atomic_assign_expand_fenv
30866 /* Section anchor support. */
30868 #undef TARGET_MIN_ANCHOR_OFFSET
30869 #define TARGET_MIN_ANCHOR_OFFSET -256
30871 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
30872 byte offset; we can do much more for larger data types, but have no way
30873 to determine the size of the access. We assume accesses are aligned. */
30874 #undef TARGET_MAX_ANCHOR_OFFSET
30875 #define TARGET_MAX_ANCHOR_OFFSET 4095
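/* Taken together (an illustrative reading of the two limits), a section
   anchor at address A is usable for objects starting anywhere in
   [A - 256, A + 4095]: the signed 9-bit unscaled and unsigned 12-bit
   scaled byte-offset ranges of AArch64 load/store immediates.  */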
30877 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
30878 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
30879 aarch64_vectorize_preferred_div_as_shifts_over_mult
30881 #undef TARGET_VECTOR_ALIGNMENT
30882 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
30884 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
30885 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
30886 aarch64_vectorize_preferred_vector_alignment
30887 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
30888 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
30889 aarch64_simd_vector_alignment_reachable
30891 /* vec_perm support. */
30893 #undef TARGET_VECTORIZE_VEC_PERM_CONST
30894 #define TARGET_VECTORIZE_VEC_PERM_CONST \
30895 aarch64_vectorize_vec_perm_const
30897 #undef TARGET_VECTORIZE_RELATED_MODE
30898 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
30899 #undef TARGET_VECTORIZE_GET_MASK_MODE
30900 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
30901 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
30902 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
30903 aarch64_empty_mask_is_expensive
30904 #undef TARGET_PREFERRED_ELSE_VALUE
30905 #define TARGET_PREFERRED_ELSE_VALUE \
30906 aarch64_preferred_else_value
30908 #undef TARGET_INIT_LIBFUNCS
30909 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
30911 #undef TARGET_FIXED_CONDITION_CODE_REGS
30912 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
30914 #undef TARGET_FLAGS_REGNUM
30915 #define TARGET_FLAGS_REGNUM CC_REGNUM
30917 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
30918 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
30920 #undef TARGET_ASAN_SHADOW_OFFSET
30921 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
30923 #undef TARGET_LEGITIMIZE_ADDRESS
30924 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
30926 #undef TARGET_SCHED_CAN_SPECULATE_INSN
30927 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
30929 #undef TARGET_CAN_USE_DOLOOP_P
30930 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
30932 #undef TARGET_SCHED_ADJUST_PRIORITY
30933 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
30935 #undef TARGET_SCHED_MACRO_FUSION_P
30936 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
30938 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
30939 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
30941 #undef TARGET_SCHED_FUSION_PRIORITY
30942 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
30944 #undef TARGET_UNSPEC_MAY_TRAP_P
30945 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
30947 #undef TARGET_USE_PSEUDO_PIC_REG
30948 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
30950 #undef TARGET_PRINT_OPERAND
30951 #define TARGET_PRINT_OPERAND aarch64_print_operand
30953 #undef TARGET_PRINT_OPERAND_ADDRESS
30954 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
30956 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
30957 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
30959 #undef TARGET_OPTAB_SUPPORTED_P
30960 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
30962 #undef TARGET_OMIT_STRUCT_RETURN_REG
30963 #define TARGET_OMIT_STRUCT_RETURN_REG true
30965 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
30966 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
30967 aarch64_dwarf_poly_indeterminate_value
30969 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors.  */
30970 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
30971 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
30973 #undef TARGET_HARD_REGNO_NREGS
30974 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
30975 #undef TARGET_HARD_REGNO_MODE_OK
30976 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
30978 #undef TARGET_MODES_TIEABLE_P
30979 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
30981 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
30982 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
30983 aarch64_hard_regno_call_part_clobbered
30985 #undef TARGET_INSN_CALLEE_ABI
30986 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
30988 #undef TARGET_CONSTANT_ALIGNMENT
30989 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
30991 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
30992 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
30993 aarch64_stack_clash_protection_alloca_probe_range
30995 #undef TARGET_COMPUTE_PRESSURE_CLASSES
30996 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
30998 #undef TARGET_CAN_CHANGE_MODE_CLASS
30999 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
31001 #undef TARGET_SELECT_EARLY_REMAT_MODES
31002 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
31004 #undef TARGET_SPECULATION_SAFE_VALUE
31005 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
31007 #undef TARGET_ESTIMATED_POLY_VALUE
31008 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
31010 #undef TARGET_ATTRIBUTE_TABLE
31011 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
31013 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
31014 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
31015 aarch64_simd_clone_compute_vecsize_and_simdlen
31017 #undef TARGET_SIMD_CLONE_ADJUST
31018 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
31020 #undef TARGET_SIMD_CLONE_USABLE
31021 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
31023 #undef TARGET_COMP_TYPE_ATTRIBUTES
31024 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
31026 #undef TARGET_MERGE_DECL_ATTRIBUTES
31027 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
31029 #undef TARGET_GET_MULTILIB_ABI_NAME
31030 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
31032 #undef TARGET_FNTYPE_ABI
31033 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
31035 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
31036 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
31038 #if CHECKING_P
31039 #undef TARGET_RUN_TARGET_SELFTESTS
31040 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
31041 #endif /* #if CHECKING_P */
31043 #undef TARGET_ASM_POST_CFI_STARTPROC
31044 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
31046 #undef TARGET_STRICT_ARGUMENT_NAMING
31047 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
31049 #undef TARGET_MODE_EMIT
31050 #define TARGET_MODE_EMIT aarch64_mode_emit
31052 #undef TARGET_MODE_NEEDED
31053 #define TARGET_MODE_NEEDED aarch64_mode_needed
31055 #undef TARGET_MODE_AFTER
31056 #define TARGET_MODE_AFTER aarch64_mode_after
31058 #undef TARGET_MODE_CONFLUENCE
31059 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
31061 #undef TARGET_MODE_BACKPROP
31062 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
31064 #undef TARGET_MODE_ENTRY
31065 #define TARGET_MODE_ENTRY aarch64_mode_entry
31067 #undef TARGET_MODE_EXIT
31068 #define TARGET_MODE_EXIT aarch64_mode_exit
31070 #undef TARGET_MODE_EH_HANDLER
31071 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
31073 #undef TARGET_MODE_PRIORITY
31074 #define TARGET_MODE_PRIORITY aarch64_mode_priority
31076 #undef TARGET_MD_ASM_ADJUST
31077 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31079 #undef TARGET_ASM_FILE_END
31080 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31082 #undef TARGET_ASM_FUNCTION_EPILOGUE
31083 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31085 #undef TARGET_HAVE_SHADOW_CALL_STACK
31086 #define TARGET_HAVE_SHADOW_CALL_STACK true
31088 #undef TARGET_CONST_ANCHOR
31089 #define TARGET_CONST_ANCHOR 0x1000000
31091 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31092 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31094 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31095 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31097 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31098 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31100 #undef TARGET_OPTION_FUNCTION_VERSIONS
31101 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31103 #undef TARGET_COMPARE_VERSION_PRIORITY
31104 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31106 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31107 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31108 aarch64_generate_version_dispatcher_body
31110 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31111 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31112 aarch64_get_function_versions_dispatcher
31114 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31115 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
31117 struct gcc_target targetm = TARGET_INITIALIZER;
31119 #include "gt-aarch64.h"