1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2024 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #define INCLUDE_VECTOR
26 #include "config.h"
27 #include "system.h"
28 #include "coretypes.h"
29 #include "backend.h"
30 #include "target.h"
31 #include "rtl.h"
32 #include "tree.h"
33 #include "memmodel.h"
34 #include "gimple.h"
35 #include "cfghooks.h"
36 #include "cfgloop.h"
37 #include "df.h"
38 #include "tm_p.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "optabs.h"
42 #include "regs.h"
43 #include "emit-rtl.h"
44 #include "recog.h"
45 #include "cgraph.h"
46 #include "diagnostic.h"
47 #include "insn-attr.h"
48 #include "alias.h"
49 #include "fold-const.h"
50 #include "stor-layout.h"
51 #include "calls.h"
52 #include "varasm.h"
53 #include "output.h"
54 #include "flags.h"
55 #include "explow.h"
56 #include "expr.h"
57 #include "reload.h"
58 #include "langhooks.h"
59 #include "opts.h"
60 #include "gimplify.h"
61 #include "dwarf2.h"
62 #include "gimple-iterator.h"
63 #include "tree-vectorizer.h"
64 #include "aarch64-cost-tables.h"
65 #include "dumpfile.h"
66 #include "builtins.h"
67 #include "rtl-iter.h"
68 #include "tm-constrs.h"
69 #include "sched-int.h"
70 #include "target-globals.h"
71 #include "common/common-target.h"
72 #include "cfgrtl.h"
73 #include "selftest.h"
74 #include "selftest-rtl.h"
75 #include "rtx-vector-builder.h"
76 #include "intl.h"
77 #include "expmed.h"
78 #include "function-abi.h"
79 #include "gimple-pretty-print.h"
80 #include "tree-ssa-loop-niter.h"
81 #include "fractional-cost.h"
82 #include "rtlanal.h"
83 #include "tree-dfa.h"
84 #include "asan.h"
85 #include "aarch64-feature-deps.h"
86 #include "config/arm/aarch-common.h"
87 #include "config/arm/aarch-common-protos.h"
88 #include "common/config/aarch64/cpuinfo.h"
89 #include "ssa.h"
90 #include "except.h"
91 #include "tree-pass.h"
92 #include "cfgbuild.h"
93 #include "symbol-summary.h"
94 #include "sreal.h"
95 #include "ipa-cp.h"
96 #include "ipa-prop.h"
97 #include "ipa-fnsummary.h"
98 #include "hash-map.h"
100 /* This file should be included last. */
101 #include "target-def.h"
103 /* Defined for convenience. */
104 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
106 /* Maximum bytes set for an inline memset expansion. With -Os use 3 STP
107 and 1 MOVI/DUP (same size as a call). */
108 #define MAX_SET_SIZE(speed) (speed ? 256 : 96)
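/* Editorial note, not part of the original file: a worked example of the
   -Os limit above.  Each STP of two Q registers stores 32 bytes, so the
   96-byte limit corresponds to 3 such STPs (3 * 32 = 96) plus one MOVI/DUP
   to materialize the value, which is roughly the same code size as a call
   to memset.  */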
110 /* Flags that describe how a function shares certain architectural state
111 with its callers.
113 - AARCH64_STATE_SHARED indicates that the function does share the state
114 with callers.
116 - AARCH64_STATE_IN indicates that the function reads (or might read) the
117 incoming state. The converse is that the function ignores the incoming
118 state.
120 - AARCH64_STATE_OUT indicates that the function returns new state.
121 The converse is that the state on return is the same as it was on entry.
123 A function that partially modifies the state treats it as both IN
124 and OUT (because the value on return depends to some extent on the
125 value on input). */
126 constexpr auto AARCH64_STATE_SHARED = 1U << 0;
127 constexpr auto AARCH64_STATE_IN = 1U << 1;
128 constexpr auto AARCH64_STATE_OUT = 1U << 2;
130 /* Information about a legitimate vector immediate operand. */
131 struct simd_immediate_info
133 enum insn_type { MOV, MVN, INDEX, PTRUE };
134 enum modifier_type { LSL, MSL };
136 simd_immediate_info () {}
137 simd_immediate_info (scalar_float_mode, rtx);
138 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
139 insn_type = MOV, modifier_type = LSL,
140 unsigned int = 0);
141 simd_immediate_info (scalar_mode, rtx, rtx);
142 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
144 /* The mode of the elements. */
145 scalar_mode elt_mode;
147 /* The instruction to use to move the immediate into a vector. */
148 insn_type insn;
150 union
152 /* For MOV and MVN. */
153 struct
155 /* The value of each element. */
156 rtx value;
158 /* The kind of shift modifier to use, and the number of bits to shift.
159 This is (LSL, 0) if no shift is needed. */
160 modifier_type modifier;
161 unsigned int shift;
162 } mov;
164 /* For INDEX. */
165 struct
167 /* The value of the first element and the step to be added for each
168 subsequent element. */
169 rtx base, step;
170 } index;
172 /* For PTRUE. */
173 aarch64_svpattern pattern;
174 } u;
177 /* Construct a floating-point immediate in which each element has mode
178 ELT_MODE_IN and value VALUE_IN. */
179 inline simd_immediate_info
180 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
181 : elt_mode (elt_mode_in), insn (MOV)
183 u.mov.value = value_in;
184 u.mov.modifier = LSL;
185 u.mov.shift = 0;
188 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
189 and value VALUE_IN. The other parameters are as for the structure
190 fields. */
191 inline simd_immediate_info
192 ::simd_immediate_info (scalar_int_mode elt_mode_in,
193 unsigned HOST_WIDE_INT value_in,
194 insn_type insn_in, modifier_type modifier_in,
195 unsigned int shift_in)
196 : elt_mode (elt_mode_in), insn (insn_in)
198 u.mov.value = gen_int_mode (value_in, elt_mode_in);
199 u.mov.modifier = modifier_in;
200 u.mov.shift = shift_in;
203 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
204 and where element I is equal to BASE_IN + I * STEP_IN. */
205 inline simd_immediate_info
206 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
207 : elt_mode (elt_mode_in), insn (INDEX)
209 u.index.base = base_in;
210 u.index.step = step_in;
213 /* Construct a predicate that controls elements of mode ELT_MODE_IN
214 and has PTRUE pattern PATTERN_IN. */
215 inline simd_immediate_info
216 ::simd_immediate_info (scalar_int_mode elt_mode_in,
217 aarch64_svpattern pattern_in)
218 : elt_mode (elt_mode_in), insn (PTRUE)
220 u.pattern = pattern_in;
223 namespace {
225 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
226 class pure_scalable_type_info
228 public:
229 /* Represents the result of analyzing a type. All values are nonzero,
230 in the possibly forlorn hope that accidental conversions to bool
231 trigger a warning. */
232 enum analysis_result
234 /* The type does not have an ABI identity; i.e. it doesn't contain
235 at least one object whose type is a Fundamental Data Type. */
236 NO_ABI_IDENTITY = 1,
238 /* The type is definitely a Pure Scalable Type. */
239 IS_PST,
241 /* The type is definitely not a Pure Scalable Type. */
242 ISNT_PST,
244 /* It doesn't matter for PCS purposes whether the type is a Pure
245 Scalable Type or not, since the type will be handled the same
246 way regardless.
248 Specifically, this means that if the type is a Pure Scalable Type,
249 there aren't enough argument registers to hold it, and so it will
250 need to be passed or returned in memory. If the type isn't a
251 Pure Scalable Type, it's too big to be passed or returned in core
252 or SIMD&FP registers, and so again will need to go in memory. */
253 DOESNT_MATTER
256 /* Aggregates of 17 bytes or more are normally passed and returned
257 in memory, so aggregates of that size can safely be analyzed as
258 DOESNT_MATTER. We need to be able to collect enough pieces to
259 represent a PST that is smaller than that. Since predicates are
260 2 bytes in size for -msve-vector-bits=128, that means we need to be
261 able to store at least 8 pieces.
263 We also need to be able to store enough pieces to represent
264 a single vector in each vector argument register and a single
265 predicate in each predicate argument register. This means that
266 we need at least 12 pieces. */
267 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
268 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
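/* Editorial note, not part of the original file: with the AAPCS64 vector
   argument registers z0-z7 and predicate argument registers p0-p3,
   NUM_FP_ARG_REGS is 8 and NUM_PR_ARG_REGS is 4, so MAX_PIECES evaluates
   to 12, which satisfies both requirements above (at least 8 predicate
   pieces, and one piece per argument register).  */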
270 /* Describes one piece of a PST. Each piece is one of:
272 - a single Scalable Vector Type (SVT)
273 - a single Scalable Predicate Type (SPT)
274 - a PST containing 2, 3 or 4 SVTs, with no padding
276 It either represents a single built-in type or a PST formed from
277 multiple homogeneous built-in types. */
278 struct piece
280 rtx get_rtx (unsigned int, unsigned int) const;
282 /* The number of vector and predicate registers that the piece
283 occupies. One of the two is always zero. */
284 unsigned int num_zr;
285 unsigned int num_pr;
287 /* The mode of the registers described above. */
288 machine_mode mode;
290 /* If this piece is formed from multiple homogeneous built-in types,
291 this is the mode of the built-in types, otherwise it is MODE. */
292 machine_mode orig_mode;
294 /* The offset in bytes of the piece from the start of the type. */
295 poly_uint64 offset;
298 /* Divides types analyzed as IS_PST into individual pieces. The pieces
299 are in memory order. */
300 auto_vec<piece, MAX_PIECES> pieces;
302 unsigned int num_zr () const;
303 unsigned int num_pr () const;
305 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
307 analysis_result analyze (const_tree);
308 bool analyze_registers (const_tree);
310 private:
311 analysis_result analyze_array (const_tree);
312 analysis_result analyze_record (const_tree);
313 void add_piece (const piece &);
317 /* The current code model. */
318 enum aarch64_code_model aarch64_cmodel;
320 enum aarch64_tp_reg aarch64_tpidr_register;
322 /* The number of 64-bit elements in an SVE vector. */
323 poly_uint16 aarch64_sve_vg;
325 #ifdef HAVE_AS_TLS
326 #undef TARGET_HAVE_TLS
327 #define TARGET_HAVE_TLS 1
328 #endif
330 static bool aarch64_composite_type_p (const_tree, machine_mode);
331 static bool aarch64_return_in_memory_1 (const_tree);
332 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
333 const_tree,
334 machine_mode *, int *,
335 bool *, bool);
336 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
337 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
338 static void aarch64_override_options_after_change (void);
339 static bool aarch64_vector_mode_supported_p (machine_mode);
340 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
341 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
342 const_tree type,
343 int misalignment,
344 bool is_packed);
345 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
346 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
347 aarch64_addr_query_type);
349 /* The processor for which instructions should be scheduled. */
350 enum aarch64_processor aarch64_tune = cortexa53;
352 /* Mask to specify which instruction scheduling options should be used. */
353 uint64_t aarch64_tune_flags = 0;
355 /* Global flag for PC relative loads. */
356 bool aarch64_pcrelative_literal_loads;
358 /* Global flag for whether frame pointer is enabled. */
359 bool aarch64_use_frame_pointer;
361 /* Support for command line parsing of boolean flags in the tuning
362 structures. */
363 struct aarch64_flag_desc
365 const char* name;
366 unsigned int flag;
369 #define AARCH64_FUSION_PAIR(name, internal_name) \
370 { name, AARCH64_FUSE_##internal_name },
371 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
373 { "none", AARCH64_FUSE_NOTHING },
374 #include "aarch64-fusion-pairs.def"
375 { "all", AARCH64_FUSE_ALL },
376 { NULL, AARCH64_FUSE_NOTHING }
379 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
380 { name, AARCH64_EXTRA_TUNE_##internal_name },
381 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
383 { "none", AARCH64_EXTRA_TUNE_NONE },
384 #include "aarch64-tuning-flags.def"
385 { "all", AARCH64_EXTRA_TUNE_ALL },
386 { NULL, AARCH64_EXTRA_TUNE_NONE }
389 /* Tuning parameters. */
390 #include "tuning_models/generic.h"
391 #include "tuning_models/generic_armv8_a.h"
392 #include "tuning_models/generic_armv9_a.h"
393 #include "tuning_models/cortexa35.h"
394 #include "tuning_models/cortexa53.h"
395 #include "tuning_models/cortexa57.h"
396 #include "tuning_models/cortexa72.h"
397 #include "tuning_models/cortexa73.h"
398 #include "tuning_models/exynosm1.h"
399 #include "tuning_models/thunderxt88.h"
400 #include "tuning_models/thunderx.h"
401 #include "tuning_models/tsv110.h"
402 #include "tuning_models/xgene1.h"
403 #include "tuning_models/emag.h"
404 #include "tuning_models/qdf24xx.h"
405 #include "tuning_models/saphira.h"
406 #include "tuning_models/thunderx2t99.h"
407 #include "tuning_models/thunderx3t110.h"
408 #include "tuning_models/neoversen1.h"
409 #include "tuning_models/ampere1.h"
410 #include "tuning_models/ampere1a.h"
411 #include "tuning_models/ampere1b.h"
412 #include "tuning_models/neoversev1.h"
413 #include "tuning_models/neoverse512tvb.h"
414 #include "tuning_models/neoversen2.h"
415 #include "tuning_models/neoversev2.h"
416 #include "tuning_models/a64fx.h"
418 /* Support for fine-grained override of the tuning structures. */
419 struct aarch64_tuning_override_function
421 const char* name;
422 void (*parse_override)(const char*, struct tune_params*);
425 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
426 static void aarch64_parse_tune_string (const char*, struct tune_params*);
427 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
429 static const struct aarch64_tuning_override_function
430 aarch64_tuning_override_functions[] =
432 { "fuse", aarch64_parse_fuse_string },
433 { "tune", aarch64_parse_tune_string },
434 { "sve_width", aarch64_parse_sve_width_string },
435 { NULL, NULL }
438 /* A processor implementing AArch64. */
439 struct processor
441 const char *name;
442 aarch64_processor ident;
443 aarch64_processor sched_core;
444 aarch64_arch arch;
445 aarch64_feature_flags flags;
446 const tune_params *tune;
449 /* Architectures implementing AArch64. */
450 static CONSTEXPR const processor all_architectures[] =
452 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
453 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
454 feature_deps::ARCH_IDENT ().enable, NULL},
455 #include "aarch64-arches.def"
456 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
459 /* Processor cores implementing AArch64. */
460 static const struct processor all_cores[] =
462 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
463 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
464 feature_deps::cpu_##IDENT, &COSTS##_tunings},
465 #include "aarch64-cores.def"
466 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
468 /* Internal representation of system registers. */
469 typedef struct {
470 const char *name;
471 /* Stringified sysreg encoding values, represented as
472 s<sn>_<op1>_c<cn>_c<cm>_<op2>. */
473 const char *encoding;
474 /* Flags affecting sysreg usage, such as read/write-only. */
475 unsigned properties;
476 /* Architectural features implied by sysreg. */
477 aarch64_feature_flags arch_reqs;
478 } sysreg_t;
480 /* An aarch64_feature_set initializer for a single feature,
481 AARCH64_FEATURE_<FEAT>. */
482 #define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
484 /* Used by AARCH64_FEATURES. */
485 #define AARCH64_OR_FEATURES_1(X, F1) \
486 AARCH64_FEATURE (F1)
487 #define AARCH64_OR_FEATURES_2(X, F1, F2) \
488 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
489 #define AARCH64_OR_FEATURES_3(X, F1, ...) \
490 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
492 /* An aarch64_feature_set initializer for the N features listed in "...". */
493 #define AARCH64_FEATURES(N, ...) \
494 AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
496 #define AARCH64_NO_FEATURES 0
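/* Editorial sketch, not part of the original file; the feature names are
   purely illustrative.  The macros above simply OR together the
   corresponding AARCH64_FL_* flags, e.g.:

     AARCH64_FEATURES (2, SVE, SVE2)
       => AARCH64_OR_FEATURES_2 (0, SVE, SVE2)
       => (AARCH64_FEATURE (SVE) | AARCH64_OR_FEATURES_1 (0, SVE2))
       => (AARCH64_FL_SVE | AARCH64_FL_SVE2)  */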
498 /* Flags associated with the properties of system registers. It mainly serves
499 to mark particular registers as read or write only. */
500 #define F_DEPRECATED (1 << 1)
501 #define F_REG_READ (1 << 2)
502 #define F_REG_WRITE (1 << 3)
503 #define F_ARCHEXT (1 << 4)
504 /* Flag indicating register name is alias for another system register. */
505 #define F_REG_ALIAS (1 << 5)
506 /* Flag indicating registers which may be implemented with 128 bits. */
507 #define F_REG_128 (1 << 6)
509 /* Database of system registers, their encodings and architectural
510 requirements. */
511 const sysreg_t aarch64_sysregs[] =
513 #define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
514 #define SYSREG(NAME, ENC, FLAGS, ARCH) \
515 { NAME, ENC, FLAGS, ARCH },
516 #include "aarch64-sys-regs.def"
517 #undef CPENC
520 #undef AARCH64_NO_FEATURES
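/* Editorial sketch, not part of the original file: the CPENC macro used
   above stringizes a system register encoding, for example

     CPENC (3, 3, 13, 0, 2)  =>  "s3_3_c13_c0_2"

   which is the generic s<op0>_<op1>_c<Cn>_c<Cm>_<op2> spelling accepted by
   the assembler for system registers.  */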
522 using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
523 static sysreg_map_t *sysreg_map = nullptr;
525 /* Map system register names to their hardware metadata: encoding,
526 feature flags and architectural feature requirements, all of which
527 are encoded in a sysreg_t struct. */
528 void
529 aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
531 bool dup = sysreg_map->put (name, metadata);
532 gcc_checking_assert (!dup);
535 /* Lazily initialize hash table for system register validation,
536 checking the validity of supplied register name and returning
537 register's associated metadata. */
538 static void
539 aarch64_init_sysregs (void)
541 gcc_assert (!sysreg_map);
542 sysreg_map = new sysreg_map_t;
545 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
547 const sysreg_t *reg = aarch64_sysregs + i;
548 aarch64_register_sysreg (reg->name, reg);
552 /* No direct access to the sysreg hash-map should be made. Doing so
553 risks trying to access an uninitialized hash-map, and dereferencing the
554 returned double pointer without due care risks dereferencing a
555 null pointer. */
556 const sysreg_t *
557 aarch64_lookup_sysreg_map (const char *regname)
559 if (!sysreg_map)
560 aarch64_init_sysregs ();
562 const sysreg_t **sysreg_entry = sysreg_map->get (regname);
563 if (sysreg_entry != NULL)
564 return *sysreg_entry;
565 return NULL;
568 /* The current tuning set. */
569 struct tune_params aarch64_tune_params = generic_tunings;
571 /* If NAME is the name of an arm:: attribute that describes shared state,
572 return its associated AARCH64_STATE_* flags, otherwise return 0. */
573 static unsigned int
574 aarch64_attribute_shared_state_flags (const char *name)
576 if (strcmp (name, "in") == 0)
577 return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
578 if (strcmp (name, "inout") == 0)
579 return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
580 if (strcmp (name, "out") == 0)
581 return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
582 if (strcmp (name, "preserves") == 0)
583 return AARCH64_STATE_SHARED;
584 return 0;
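/* Editorial examples, not part of the original file, of the mapping
   implemented above for the [[arm::...]] attributes handled later in this
   file:

     [[arm::in("za")]]        -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
     [[arm::out("za")]]       -> AARCH64_STATE_SHARED | AARCH64_STATE_OUT
     [[arm::inout("za")]]     -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
                                 | AARCH64_STATE_OUT
     [[arm::preserves("za")]] -> AARCH64_STATE_SHARED  */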
587 /* See whether attribute list ATTRS has any sharing information
588 for state STATE_NAME. Return the associated state flags if so,
589 otherwise return 0. */
590 static unsigned int
591 aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
593 for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
595 if (!cxx11_attribute_p (attr))
596 continue;
598 auto ns = IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr)));
599 if (strcmp (ns, "arm") != 0)
600 continue;
602 auto attr_name = IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr)));
603 auto flags = aarch64_attribute_shared_state_flags (attr_name);
604 if (!flags)
605 continue;
607 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
609 tree value = TREE_VALUE (arg);
610 if (TREE_CODE (value) == STRING_CST
611 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
612 return flags;
615 return 0;
618 /* Return true if DECL creates a new scope for state STATE_STRING. */
619 static bool
620 aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
622 if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
623 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
625 tree value = TREE_VALUE (arg);
626 if (TREE_CODE (value) == STRING_CST
627 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
628 return true;
630 return false;
633 /* Return true if attribute argument VALUE is a recognized state string,
634 otherwise report an error. NAME is the name of the attribute to which
635 VALUE is being passed. */
636 static bool
637 aarch64_check_state_string (tree name, tree value)
639 if (TREE_CODE (value) != STRING_CST)
641 error ("the arguments to %qE must be constant strings", name);
642 return false;
645 const char *state_name = TREE_STRING_POINTER (value);
646 if (strcmp (state_name, "za") != 0
647 && strcmp (state_name, "zt0") != 0)
649 error ("unrecognized state string %qs", state_name);
650 return false;
653 return true;
656 /* qsort callback to compare two STRING_CSTs. */
657 static int
658 cmp_string_csts (const void *a, const void *b)
660 return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
661 TREE_STRING_POINTER (*(const_tree const *) b));
664 /* Canonicalize a list of state strings. ARGS contains the arguments to
665 a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
666 of the same type. If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
667 arguments and drop the new attribute. Otherwise, the new attribute must
668 be kept and ARGS must include the information in OLD_ATTR.
670 In both cases, the new arguments must be a sorted list of state strings
671 with duplicates removed.
673 Return true if new attribute should be kept, false if it should be
674 dropped. */
675 static bool
676 aarch64_merge_string_arguments (tree args, tree old_attr,
677 bool can_merge_in_place)
679 /* Get a sorted list of all state strings (including duplicates). */
680 auto add_args = [](vec<tree> &strings, const_tree args)
682 for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
683 if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
684 strings.safe_push (TREE_VALUE (arg));
686 auto_vec<tree, 16> strings;
687 add_args (strings, args);
688 if (old_attr)
689 add_args (strings, TREE_VALUE (old_attr));
690 strings.qsort (cmp_string_csts);
692 /* The list can be empty if there was no previous attribute and if all
693 the new arguments are erroneous. Drop the attribute in that case. */
694 if (strings.is_empty ())
695 return false;
697 /* Destructively modify one of the argument lists, removing duplicates
698 on the fly. */
699 bool use_old_attr = old_attr && can_merge_in_place;
700 tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
701 tree prev = NULL_TREE;
702 for (tree arg : strings)
704 if (prev && simple_cst_equal (arg, prev))
705 continue;
706 prev = arg;
707 if (!*end)
708 *end = tree_cons (NULL_TREE, arg, NULL_TREE);
709 else
710 TREE_VALUE (*end) = arg;
711 end = &TREE_CHAIN (*end);
713 *end = NULL_TREE;
714 return !use_old_attr;
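/* Editorial example, not part of the original file: if a declaration
   already carries [[arm::new("za")]] and a later [[arm::new("zt0", "za")]]
   is seen, the merged argument list is the sorted, deduplicated
   { "za", "zt0" }.  With CAN_MERGE_IN_PLACE the old attribute is updated
   in place and the function returns false, so the new attribute is
   dropped.  */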
717 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
719 static tree
720 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
721 int, bool *no_add_attrs)
723 /* Since we set fn_type_req to true, the caller should have checked
724 this for us. */
725 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
726 switch ((arm_pcs) fntype_abi (*node).id ())
728 case ARM_PCS_AAPCS64:
729 case ARM_PCS_SIMD:
730 return NULL_TREE;
732 case ARM_PCS_SVE:
733 error ("the %qE attribute cannot be applied to an SVE function type",
734 name);
735 *no_add_attrs = true;
736 return NULL_TREE;
738 case ARM_PCS_TLSDESC:
739 case ARM_PCS_UNKNOWN:
740 break;
742 gcc_unreachable ();
745 /* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
746 otherwise report an error. */
747 static bool
748 aarch64_check_arm_new_against_type (tree args, tree decl)
750 tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
751 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
753 tree value = TREE_VALUE (arg);
754 if (TREE_CODE (value) == STRING_CST)
756 const char *state_name = TREE_STRING_POINTER (value);
757 if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
759 error_at (DECL_SOURCE_LOCATION (decl),
760 "cannot create a new %qs scope since %qs is shared"
761 " with callers", state_name, state_name);
762 return false;
766 return true;
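/* Editorial example, not part of the original file: if a function type
   already carries [[arm::inout("za")]], then adding [[arm::new("za")]] to
   its definition is rejected here, since "za" is shared with callers and
   therefore cannot also begin life as a new private scope.  */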
769 /* Callback for arm::new attributes. */
770 static tree
771 handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
773 tree decl = *node;
774 if (TREE_CODE (decl) != FUNCTION_DECL)
776 error ("%qE attribute applies only to function definitions", name);
777 *no_add_attrs = true;
778 return NULL_TREE;
780 if (TREE_TYPE (decl) == error_mark_node)
782 *no_add_attrs = true;
783 return NULL_TREE;
786 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
787 aarch64_check_state_string (name, TREE_VALUE (arg));
789 if (!aarch64_check_arm_new_against_type (args, decl))
791 *no_add_attrs = true;
792 return NULL_TREE;
795 /* If there is an old attribute, we should try to update it in-place,
796 so that there is only one (definitive) arm::new attribute on the decl. */
797 tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
798 if (!aarch64_merge_string_arguments (args, old_attr, true))
799 *no_add_attrs = true;
801 return NULL_TREE;
804 /* Callback for arm::{in,out,inout,preserves} attributes. */
805 static tree
806 handle_arm_shared (tree *node, tree name, tree args,
807 int, bool *no_add_attrs)
809 tree type = *node;
810 tree old_attrs = TYPE_ATTRIBUTES (type);
811 auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
812 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
814 tree value = TREE_VALUE (arg);
815 if (aarch64_check_state_string (name, value))
817 const char *state_name = TREE_STRING_POINTER (value);
818 auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
819 state_name);
820 if (old_flags && old_flags != flags)
822 error ("inconsistent attributes for state %qs", state_name);
823 *no_add_attrs = true;
824 return NULL_TREE;
829 /* We can't update an old attribute in-place, since types are shared.
830 Instead make sure that this new attribute contains all the
831 information, so that the old attribute becomes redundant. */
832 tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
833 old_attrs);
834 if (!aarch64_merge_string_arguments (args, old_attr, false))
835 *no_add_attrs = true;
837 return NULL_TREE;
840 /* Mutually-exclusive function type attributes for controlling PSTATE.SM. */
841 static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
843 /* Attribute name exclusion applies to:
844 function, type, variable */
845 { "streaming", false, true, false },
846 { "streaming_compatible", false, true, false },
847 { NULL, false, false, false }
850 /* Table of machine attributes. */
851 static const attribute_spec aarch64_gnu_attributes[] =
853 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
854 affects_type_identity, handler, exclude } */
855 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
856 handle_aarch64_vector_pcs_attribute, NULL },
857 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
858 aarch64_sve::handle_arm_sve_vector_bits_attribute,
859 NULL },
860 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
861 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
862 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL }
865 static const scoped_attribute_specs aarch64_gnu_attribute_table =
867 "gnu", { aarch64_gnu_attributes }
870 static const attribute_spec aarch64_arm_attributes[] =
872 { "streaming", 0, 0, false, true, true, true,
873 NULL, attr_streaming_exclusions },
874 { "streaming_compatible", 0, 0, false, true, true, true,
875 NULL, attr_streaming_exclusions },
876 { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
877 { "new", 1, -1, true, false, false, false,
878 handle_arm_new, NULL },
879 { "preserves", 1, -1, false, true, true, true,
880 handle_arm_shared, NULL },
881 { "in", 1, -1, false, true, true, true,
882 handle_arm_shared, NULL },
883 { "out", 1, -1, false, true, true, true,
884 handle_arm_shared, NULL },
885 { "inout", 1, -1, false, true, true, true,
886 handle_arm_shared, NULL }
889 static const scoped_attribute_specs aarch64_arm_attribute_table =
891 "arm", { aarch64_arm_attributes }
894 static const scoped_attribute_specs *const aarch64_attribute_table[] =
896 &aarch64_gnu_attribute_table,
897 &aarch64_arm_attribute_table
900 typedef enum aarch64_cond_code
902 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
903 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
904 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
906 aarch64_cc;
908 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
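/* Editorial examples, not part of the original file: the enum above lists
   the codes in inverse pairs (eq/ne, cs/cc, ..., gt/le), so flipping the
   low bit inverts a condition:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */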
911 /* The condition codes of the processor, and the inverse function. */
912 static const char * const aarch64_condition_codes[] =
914 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
915 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
918 /* The preferred condition codes for SVE conditions. */
919 static const char *const aarch64_sve_condition_codes[] =
921 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
922 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
925 /* Return the assembly token for svpattern value PATTERN. */
927 static const char *
928 svpattern_token (enum aarch64_svpattern pattern)
930 switch (pattern)
932 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
933 AARCH64_FOR_SVPATTERN (CASE)
934 #undef CASE
935 case AARCH64_NUM_SVPATTERNS:
936 break;
938 gcc_unreachable ();
941 /* Return the location of a piece that is known to be passed or returned
942 in registers. FIRST_ZR is the first unused vector argument register
943 and FIRST_PR is the first unused predicate argument register. */
946 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
947 unsigned int first_pr) const
949 gcc_assert (VECTOR_MODE_P (mode)
950 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
951 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
953 if (num_zr > 0 && num_pr == 0)
954 return gen_rtx_REG (mode, first_zr);
956 if (num_zr == 0 && num_pr <= 2)
957 return gen_rtx_REG (mode, first_pr);
959 gcc_unreachable ();
962 /* Return the total number of vector registers required by the PST. */
964 unsigned int
965 pure_scalable_type_info::num_zr () const
967 unsigned int res = 0;
968 for (unsigned int i = 0; i < pieces.length (); ++i)
969 res += pieces[i].num_zr;
970 return res;
973 /* Return the total number of predicate registers required by the PST. */
975 unsigned int
976 pure_scalable_type_info::num_pr () const
978 unsigned int res = 0;
979 for (unsigned int i = 0; i < pieces.length (); ++i)
980 res += pieces[i].num_pr;
981 return res;
984 /* Return the location of a PST that is known to be passed or returned
985 in registers. FIRST_ZR is the first unused vector argument register
986 and FIRST_PR is the first unused predicate argument register. */
989 pure_scalable_type_info::get_rtx (machine_mode mode,
990 unsigned int first_zr,
991 unsigned int first_pr) const
993 /* Try to return a single REG if possible. This leads to better
994 code generation; it isn't required for correctness. */
995 if (mode == pieces[0].mode)
997 gcc_assert (pieces.length () == 1);
998 return pieces[0].get_rtx (first_zr, first_pr);
1001 /* Build up a PARALLEL that contains the individual pieces. */
1002 rtvec rtxes = rtvec_alloc (pieces.length ());
1003 for (unsigned int i = 0; i < pieces.length (); ++i)
1005 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1006 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1007 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1008 first_zr += pieces[i].num_zr;
1009 first_pr += pieces[i].num_pr;
1011 return gen_rtx_PARALLEL (mode, rtxes);
1014 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1015 in the AAPCS64. */
1017 pure_scalable_type_info::analysis_result
1018 pure_scalable_type_info::analyze (const_tree type)
1020 /* Prevent accidental reuse. */
1021 gcc_assert (pieces.is_empty ());
1023 /* No code will be generated for erroneous types, so we won't establish
1024 an ABI mapping. */
1025 if (type == error_mark_node)
1026 return NO_ABI_IDENTITY;
1028 /* Zero-sized types disappear in the language->ABI mapping. */
1029 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1030 return NO_ABI_IDENTITY;
1032 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1033 piece p = {};
1034 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1036 machine_mode mode = TYPE_MODE_RAW (type);
1037 gcc_assert (VECTOR_MODE_P (mode)
1038 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1040 p.mode = p.orig_mode = mode;
1041 add_piece (p);
1042 return IS_PST;
1045 /* Check for user-defined PSTs. */
1046 if (TREE_CODE (type) == ARRAY_TYPE)
1047 return analyze_array (type);
1048 if (TREE_CODE (type) == RECORD_TYPE)
1049 return analyze_record (type);
1051 return ISNT_PST;
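/* Editorial examples, not part of the original file: the ACLE types
   svfloat32_t and svbool_t analyze as IS_PST via the built-in check above
   (one vector piece and one predicate piece respectively), whereas a plain
   "double", or a struct containing only scalar fields, analyzes as
   ISNT_PST.  */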
1054 /* Analyze a type that is known not to be passed or returned in memory.
1055 Return true if it has an ABI identity and is a Pure Scalable Type. */
1057 bool
1058 pure_scalable_type_info::analyze_registers (const_tree type)
1060 analysis_result result = analyze (type);
1061 gcc_assert (result != DOESNT_MATTER);
1062 return result == IS_PST;
1065 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1067 pure_scalable_type_info::analysis_result
1068 pure_scalable_type_info::analyze_array (const_tree type)
1070 /* Analyze the element type. */
1071 pure_scalable_type_info element_info;
1072 analysis_result result = element_info.analyze (TREE_TYPE (type));
1073 if (result != IS_PST)
1074 return result;
1076 /* An array of unknown, flexible or variable length will be passed and
1077 returned by reference whatever we do. */
1078 tree nelts_minus_one = array_type_nelts (type);
1079 if (!tree_fits_uhwi_p (nelts_minus_one))
1080 return DOESNT_MATTER;
1082 /* Likewise if the array is constant-sized but too big to be interesting.
1083 The double checks against MAX_PIECES are to protect against overflow. */
1084 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1085 if (count > MAX_PIECES)
1086 return DOESNT_MATTER;
1087 count += 1;
1088 if (count * element_info.pieces.length () > MAX_PIECES)
1089 return DOESNT_MATTER;
1091 /* The above checks should have weeded out elements of unknown size. */
1092 poly_uint64 element_bytes;
1093 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1094 gcc_unreachable ();
1096 /* Build up the list of individual vectors and predicates. */
1097 gcc_assert (!element_info.pieces.is_empty ());
1098 for (unsigned int i = 0; i < count; ++i)
1099 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1101 piece p = element_info.pieces[j];
1102 p.offset += i * element_bytes;
1103 add_piece (p);
1105 return IS_PST;
1108 /* Subroutine of analyze for handling RECORD_TYPEs. */
1110 pure_scalable_type_info::analysis_result
1111 pure_scalable_type_info::analyze_record (const_tree type)
1113 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1115 if (TREE_CODE (field) != FIELD_DECL)
1116 continue;
1118 /* Zero-sized fields disappear in the language->ABI mapping. */
1119 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1120 continue;
1122 /* All fields with an ABI identity must be PSTs for the record as
1123 a whole to be a PST. If any individual field is too big to be
1124 interesting then the record is too. */
1125 pure_scalable_type_info field_info;
1126 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1127 if (subresult == NO_ABI_IDENTITY)
1128 continue;
1129 if (subresult != IS_PST)
1130 return subresult;
1132 /* Since all previous fields are PSTs, we ought to be able to track
1133 the field offset using poly_ints. */
1134 tree bitpos = bit_position (field);
1135 gcc_assert (poly_int_tree_p (bitpos));
1137 /* For the same reason, it shouldn't be possible to create a PST field
1138 whose offset isn't byte-aligned. */
1139 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1140 BITS_PER_UNIT);
1142 /* Punt if the record is too big to be interesting. */
1143 poly_uint64 bytepos;
1144 if (!wide_bytepos.to_uhwi (&bytepos)
1145 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1146 return DOESNT_MATTER;
1148 /* Add the individual vectors and predicates in the field to the
1149 record's list. */
1150 gcc_assert (!field_info.pieces.is_empty ());
1151 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1153 piece p = field_info.pieces[i];
1154 p.offset += bytepos;
1155 add_piece (p);
1158 /* Empty structures disappear in the language->ABI mapping. */
1159 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1162 /* Add P to the list of pieces in the type. */
1164 void
1165 pure_scalable_type_info::add_piece (const piece &p)
1167 /* Try to fold the new piece into the previous one to form a
1168 single-mode PST. For example, if we see three consecutive vectors
1169 of the same mode, we can represent them using the corresponding
1170 3-tuple mode.
1172 This is purely an optimization. */
1173 if (!pieces.is_empty ())
1175 piece &prev = pieces.last ();
1176 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1177 unsigned int nelems1, nelems2;
1178 if (prev.orig_mode == p.orig_mode
1179 && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
1180 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1181 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1182 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1183 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1184 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1185 && targetm.array_mode (p.orig_mode,
1186 nelems1 + nelems2).exists (&prev.mode))
1188 prev.num_zr += p.num_zr;
1189 prev.num_pr += p.num_pr;
1190 return;
1193 pieces.quick_push (p);
1196 /* Return true if at least one possible value of type TYPE includes at
1197 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1199 This is a relatively expensive test for some types, so it should
1200 generally be made as late as possible. */
1202 static bool
1203 aarch64_some_values_include_pst_objects_p (const_tree type)
1205 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1206 return false;
1208 if (aarch64_sve::builtin_type_p (type))
1209 return true;
1211 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1212 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1214 if (RECORD_OR_UNION_TYPE_P (type))
1215 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1216 if (TREE_CODE (field) == FIELD_DECL
1217 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1218 return true;
1220 return false;
1223 /* Return the descriptor of the SIMD ABI. */
1225 static const predefined_function_abi &
1226 aarch64_simd_abi (void)
1228 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1229 if (!simd_abi.initialized_p ())
1231 HARD_REG_SET full_reg_clobbers
1232 = default_function_abi.full_reg_clobbers ();
1233 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1234 if (FP_SIMD_SAVED_REGNUM_P (regno))
1235 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1236 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1238 return simd_abi;
1241 /* Return the descriptor of the SVE PCS. */
1243 static const predefined_function_abi &
1244 aarch64_sve_abi (void)
1246 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1247 if (!sve_abi.initialized_p ())
1249 HARD_REG_SET full_reg_clobbers
1250 = default_function_abi.full_reg_clobbers ();
1251 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1252 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1253 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1254 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1255 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1257 return sve_abi;
1260 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
1261 wraps, otherwise return X itself. */
1263 static rtx
1264 strip_salt (rtx x)
1266 rtx search = x;
1267 if (GET_CODE (search) == CONST)
1268 search = XEXP (search, 0);
1269 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
1270 x = XVECEXP (search, 0, 0);
1271 return x;
1274 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
1275 expression. */
1277 static rtx
1278 strip_offset_and_salt (rtx addr, poly_int64 *offset)
1280 return strip_salt (strip_offset (addr, offset));
1283 /* Generate code to enable conditional branches in functions over 1 MiB. */
1284 const char *
1285 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1286 const char * branch_format)
1288 rtx_code_label * tmp_label = gen_label_rtx ();
1289 char label_buf[256];
1290 char buffer[128];
1291 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1292 CODE_LABEL_NUMBER (tmp_label));
1293 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1294 rtx dest_label = operands[pos_label];
1295 operands[pos_label] = tmp_label;
1297 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1298 output_asm_insn (buffer, operands);
1300 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1301 operands[pos_label] = dest_label;
1302 output_asm_insn (buffer, operands);
1303 return "";
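/* Editorial sketch, not part of the original file, of the sequence the
   function above emits.  Callers typically pass BRANCH_FORMAT with the
   condition already inverted, so a conditional branch whose target lies
   beyond the +/-1 MiB range of B.cond becomes roughly:

     b.ne    .Lbcond7        // inverted condition skips the long branch
     b       far_target      // unconditional B has a +/-128 MiB range
   .Lbcond7:

   where the label name, condition and target are only illustrative.  */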
1306 void
1307 aarch64_err_no_fpadvsimd (machine_mode mode)
1309 if (TARGET_GENERAL_REGS_ONLY)
1310 if (FLOAT_MODE_P (mode))
1311 error ("%qs is incompatible with the use of floating-point types",
1312 "-mgeneral-regs-only");
1313 else
1314 error ("%qs is incompatible with the use of vector types",
1315 "-mgeneral-regs-only");
1316 else
1317 if (FLOAT_MODE_P (mode))
1318 error ("%qs feature modifier is incompatible with the use of"
1319 " floating-point types", "+nofp");
1320 else
1321 error ("%qs feature modifier is incompatible with the use of"
1322 " vector types", "+nofp");
1325 /* Report when we try to do something that requires SVE when SVE is disabled.
1326 This is an error of last resort and isn't very high-quality. It usually
1327 involves attempts to measure the vector length in some way. */
1328 static void
1329 aarch64_report_sve_required (void)
1331 static bool reported_p = false;
1333 /* Avoid reporting a slew of messages for a single oversight. */
1334 if (reported_p)
1335 return;
1337 error ("this operation requires the SVE ISA extension");
1338 inform (input_location, "you can enable SVE using the command-line"
1339 " option %<-march%>, or by using the %<target%>"
1340 " attribute or pragma");
1341 reported_p = true;
1344 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1345 registers. */
1346 inline bool
1347 pr_or_ffr_regnum_p (unsigned int regno)
1349 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1352 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1353 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1354 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1355 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1356 and GENERAL_REGS is lower than the memory cost (in this case the best class
1357 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1358 cost results in bad allocations with many redundant int<->FP moves which
1359 are expensive on various cores.
1360 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1361 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1362 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1363 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1364 The result of this is that it is no longer inefficient to have a higher
1365 memory move cost than the register move cost.
1368 static reg_class_t
1369 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1370 reg_class_t best_class)
1372 machine_mode mode;
1374 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1375 || !reg_class_subset_p (FP_REGS, allocno_class))
1376 return allocno_class;
1378 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1379 || !reg_class_subset_p (FP_REGS, best_class))
1380 return best_class;
1382 mode = PSEUDO_REGNO_MODE (regno);
1383 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1386 static unsigned int
1387 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1389 if (GET_MODE_UNIT_SIZE (mode) == 4)
1390 return aarch64_tune_params.min_div_recip_mul_sf;
1391 return aarch64_tune_params.min_div_recip_mul_df;
1394 /* Return the reassociation width of treeop OPC with mode MODE. */
1395 static int
1396 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1398 if (VECTOR_MODE_P (mode))
1399 return aarch64_tune_params.vec_reassoc_width;
1400 if (INTEGRAL_MODE_P (mode))
1401 return aarch64_tune_params.int_reassoc_width;
1402 /* Reassociation reduces the number of FMAs which may result in worse
1403 performance. Use a per-CPU setting for FMA reassociation which allows
1404 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
1405 CPUs with many FP pipes to enable reassociation.
1406 Since the reassociation pass doesn't understand FMA at all, assume
1407 that any FP addition might turn into FMA. */
1408 if (FLOAT_MODE_P (mode))
1409 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
1410 : aarch64_tune_params.fp_reassoc_width;
1411 return 1;
1414 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1415 unsigned
1416 aarch64_debugger_regno (unsigned regno)
1418 if (GP_REGNUM_P (regno))
1419 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1420 else if (regno == SP_REGNUM)
1421 return AARCH64_DWARF_SP;
1422 else if (FP_REGNUM_P (regno))
1423 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1424 else if (PR_REGNUM_P (regno))
1425 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1426 else if (regno == VG_REGNUM)
1427 return AARCH64_DWARF_VG;
1429 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1430 equivalent DWARF register. */
1431 return DWARF_FRAME_REGISTERS;
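/* Editorial examples, not part of the original file, assuming the usual
   AArch64 DWARF numbering (AARCH64_DWARF_R0 == 0, AARCH64_DWARF_SP == 31,
   AARCH64_DWARF_V0 == 64, AARCH64_DWARF_P0 == 48, AARCH64_DWARF_VG == 46):

     x0 -> 0,  x29 -> 29,  sp -> 31,  v0 -> 64,  p0 -> 48,  vg -> 46

   Registers with no DWARF equivalent map to DWARF_FRAME_REGISTERS.  */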
1434 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
1435 static machine_mode
1436 aarch64_dwarf_frame_reg_mode (int regno)
1438 /* Predicate registers are call-clobbered in the EH ABI (which is
1439 ARM_PCS_AAPCS64), so they should not be described by CFI.
1440 Their size changes as VL changes, so any values computed by
1441 __builtin_init_dwarf_reg_size_table might not be valid for
1442 all frames. */
1443 if (PR_REGNUM_P (regno))
1444 return VOIDmode;
1445 return default_dwarf_frame_reg_mode (regno);
1448 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1449 integer, otherwise return X unmodified. */
1450 static rtx
1451 aarch64_bit_representation (rtx x)
1453 if (CONST_DOUBLE_P (x))
1454 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1455 return x;
1458 /* Return an estimate for the number of quadwords in an SVE vector. This is
1459 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
1460 static unsigned int
1461 aarch64_estimated_sve_vq ()
1463 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
1466 /* Return true if MODE is an SVE predicate mode. */
1467 static bool
1468 aarch64_sve_pred_mode_p (machine_mode mode)
1470 return (TARGET_SVE
1471 && (mode == VNx16BImode
1472 || mode == VNx8BImode
1473 || mode == VNx4BImode
1474 || mode == VNx2BImode));
1477 /* Three mutually-exclusive flags describing a vector or predicate type. */
1478 const unsigned int VEC_ADVSIMD = 1;
1479 const unsigned int VEC_SVE_DATA = 2;
1480 const unsigned int VEC_SVE_PRED = 4;
1481 /* Indicates a structure of 2, 3 or 4 vectors or predicates. */
1482 const unsigned int VEC_STRUCT = 8;
1483 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1484 vector has fewer significant bytes than a full SVE vector. */
1485 const unsigned int VEC_PARTIAL = 16;
1486 /* Useful combinations of the above. */
1487 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1488 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
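/* Editorial examples, not part of the original file, of the classification
   performed by aarch64_classify_vector_mode below on a target with SVE
   enabled:

     V4SImode     -> VEC_ADVSIMD                 (128-bit Advanced SIMD)
     V2x16QImode  -> VEC_ADVSIMD | VEC_STRUCT    (pair of Q registers)
     VNx4SImode   -> VEC_SVE_DATA                (full SVE vector)
     VNx2SImode   -> VEC_SVE_DATA | VEC_PARTIAL  (32-bit elements in
                                                  64-bit containers)
     VNx32QImode  -> VEC_SVE_DATA | VEC_STRUCT   (pair of SVE vectors)
     VNx4BImode   -> VEC_SVE_PRED  */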
1490 /* Return a set of flags describing the vector properties of mode MODE.
1491 If ANY_TARGET_P is false (the default), ignore modes that are not supported
1492 by the current target. Otherwise categorize the modes that can be used
1493 with the set of all targets supported by the port. */
1495 static unsigned int
1496 aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
1498 if (aarch64_sve_pred_mode_p (mode))
1499 return VEC_SVE_PRED;
1501 /* Make the decision based on the mode's enum value rather than its
1502 properties, so that we keep the correct classification regardless
1503 of -msve-vector-bits. */
1504 switch (mode)
1506 /* Partial SVE QI vectors. */
1507 case E_VNx2QImode:
1508 case E_VNx4QImode:
1509 case E_VNx8QImode:
1510 /* Partial SVE HI vectors. */
1511 case E_VNx2HImode:
1512 case E_VNx4HImode:
1513 /* Partial SVE SI vector. */
1514 case E_VNx2SImode:
1515 /* Partial SVE HF vectors. */
1516 case E_VNx2HFmode:
1517 case E_VNx4HFmode:
1518 /* Partial SVE BF vectors. */
1519 case E_VNx2BFmode:
1520 case E_VNx4BFmode:
1521 /* Partial SVE SF vector. */
1522 case E_VNx2SFmode:
1523 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1525 case E_VNx16QImode:
1526 case E_VNx8HImode:
1527 case E_VNx4SImode:
1528 case E_VNx2DImode:
1529 case E_VNx8BFmode:
1530 case E_VNx8HFmode:
1531 case E_VNx4SFmode:
1532 case E_VNx2DFmode:
1533 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;
1535 /* x2 SVE vectors. */
1536 case E_VNx32QImode:
1537 case E_VNx16HImode:
1538 case E_VNx8SImode:
1539 case E_VNx4DImode:
1540 case E_VNx16BFmode:
1541 case E_VNx16HFmode:
1542 case E_VNx8SFmode:
1543 case E_VNx4DFmode:
1544 /* x3 SVE vectors. */
1545 case E_VNx48QImode:
1546 case E_VNx24HImode:
1547 case E_VNx12SImode:
1548 case E_VNx6DImode:
1549 case E_VNx24BFmode:
1550 case E_VNx24HFmode:
1551 case E_VNx12SFmode:
1552 case E_VNx6DFmode:
1553 /* x4 SVE vectors. */
1554 case E_VNx64QImode:
1555 case E_VNx32HImode:
1556 case E_VNx16SImode:
1557 case E_VNx8DImode:
1558 case E_VNx32BFmode:
1559 case E_VNx32HFmode:
1560 case E_VNx16SFmode:
1561 case E_VNx8DFmode:
1562 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;
1564 case E_OImode:
1565 case E_CImode:
1566 case E_XImode:
1567 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1569 /* Structures of 64-bit Advanced SIMD vectors. */
1570 case E_V2x8QImode:
1571 case E_V2x4HImode:
1572 case E_V2x2SImode:
1573 case E_V2x1DImode:
1574 case E_V2x4BFmode:
1575 case E_V2x4HFmode:
1576 case E_V2x2SFmode:
1577 case E_V2x1DFmode:
1578 case E_V3x8QImode:
1579 case E_V3x4HImode:
1580 case E_V3x2SImode:
1581 case E_V3x1DImode:
1582 case E_V3x4BFmode:
1583 case E_V3x4HFmode:
1584 case E_V3x2SFmode:
1585 case E_V3x1DFmode:
1586 case E_V4x8QImode:
1587 case E_V4x4HImode:
1588 case E_V4x2SImode:
1589 case E_V4x1DImode:
1590 case E_V4x4BFmode:
1591 case E_V4x4HFmode:
1592 case E_V4x2SFmode:
1593 case E_V4x1DFmode:
1594 return (TARGET_FLOAT || any_target_p)
1595 ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
1597 /* Structures of 128-bit Advanced SIMD vectors. */
1598 case E_V2x16QImode:
1599 case E_V2x8HImode:
1600 case E_V2x4SImode:
1601 case E_V2x2DImode:
1602 case E_V2x8BFmode:
1603 case E_V2x8HFmode:
1604 case E_V2x4SFmode:
1605 case E_V2x2DFmode:
1606 case E_V3x16QImode:
1607 case E_V3x8HImode:
1608 case E_V3x4SImode:
1609 case E_V3x2DImode:
1610 case E_V3x8BFmode:
1611 case E_V3x8HFmode:
1612 case E_V3x4SFmode:
1613 case E_V3x2DFmode:
1614 case E_V4x16QImode:
1615 case E_V4x8HImode:
1616 case E_V4x4SImode:
1617 case E_V4x2DImode:
1618 case E_V4x8BFmode:
1619 case E_V4x8HFmode:
1620 case E_V4x4SFmode:
1621 case E_V4x2DFmode:
1622 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1624 /* 64-bit Advanced SIMD vectors. */
1625 case E_V8QImode:
1626 case E_V4HImode:
1627 case E_V2SImode:
1628 case E_V1DImode:
1629 case E_V4HFmode:
1630 case E_V4BFmode:
1631 case E_V2SFmode:
1632 case E_V1DFmode:
1633 /* 128-bit Advanced SIMD vectors. */
1634 case E_V16QImode:
1635 case E_V8HImode:
1636 case E_V4SImode:
1637 case E_V2DImode:
1638 case E_V8HFmode:
1639 case E_V8BFmode:
1640 case E_V4SFmode:
1641 case E_V2DFmode:
1642 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;
1644 case E_VNx32BImode:
1645 return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;
1647 default:
1648 return 0;
1652 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1653 bool
1654 aarch64_advsimd_struct_mode_p (machine_mode mode)
1656 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1657 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
1660 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1661 static bool
1662 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
1664 return (aarch64_classify_vector_mode (mode)
1665 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
1668 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1669 static bool
1670 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
1672 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
1675 /* Return true if MODE is any of the data vector modes, including
1676 structure modes. */
1677 static bool
1678 aarch64_vector_data_mode_p (machine_mode mode)
1680 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1683 /* Return true if MODE is any form of SVE mode, including predicates,
1684 vectors and structures. */
1685 bool
1686 aarch64_sve_mode_p (machine_mode mode)
1688 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1691 /* Return true if MODE is an SVE data vector mode; either a single vector
1692 or a structure of vectors. */
1693 static bool
1694 aarch64_sve_data_mode_p (machine_mode mode)
1696 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1699 /* Return the number of defined bytes in one constituent vector of
1700 SVE mode MODE, which has vector flags VEC_FLAGS. */
1701 static poly_int64
1702 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1704 if (vec_flags & VEC_PARTIAL)
1705 /* A single partial vector. */
1706 return GET_MODE_SIZE (mode);
1708 if (vec_flags & VEC_SVE_DATA)
1709 /* A single vector or a tuple. */
1710 return BYTES_PER_SVE_VECTOR;
1712 /* A single predicate. */
1713 gcc_assert (vec_flags & VEC_SVE_PRED);
1714 return BYTES_PER_SVE_PRED;
1717 /* If MODE holds an array of vectors, return the number of vectors
1718 in the array, otherwise return 1. */
1720 static unsigned int
1721 aarch64_ldn_stn_vectors (machine_mode mode)
1723 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1724 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
1725 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
1726 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
1727 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
1728 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
1729 return exact_div (GET_MODE_SIZE (mode),
1730 BYTES_PER_SVE_VECTOR).to_constant ();
1731 return 1;
1734 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1735 corresponding vector structure mode. */
1736 static opt_machine_mode
1737 aarch64_advsimd_vector_array_mode (machine_mode mode,
1738 unsigned HOST_WIDE_INT nelems)
1740 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
1741 if (known_eq (GET_MODE_SIZE (mode), 8))
1742 flags |= VEC_PARTIAL;
1744 machine_mode struct_mode;
1745 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
1746 if (aarch64_classify_vector_mode (struct_mode) == flags
1747 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
1748 && known_eq (GET_MODE_NUNITS (struct_mode),
1749 GET_MODE_NUNITS (mode) * nelems))
1750 return struct_mode;
1751 return opt_machine_mode ();
1754 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1756 opt_machine_mode
1757 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1759 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1760 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1761 machine_mode mode;
1762 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1763 if (inner_mode == GET_MODE_INNER (mode)
1764 && known_eq (nunits, GET_MODE_NUNITS (mode))
1765 && aarch64_sve_data_mode_p (mode))
1766 return mode;
1767 return opt_machine_mode ();
1770 /* Implement target hook TARGET_ARRAY_MODE. */
1771 static opt_machine_mode
1772 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1774 if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1776 /* Use VNx32BI for pairs of predicates, but explicitly reject giving
1777 a mode to other array sizes. Using integer modes requires a round
1778 trip through memory and generates terrible code. */
1779 if (nelems == 1)
1780 return mode;
1781 if (mode == VNx16BImode && nelems == 2)
1782 return VNx32BImode;
1783 return BLKmode;
1786 auto flags = aarch64_classify_vector_mode (mode);
1787 if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
1788 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
1789 GET_MODE_NUNITS (mode) * nelems);
1791 if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
1792 return aarch64_advsimd_vector_array_mode (mode, nelems);
1794 return opt_machine_mode ();
1797 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1798 static bool
1799 aarch64_array_mode_supported_p (machine_mode mode,
1800 unsigned HOST_WIDE_INT nelems)
1802 if (TARGET_BASE_SIMD
1803 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1804 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1805 && (nelems >= 2 && nelems <= 4))
1806 return true;
1808 return false;
1811 /* MODE is some form of SVE vector mode. For data modes, return the number
1812 of vector register bits that each element of MODE occupies, such as 64
1813 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1814 in a 64-bit container). For predicate modes, return the number of
1815 data bits controlled by each significant predicate bit. */
1817 static unsigned int
1818 aarch64_sve_container_bits (machine_mode mode)
1820 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1821 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1822 ? BITS_PER_SVE_VECTOR
1823 : GET_MODE_BITSIZE (mode));
1824 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1827 /* Return the SVE predicate mode to use for elements that have
1828 ELEM_NBYTES bytes, if such a mode exists. */
1830 opt_machine_mode
1831 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1833 if (TARGET_SVE)
1835 if (elem_nbytes == 1)
1836 return VNx16BImode;
1837 if (elem_nbytes == 2)
1838 return VNx8BImode;
1839 if (elem_nbytes == 4)
1840 return VNx4BImode;
1841 if (elem_nbytes == 8)
1842 return VNx2BImode;
1844 return opt_machine_mode ();
1847 /* Return the SVE predicate mode that should be used to control
1848 SVE mode MODE. */
1850 machine_mode
1851 aarch64_sve_pred_mode (machine_mode mode)
1853 unsigned int bits = aarch64_sve_container_bits (mode);
1854 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
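/* For example, following the container-size rule above: VNx2DImode and
   VNx2SImode both use 64-bit containers and are therefore governed by
   VNx2BImode, whereas VNx16QImode uses 8-bit containers and is governed
   by VNx16BImode.  */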
1857 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1859 static opt_machine_mode
1860 aarch64_get_mask_mode (machine_mode mode)
1862 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1863 if (vec_flags & VEC_SVE_DATA)
1864 return aarch64_sve_pred_mode (mode);
1866 return default_get_mask_mode (mode);
1869 /* Return the integer element mode associated with SVE mode MODE. */
1871 static scalar_int_mode
1872 aarch64_sve_element_int_mode (machine_mode mode)
1874 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1875 ? BITS_PER_SVE_VECTOR
1876 : GET_MODE_BITSIZE (mode));
1877 unsigned int elt_bits = vector_element_size (vector_bits,
1878 GET_MODE_NUNITS (mode));
1879 return int_mode_for_size (elt_bits, 0).require ();
1882 /* Return an integer element mode that contains exactly
1883 aarch64_sve_container_bits (MODE) bits. This is wider than
1884 aarch64_sve_element_int_mode if MODE is a partial vector,
1885 otherwise it's the same. */
1887 static scalar_int_mode
1888 aarch64_sve_container_int_mode (machine_mode mode)
1890 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1893 /* Return the integer vector mode associated with SVE mode MODE.
1894 Unlike related_int_vector_mode, this can handle the case in which
1895 MODE is a predicate (and thus has a different total size). */
1897 machine_mode
1898 aarch64_sve_int_mode (machine_mode mode)
1900 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1901 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1904 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1906 static opt_machine_mode
1907 aarch64_vectorize_related_mode (machine_mode vector_mode,
1908 scalar_mode element_mode,
1909 poly_uint64 nunits)
1911 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1913 /* If we're operating on SVE vectors, try to return an SVE mode. */
1914 poly_uint64 sve_nunits;
1915 if ((vec_flags & VEC_SVE_DATA)
1916 && multiple_p (BYTES_PER_SVE_VECTOR,
1917 GET_MODE_SIZE (element_mode), &sve_nunits))
1919 machine_mode sve_mode;
1920 if (maybe_ne (nunits, 0U))
1922 /* Try to find a full or partial SVE mode with exactly
1923 NUNITS units. */
1924 if (multiple_p (sve_nunits, nunits)
1925 && aarch64_sve_data_mode (element_mode,
1926 nunits).exists (&sve_mode))
1927 return sve_mode;
1929 else
1931 /* Take the preferred number of units from the number of bytes
1932 that fit in VECTOR_MODE. We always start by "autodetecting"
1933 a full vector mode with preferred_simd_mode, so vectors
1934 chosen here will also be full vector modes. Then
1935 autovectorize_vector_modes tries smaller starting modes
1936 and thus smaller preferred numbers of units. */
1937 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1938 if (aarch64_sve_data_mode (element_mode,
1939 sve_nunits).exists (&sve_mode))
1940 return sve_mode;
1944 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1945 if (TARGET_SIMD
1946 && (vec_flags & VEC_ADVSIMD)
1947 && known_eq (nunits, 0U)
1948 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1949 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1950 * GET_MODE_NUNITS (vector_mode), 128U))
1952 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1953 if (VECTOR_MODE_P (res))
1954 return res;
1957 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1960 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
1962 static bool
1963 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
1965 machine_mode mode = TYPE_MODE (type);
1966 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1967 bool sve_p = (vec_flags & VEC_ANY_SVE);
1968 bool simd_p = (vec_flags & VEC_ADVSIMD);
1970 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
1973 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1974 prefer to use the first arithmetic operand as the else value if
1975 the else value doesn't matter, since that exactly matches the SVE
1976 destructive merging form. For ternary operations we could either
1977 pick the first operand and use FMAD-like instructions or the last
1978 operand and use FMLA-like instructions; the latter seems more
1979 natural. */
1981 static tree
1982 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1984 return nops == 3 ? ops[2] : ops[0];
1987 /* Implement TARGET_HARD_REGNO_NREGS. */
1989 static unsigned int
1990 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1992 /* ??? Logically we should only need to provide a value when
1993 HARD_REGNO_MODE_OK says that the combination is valid,
1994 but at the moment we need to handle all modes. Just ignore
1995 any runtime parts for registers that can't store them. */
1996 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1997 switch (aarch64_regno_regclass (regno))
1999 case FP_REGS:
2000 case FP_LO_REGS:
2001 case FP_LO8_REGS:
2003 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2004 if (vec_flags & VEC_SVE_DATA)
2005 return exact_div (GET_MODE_SIZE (mode),
2006 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2007 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
2008 return GET_MODE_SIZE (mode).to_constant () / 8;
2009 return CEIL (lowest_size, UNITS_PER_VREG);
2012 case PR_REGS:
2013 case PR_LO_REGS:
2014 case PR_HI_REGS:
2015 return mode == VNx32BImode ? 2 : 1;
2017 case FFR_REGS:
2018 case PR_AND_FFR_REGS:
2019 case FAKE_REGS:
2020 return 1;
2022 default:
2023 return CEIL (lowest_size, UNITS_PER_WORD);
2025 gcc_unreachable ();
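/* Some illustrative values for the cases above: a single SVE data vector
   occupies one FP register regardless of vector length, a structure of
   three SVE vectors occupies three, VNx32BImode occupies two predicate
   registers, and V8DImode falls through to the default case and occupies
   eight X registers.  */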
2028 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2030 static bool
2031 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2033 if (mode == V8DImode)
2034 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
2035 && multiple_p (regno - R0_REGNUM, 2);
2037 if (GET_MODE_CLASS (mode) == MODE_CC)
2038 return regno == CC_REGNUM;
2040 if (regno == VG_REGNUM)
2041 /* This must have the same size as _Unwind_Word. */
2042 return mode == DImode;
2044 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2045 if (vec_flags == VEC_SVE_PRED)
2046 return pr_or_ffr_regnum_p (regno);
2048 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
2049 return PR_REGNUM_P (regno);
2051 if (pr_or_ffr_regnum_p (regno))
2052 return false;
2054 /* These registers are abstract; their modes don't matter. */
2055 if (FAKE_REGNUM_P (regno))
2056 return true;
2058 if (regno == SP_REGNUM)
2059 /* The purpose of comparing with ptr_mode is to support the
2060 global register variable associated with the stack pointer
2061 register via the syntax of asm ("wsp") in ILP32. */
2062 return mode == Pmode || mode == ptr_mode;
2064 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2065 return mode == Pmode;
2067 if (GP_REGNUM_P (regno))
2069 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
2070 return false;
2071 if (known_le (GET_MODE_SIZE (mode), 8))
2072 return true;
2073 if (known_le (GET_MODE_SIZE (mode), 16))
2074 return (regno & 1) == 0;
2076 else if (FP_REGNUM_P (regno))
2078 if (vec_flags & VEC_STRUCT)
2079 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2080 else
2081 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2084 return false;
2087 /* Return true if a function with type FNTYPE returns its value in
2088 SVE vector or predicate registers. */
2090 static bool
2091 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2093 tree return_type = TREE_TYPE (fntype);
2095 pure_scalable_type_info pst_info;
2096 switch (pst_info.analyze (return_type))
2098 case pure_scalable_type_info::IS_PST:
2099 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2100 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2102 case pure_scalable_type_info::DOESNT_MATTER:
2103 gcc_assert (aarch64_return_in_memory_1 (return_type));
2104 return false;
2106 case pure_scalable_type_info::NO_ABI_IDENTITY:
2107 case pure_scalable_type_info::ISNT_PST:
2108 return false;
2110 gcc_unreachable ();
2113 /* Return true if a function with type FNTYPE takes arguments in
2114 SVE vector or predicate registers. */
2116 static bool
2117 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2119 CUMULATIVE_ARGS args_so_far_v;
2120 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2121 NULL_TREE, 0, true);
2122 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2124 for (tree chain = TYPE_ARG_TYPES (fntype);
2125 chain && chain != void_list_node;
2126 chain = TREE_CHAIN (chain))
2128 tree arg_type = TREE_VALUE (chain);
2129 if (arg_type == error_mark_node)
2130 return false;
2132 function_arg_info arg (arg_type, /*named=*/true);
2133 apply_pass_by_reference_rules (&args_so_far_v, arg);
2134 pure_scalable_type_info pst_info;
2135 if (pst_info.analyze_registers (arg.type))
2137 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2138 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2139 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2140 return true;
2143 targetm.calls.function_arg_advance (args_so_far, arg);
2145 return false;
2148 /* Implement TARGET_FNTYPE_ABI. */
2150 static const predefined_function_abi &
2151 aarch64_fntype_abi (const_tree fntype)
2153 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2154 return aarch64_simd_abi ();
2156 if (aarch64_returns_value_in_sve_regs_p (fntype)
2157 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2158 return aarch64_sve_abi ();
2160 return default_function_abi;
2163 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2165 static aarch64_feature_flags
2166 aarch64_fntype_pstate_sm (const_tree fntype)
2168 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
2169 return AARCH64_FL_SM_ON;
2171 if (lookup_attribute ("arm", "streaming_compatible",
2172 TYPE_ATTRIBUTES (fntype)))
2173 return 0;
2175 return AARCH64_FL_SM_OFF;
2178 /* Return state flags that describe whether and how functions of type
2179 FNTYPE share state STATE_NAME with their callers. */
2181 static unsigned int
2182 aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
2184 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
2185 state_name);
2188 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2190 static aarch64_feature_flags
2191 aarch64_fntype_pstate_za (const_tree fntype)
2193 if (aarch64_fntype_shared_flags (fntype, "za")
2194 || aarch64_fntype_shared_flags (fntype, "zt0"))
2195 return AARCH64_FL_ZA_ON;
2197 return 0;
2200 /* Return the ISA mode on entry to functions of type FNTYPE. */
2202 static aarch64_feature_flags
2203 aarch64_fntype_isa_mode (const_tree fntype)
2205 return (aarch64_fntype_pstate_sm (fntype)
2206 | aarch64_fntype_pstate_za (fntype));
2209 /* Return true if FNDECL uses streaming mode internally, as an
2210 implementation choice. */
2212 static bool
2213 aarch64_fndecl_is_locally_streaming (const_tree fndecl)
2215 return lookup_attribute ("arm", "locally_streaming",
2216 DECL_ATTRIBUTES (fndecl));
2219 /* Return the state of PSTATE.SM when compiling the body of
2220 function FNDECL. This might be different from the state of
2221 PSTATE.SM on entry. */
2223 static aarch64_feature_flags
2224 aarch64_fndecl_pstate_sm (const_tree fndecl)
2226 if (aarch64_fndecl_is_locally_streaming (fndecl))
2227 return AARCH64_FL_SM_ON;
2229 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
2232 /* Return true if function FNDECL has state STATE_NAME, either by creating
2233 new state itself or by sharing state with callers. */
2235 static bool
2236 aarch64_fndecl_has_state (tree fndecl, const char *state_name)
2238 return (aarch64_fndecl_has_new_state (fndecl, state_name)
2239 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
2240 state_name) != 0);
2243 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2244 This might be different from the state of PSTATE.ZA on entry. */
2246 static aarch64_feature_flags
2247 aarch64_fndecl_pstate_za (const_tree fndecl)
2249 if (aarch64_fndecl_has_new_state (fndecl, "za")
2250 || aarch64_fndecl_has_new_state (fndecl, "zt0"))
2251 return AARCH64_FL_ZA_ON;
2253 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
2256 /* Return the ISA mode that should be used to compile the body of
2257 function FNDECL. */
2259 static aarch64_feature_flags
2260 aarch64_fndecl_isa_mode (const_tree fndecl)
2262 return (aarch64_fndecl_pstate_sm (fndecl)
2263 | aarch64_fndecl_pstate_za (fndecl));
2266 /* Return the state of PSTATE.SM on entry to the current function.
2267 This might be different from the state of PSTATE.SM in the function
2268 body. */
2270 static aarch64_feature_flags
2271 aarch64_cfun_incoming_pstate_sm ()
2273 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
2276 /* Return the state of PSTATE.ZA on entry to the current function.
2277 This might be different from the state of PSTATE.ZA in the function
2278 body. */
2280 static aarch64_feature_flags
2281 aarch64_cfun_incoming_pstate_za ()
2283 return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
2286 /* Return state flags that describe whether and how the current function shares
2287 state STATE_NAME with callers. */
2289 static unsigned int
2290 aarch64_cfun_shared_flags (const char *state_name)
2292 return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
2295 /* Return true if the current function creates new state of type STATE_NAME
2296 (as opposed to sharing the state with its callers or ignoring the state
2297 altogether). */
2299 static bool
2300 aarch64_cfun_has_new_state (const char *state_name)
2302 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
2305 /* Return true if PSTATE.SM is 1 in the body of the current function,
2306 but is not guaranteed to be 1 on entry. */
2308 static bool
2309 aarch64_cfun_enables_pstate_sm ()
2311 return (aarch64_fndecl_is_locally_streaming (cfun->decl)
2312 && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
2315 /* Return true if the current function has state STATE_NAME, either by
2316 creating new state itself or by sharing state with callers. */
2318 static bool
2319 aarch64_cfun_has_state (const char *state_name)
2321 return aarch64_fndecl_has_state (cfun->decl, state_name);
2324 /* Return true if a call from the current function to a function with
2325 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2326 the BL instruction. */
2328 static bool
2329 aarch64_call_switches_pstate_sm (aarch64_feature_flags callee_mode)
2331 return (callee_mode & ~AARCH64_ISA_MODE & AARCH64_FL_SM_STATE) != 0;
2334 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2336 static bool
2337 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2339 return (aarch64_sve::builtin_type_p (type1)
2340 == aarch64_sve::builtin_type_p (type2));
2343 /* Return true if we should emit CFI for register REGNO. */
2345 static bool
2346 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2348 return (GP_REGNUM_P (regno)
2349 || !default_function_abi.clobbers_full_reg_p (regno));
2352 /* Return the mode we should use to save and restore register REGNO. */
2354 static machine_mode
2355 aarch64_reg_save_mode (unsigned int regno)
2357 if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
2358 return DImode;
2360 if (FP_REGNUM_P (regno))
2361 switch (crtl->abi->id ())
2363 case ARM_PCS_AAPCS64:
2364 /* Only the low 64 bits are saved by the base PCS. */
2365 return DFmode;
2367 case ARM_PCS_SIMD:
2368 /* The vector PCS saves the low 128 bits (which is the full
2369 register on non-SVE targets). */
2370 return V16QImode;
2372 case ARM_PCS_SVE:
2373 /* Use vectors of DImode for registers that need frame
2374 information, so that the first 64 bits of the save slot

2375 are always the equivalent of what storing D<n> would give. */
2376 if (aarch64_emit_cfi_for_reg_p (regno))
2377 return VNx2DImode;
2379 /* Use vectors of bytes otherwise, so that the layout is
2380 endian-agnostic, and so that we can use LDR and STR for
2381 big-endian targets. */
2382 return VNx16QImode;
2384 case ARM_PCS_TLSDESC:
2385 case ARM_PCS_UNKNOWN:
2386 break;
2389 if (PR_REGNUM_P (regno))
2390 /* Save the full predicate register. */
2391 return VNx16BImode;
2393 gcc_unreachable ();
2396 /* Given the ISA mode on entry to a callee and the ABI of the callee,
2397 return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx. */
2400 aarch64_gen_callee_cookie (aarch64_feature_flags isa_mode, arm_pcs pcs_variant)
2402 return gen_int_mode ((unsigned int) isa_mode
2403 | (unsigned int) pcs_variant << AARCH64_NUM_ISA_MODES,
2404 DImode);
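/* In other words, the low AARCH64_NUM_ISA_MODES bits of the cookie hold
   the ISA mode flags and the remaining bits hold the arm_pcs value;
   aarch64_callee_abi and aarch64_callee_isa_mode below recover the two
   fields by shifting and masking the same constant.  */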
2407 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2408 callee's ABI. */
2410 static const predefined_function_abi &
2411 aarch64_callee_abi (rtx cookie)
2413 return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
2416 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2417 required ISA mode on entry to the callee, which is also the ISA
2418 mode on return from the callee. */
2420 static aarch64_feature_flags
2421 aarch64_callee_isa_mode (rtx cookie)
2423 return UINTVAL (cookie) & AARCH64_FL_ISA_MODES;
2426 /* INSN is a call instruction. Return the CONST_INT stored in its
2427 UNSPEC_CALLEE_ABI rtx. */
2429 static rtx
2430 aarch64_insn_callee_cookie (const rtx_insn *insn)
2432 rtx pat = PATTERN (insn);
2433 gcc_assert (GET_CODE (pat) == PARALLEL);
2434 rtx unspec = XVECEXP (pat, 0, 1);
2435 gcc_assert (GET_CODE (unspec) == UNSPEC
2436 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2437 return XVECEXP (unspec, 0, 0);
2440 /* Implement TARGET_INSN_CALLEE_ABI. */
2442 const predefined_function_abi &
2443 aarch64_insn_callee_abi (const rtx_insn *insn)
2445 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
2448 /* INSN is a call instruction. Return the required ISA mode on entry to
2449 the callee, which is also the ISA mode on return from the callee. */
2451 static aarch64_feature_flags
2452 aarch64_insn_callee_isa_mode (const rtx_insn *insn)
2454 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
2457 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2458 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2459 clobbers the top 64 bits when restoring the bottom 64 bits. */
2461 static bool
2462 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2463 unsigned int regno,
2464 machine_mode mode)
2466 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2468 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2469 unsigned int nregs = hard_regno_nregs (regno, mode);
2470 if (nregs > 1)
2471 per_register_size = exact_div (per_register_size, nregs);
2472 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2473 return maybe_gt (per_register_size, 16);
2474 return maybe_gt (per_register_size, 8);
2476 return false;
2479 /* Implement REGMODE_NATURAL_SIZE. */
2480 poly_uint64
2481 aarch64_regmode_natural_size (machine_mode mode)
2483 /* The natural size for SVE data modes is one SVE data vector,
2484 and similarly for predicates. We can't independently modify
2485 anything smaller than that. */
2486 /* ??? For now, only do this for variable-width SVE registers.
2487 Doing it for constant-sized registers breaks lower-subreg.cc. */
2488 /* ??? And once that's fixed, we should probably have similar
2489 code for Advanced SIMD. */
2490 if (!aarch64_sve_vg.is_constant ())
2492 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2493 if (vec_flags & VEC_SVE_PRED)
2494 return BYTES_PER_SVE_PRED;
2495 if (vec_flags & VEC_SVE_DATA)
2496 return BYTES_PER_SVE_VECTOR;
2498 return UNITS_PER_WORD;
2501 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2502 machine_mode
2503 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2504 machine_mode mode)
2506 /* The predicate mode determines which bits are significant and
2507 which are "don't care". Decreasing the number of lanes would
2508 lose data while increasing the number of lanes would make bits
2509 unnecessarily significant. */
2510 if (PR_REGNUM_P (regno))
2511 return mode;
2512 if (known_ge (GET_MODE_SIZE (mode), 4))
2513 return mode;
2514 else
2515 return SImode;
2518 /* Return true if I's bits are consecutive ones from the MSB. */
2519 bool
2520 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2522 return exact_log2 (-i) != HOST_WIDE_INT_M1;
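/* Two illustrative values: for I == -256 (0xff...f00), -I is 256, a power
   of two, so the function returns true; for I == -255 (0xff...f01), -I is
   255, which is not a power of two, so it returns false.  */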
2525 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2526 that strcpy from constants will be faster. */
2528 static HOST_WIDE_INT
2529 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2531 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2532 return MAX (align, BITS_PER_WORD);
2533 return align;
2536 /* Return true if calls to DECL should be treated as
2537 long-calls (i.e. called via a register). */
2538 static bool
2539 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2541 return false;
2544 /* Return true if calls to symbol-ref SYM should be treated as
2545 long-calls (i.e. called via a register). */
2546 bool
2547 aarch64_is_long_call_p (rtx sym)
2549 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2552 /* Return true if calls to symbol-ref SYM should not go through
2553 plt stubs. */
2555 bool
2556 aarch64_is_noplt_call_p (rtx sym)
2558 const_tree decl = SYMBOL_REF_DECL (sym);
2560 if (flag_pic
2561 && decl
2562 && (!flag_plt
2563 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2564 && !targetm.binds_local_p (decl))
2565 return true;
2567 return false;
2570 /* Emit an insn that's a simple single-set. Both the operands must be
2571 known to be valid. */
2572 inline static rtx_insn *
2573 emit_set_insn (rtx x, rtx y)
2575 return emit_insn (gen_rtx_SET (x, y));
2578 /* X and Y are two things to compare using CODE. Emit the compare insn and
2579 return the rtx for the CC register in the proper mode. */
2581 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2583 machine_mode cmp_mode = GET_MODE (x);
2584 machine_mode cc_mode;
2585 rtx cc_reg;
2587 if (cmp_mode == TImode)
2589 gcc_assert (code == NE);
2591 cc_mode = CCmode;
2592 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2594 rtx x_lo = operand_subword (x, 0, 0, TImode);
2595 rtx y_lo = operand_subword (y, 0, 0, TImode);
2596 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2598 rtx x_hi = operand_subword (x, 1, 0, TImode);
2599 rtx y_hi = operand_subword (y, 1, 0, TImode);
2600 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2601 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2602 GEN_INT (AARCH64_EQ)));
2604 else
2606 cc_mode = SELECT_CC_MODE (code, x, y);
2607 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2608 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2610 return cc_reg;
2613 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2615 static rtx
2616 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2617 machine_mode y_mode)
2619 if (y_mode == E_QImode || y_mode == E_HImode)
2621 if (CONST_INT_P (y))
2623 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2624 y_mode = SImode;
2626 else
2628 rtx t, cc_reg;
2629 machine_mode cc_mode;
2631 t = gen_rtx_ZERO_EXTEND (SImode, y);
2632 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2633 cc_mode = CC_SWPmode;
2634 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2635 emit_set_insn (cc_reg, t);
2636 return cc_reg;
2640 if (!aarch64_plus_operand (y, y_mode))
2641 y = force_reg (y_mode, y);
2643 return aarch64_gen_compare_reg (code, x, y);
2646 /* Generate conditional branch to LABEL, comparing X to 0 using CODE.
2647 Return the jump instruction. */
2649 static rtx
2650 aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
2651 rtx_code_label *label)
2653 if (aarch64_track_speculation)
2655 /* Emit an explicit compare instruction, so that we can correctly
2656 track the condition codes. */
2657 rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
2658 x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
2660 else
2661 x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);
2663 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
2664 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
2665 return gen_rtx_SET (pc_rtx, x);
2668 /* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
2669 If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
2670 it branches to LABEL when the bit is clear. */
2672 static rtx
2673 aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
2674 rtx_code_label *label)
2676 auto mode = GET_MODE (x);
2677 if (aarch64_track_speculation)
2679 auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
2680 emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
2681 rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
2682 rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
2683 return gen_condjump (x, cc_reg, label);
2685 return gen_aarch64_tb (code, mode, mode,
2686 x, gen_int_mode (bitnum, mode), label);
2689 /* Consider the operation:
2691 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2693 where:
2695 - CODE is [SU]MAX or [SU]MIN
2696 - OPERANDS[2] and OPERANDS[3] are constant integers
2697 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2698 - all operands have mode MODE
2700 Decide whether it is possible to implement the operation using:
2702 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2704 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2706 followed by:
2708 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2710 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2711 If GENERATE_P is true, also update OPERANDS as follows:
2713 OPERANDS[4] = -OPERANDS[3]
2714 OPERANDS[5] = the rtl condition representing <cond>
2715 OPERANDS[6] = <tmp>
2716 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2717 bool
2718 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
2720 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
2721 rtx dst = operands[0];
2722 rtx maxmin_op = operands[2];
2723 rtx add_op = operands[3];
2724 machine_mode mode = GET_MODE (dst);
2726 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2727 == (x >= y ? x : y) - z
2728 == (x > y ? x : y) - z
2729 == (x > y - 1 ? x : y) - z
2731 min (x, y) - z == (x <= y - 1 ? x : y) - z
2732 == (x <= y ? x : y) - z
2733 == (x < y ? x : y) - z
2734 == (x < y + 1 ? x : y) - z
2736 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2737 which x is compared with z. Set DIFF to y - z. Thus the supported
2738 combinations are as follows, with DIFF being the value after the ":":
2740 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2741 == x >= y ? x - y : 0 [z == y]
2742 == x > y ? x - y : 0 [z == y]
2743 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2745 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2746 == x <= y ? x - y : 0 [z == y]
2747 == x < y ? x - y : 0 [z == y]
2748 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2749 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
2750 auto add_val = rtx_mode_t (add_op, mode);
2751 auto sub_val = wi::neg (add_val);
2752 auto diff = wi::sub (maxmin_val, sub_val);
2753 if (!(diff == 0
2754 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
2755 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
2756 return false;
2758 if (!generate_p)
2759 return true;
2761 rtx_code cmp;
2762 switch (code)
2764 case SMAX:
2765 cmp = diff == 1 ? GT : GE;
2766 break;
2767 case UMAX:
2768 cmp = diff == 1 ? GTU : GEU;
2769 break;
2770 case SMIN:
2771 cmp = diff == -1 ? LT : LE;
2772 break;
2773 case UMIN:
2774 cmp = diff == -1 ? LTU : LEU;
2775 break;
2776 default:
2777 gcc_unreachable ();
2779 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
2781 operands[4] = immed_wide_int_const (sub_val, mode);
2782 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
2783 if (can_create_pseudo_p ())
2784 operands[6] = gen_reg_rtx (mode);
2785 else
2786 operands[6] = dst;
2787 operands[7] = immed_wide_int_const (diff, mode);
2789 return true;
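/* An illustrative instance of the scheme above: smax (x, 5) - 5 matches
   the "z == y" rows with DIFF == 0, so it can be emitted as

     subs    <tmp>, x, #5
     csel    <dst>, <tmp>, wzr, ge

   with OPERANDS[4] = 5, OPERANDS[5] = the GE condition on the CC register
   and OPERANDS[7] = 0 (CSEL).  */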
2793 /* Build the SYMBOL_REF for __tls_get_addr. */
2795 static GTY(()) rtx tls_get_addr_libfunc;
2798 aarch64_tls_get_addr (void)
2800 if (!tls_get_addr_libfunc)
2801 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2802 return tls_get_addr_libfunc;
2805 /* Return the TLS model to use for ADDR. */
2807 static enum tls_model
2808 tls_symbolic_operand_type (rtx addr)
2810 enum tls_model tls_kind = TLS_MODEL_NONE;
2811 poly_int64 offset;
2812 addr = strip_offset_and_salt (addr, &offset);
2813 if (SYMBOL_REF_P (addr))
2814 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2816 return tls_kind;
2819 /* We allow lo_sum expressions in our legitimate addresses
2820 so that combine can take care of combining addresses where
2821 necessary, but for generation purposes we generate the address
2822 as:
2823 RTL Absolute
2824 tmp = hi (symbol_ref); adrp x1, foo
2825 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
2828 PIC TLS
2829 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2830 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2831 bl __tls_get_addr
2834 Load TLS symbol, depending on TLS mechanism and TLS access model.
2836 Global Dynamic - Traditional TLS:
2837 adrp tmp, :tlsgd:imm
2838 add dest, tmp, #:tlsgd_lo12:imm
2839 bl __tls_get_addr
2841 Global Dynamic - TLS Descriptors:
2842 adrp dest, :tlsdesc:imm
2843 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2844 add dest, dest, #:tlsdesc_lo12:imm
2845 blr tmp
2846 mrs tp, tpidr_el0
2847 add dest, dest, tp
2849 Initial Exec:
2850 mrs tp, tpidr_el0
2851 adrp tmp, :gottprel:imm
2852 ldr dest, [tmp, #:gottprel_lo12:imm]
2853 add dest, dest, tp
2855 Local Exec:
2856 mrs tp, tpidr_el0
2857 add t0, tp, #:tprel_hi12:imm, lsl #12
2858 add t0, t0, #:tprel_lo12_nc:imm
2861 static void
2862 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2863 enum aarch64_symbol_type type)
2865 switch (type)
2867 case SYMBOL_SMALL_ABSOLUTE:
2869 /* In ILP32, the mode of dest can be either SImode or DImode. */
2870 rtx tmp_reg = dest;
2871 machine_mode mode = GET_MODE (dest);
2873 gcc_assert (mode == Pmode || mode == ptr_mode);
2875 if (can_create_pseudo_p ())
2876 tmp_reg = gen_reg_rtx (mode);
2878 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
2879 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2880 return;
2883 case SYMBOL_TINY_ABSOLUTE:
2884 emit_insn (gen_rtx_SET (dest, imm));
2885 return;
2887 case SYMBOL_SMALL_GOT_28K:
2889 machine_mode mode = GET_MODE (dest);
2890 rtx gp_rtx = pic_offset_table_rtx;
2891 rtx insn;
2892 rtx mem;
2894 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2895 here before RTL expansion. The tree IVOPTs pass generates RTL
2896 patterns to decide rtx costs, in which case pic_offset_table_rtx is
2897 not initialized. In that case there is no need to generate the first
2898 adrp instruction, as the final cost for global variable access is
2899 one instruction. */
2900 if (gp_rtx != NULL)
2902 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2903 use the page base as the GOT base, the first page may be wasted;
2904 in the worst case there is only 28K of space for the GOT).
2906 The generated instruction sequence for accessing a global variable is:
2909 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2911 Only one instruction is needed, but we must initialize
2912 pic_offset_table_rtx properly. We generate an initialization insn for
2913 every global access and allow CSE to remove all redundant ones.
2915 The final instruction sequence will look like the following
2916 for multiple global variable accesses:
2918 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2920 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2921 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2922 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2923 ... */
2925 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2926 crtl->uses_pic_offset_table = 1;
2927 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2929 if (mode != GET_MODE (gp_rtx))
2930 gp_rtx = gen_lowpart (mode, gp_rtx);
2934 if (mode == ptr_mode)
2936 if (mode == DImode)
2937 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2938 else
2939 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2941 mem = XVECEXP (SET_SRC (insn), 0, 0);
2943 else
2945 gcc_assert (mode == Pmode);
2947 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2948 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2951 /* The operand is expected to be a MEM. Whenever the related insn
2952 pattern changes, the code above that calculates MEM should be
2953 updated. */
2954 gcc_assert (MEM_P (mem));
2955 MEM_READONLY_P (mem) = 1;
2956 MEM_NOTRAP_P (mem) = 1;
2957 emit_insn (insn);
2958 return;
2961 case SYMBOL_SMALL_GOT_4G:
2962 emit_insn (gen_rtx_SET (dest, imm));
2963 return;
2965 case SYMBOL_SMALL_TLSGD:
2967 rtx_insn *insns;
2968 /* The return type of __tls_get_addr is the C pointer type
2969 so use ptr_mode. */
2970 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
2971 rtx tmp_reg = dest;
2973 if (GET_MODE (dest) != ptr_mode)
2974 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
2976 start_sequence ();
2977 if (ptr_mode == SImode)
2978 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2979 else
2980 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2981 insns = get_insns ();
2982 end_sequence ();
2984 RTL_CONST_CALL_P (insns) = 1;
2985 emit_libcall_block (insns, tmp_reg, result, imm);
2986 /* Convert back to the mode of the dest adding a zero_extend
2987 from SImode (ptr_mode) to DImode (Pmode). */
2988 if (dest != tmp_reg)
2989 convert_move (dest, tmp_reg, true);
2990 return;
2993 case SYMBOL_SMALL_TLSDESC:
2995 machine_mode mode = GET_MODE (dest);
2996 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2997 rtx tp;
2999 gcc_assert (mode == Pmode || mode == ptr_mode);
3001 /* In ILP32, the got entry is always of SImode size. Unlike
3002 small GOT, the dest is fixed at reg 0. */
3003 if (TARGET_ILP32)
3004 emit_insn (gen_tlsdesc_small_si (imm));
3005 else
3006 emit_insn (gen_tlsdesc_small_di (imm));
3007 tp = aarch64_load_tp (NULL);
3009 if (mode != Pmode)
3010 tp = gen_lowpart (mode, tp);
3012 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3013 if (REG_P (dest))
3014 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3015 return;
3018 case SYMBOL_SMALL_TLSIE:
3020 /* In ILP32, the mode of dest can be either SImode or DImode,
3021 while the got entry is always of SImode size. The mode of
3022 dest depends on how dest is used: if dest is assigned to a
3023 pointer (e.g. in the memory), it has SImode; it may have
3024 DImode if dest is dereferenced to access memory.
3025 This is why we have to handle three different tlsie_small
3026 patterns here (two patterns for ILP32). */
3027 machine_mode mode = GET_MODE (dest);
3028 rtx tmp_reg = gen_reg_rtx (mode);
3029 rtx tp = aarch64_load_tp (NULL);
3031 if (mode == ptr_mode)
3033 if (mode == DImode)
3034 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3035 else
3037 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3038 tp = gen_lowpart (mode, tp);
3041 else
3043 gcc_assert (mode == Pmode);
3044 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3047 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3048 if (REG_P (dest))
3049 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3050 return;
3053 case SYMBOL_TLSLE12:
3054 case SYMBOL_TLSLE24:
3055 case SYMBOL_TLSLE32:
3056 case SYMBOL_TLSLE48:
3058 machine_mode mode = GET_MODE (dest);
3059 rtx tp = aarch64_load_tp (NULL);
3061 if (mode != Pmode)
3062 tp = gen_lowpart (mode, tp);
3064 switch (type)
3066 case SYMBOL_TLSLE12:
3067 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3068 (dest, tp, imm));
3069 break;
3070 case SYMBOL_TLSLE24:
3071 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3072 (dest, tp, imm));
3073 break;
3074 case SYMBOL_TLSLE32:
3075 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3076 (dest, imm));
3077 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3078 (dest, dest, tp));
3079 break;
3080 case SYMBOL_TLSLE48:
3081 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3082 (dest, imm));
3083 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3084 (dest, dest, tp));
3085 break;
3086 default:
3087 gcc_unreachable ();
3090 if (REG_P (dest))
3091 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3092 return;
3095 case SYMBOL_TINY_GOT:
3097 rtx insn;
3098 machine_mode mode = GET_MODE (dest);
3100 if (mode == ptr_mode)
3101 insn = gen_ldr_got_tiny (mode, dest, imm);
3102 else
3104 gcc_assert (mode == Pmode);
3105 insn = gen_ldr_got_tiny_sidi (dest, imm);
3108 emit_insn (insn);
3109 return;
3112 case SYMBOL_TINY_TLSIE:
3114 machine_mode mode = GET_MODE (dest);
3115 rtx tp = aarch64_load_tp (NULL);
3117 if (mode == ptr_mode)
3119 if (mode == DImode)
3120 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3121 else
3123 tp = gen_lowpart (mode, tp);
3124 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3127 else
3129 gcc_assert (mode == Pmode);
3130 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3133 if (REG_P (dest))
3134 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3135 return;
3138 default:
3139 gcc_unreachable ();
3143 /* Emit a move from SRC to DEST. Assume that the move expanders can
3144 handle all moves if !can_create_pseudo_p (). The distinction is
3145 important because, unlike emit_move_insn, the move expanders know
3146 how to force Pmode objects into the constant pool even when the
3147 constant pool address is not itself legitimate. */
3148 static rtx
3149 aarch64_emit_move (rtx dest, rtx src)
3151 return (can_create_pseudo_p ()
3152 ? emit_move_insn (dest, src)
3153 : emit_move_insn_1 (dest, src));
3156 /* Apply UNOPTAB to OP and store the result in DEST. */
3158 static void
3159 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3161 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3162 if (dest != tmp)
3163 emit_move_insn (dest, tmp);
3166 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3168 static void
3169 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3171 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3172 OPTAB_DIRECT);
3173 if (dest != tmp)
3174 emit_move_insn (dest, tmp);
3177 /* Split a move from SRC to DST into two moves of mode SINGLE_MODE. */
3179 void
3180 aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
3182 machine_mode mode = GET_MODE (dst);
3184 rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
3185 rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
3186 GET_MODE_SIZE (single_mode));
3187 rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
3188 rtx src1 = simplify_gen_subreg (single_mode, src, mode,
3189 GET_MODE_SIZE (single_mode));
3191 /* At most one pairing may overlap. */
3192 if (reg_overlap_mentioned_p (dst0, src1))
3194 aarch64_emit_move (dst1, src1);
3195 aarch64_emit_move (dst0, src0);
3197 else
3199 aarch64_emit_move (dst0, src0);
3200 aarch64_emit_move (dst1, src1);
3204 /* Split a 128-bit move operation into two 64-bit move operations,
3205 taking care to handle partial overlap of register to register
3206 copies. Special cases are needed when moving between GP regs and
3207 FP regs. SRC can be a register, constant or memory; DST a register
3208 or memory. If either operand is memory it must not have any side
3209 effects. */
3210 void
3211 aarch64_split_128bit_move (rtx dst, rtx src)
3213 machine_mode mode = GET_MODE (dst);
3215 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
3216 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3217 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3219 if (REG_P (dst) && REG_P (src))
3221 int src_regno = REGNO (src);
3222 int dst_regno = REGNO (dst);
3224 /* Handle FP <-> GP regs. */
3225 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3227 rtx src_lo = gen_lowpart (word_mode, src);
3228 rtx src_hi = gen_highpart (word_mode, src);
3230 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3231 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3232 return;
3234 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3236 rtx dst_lo = gen_lowpart (word_mode, dst);
3237 rtx dst_hi = gen_highpart (word_mode, dst);
3239 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3240 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3241 return;
3245 aarch64_split_double_move (dst, src, word_mode);
3248 /* Return true if we should split a move from 128-bit value SRC
3249 to 128-bit register DEST. */
3251 bool
3252 aarch64_split_128bit_move_p (rtx dst, rtx src)
3254 if (FP_REGNUM_P (REGNO (dst)))
3255 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3256 /* All moves to GPRs need to be split. */
3257 return true;
3260 /* Split a complex SIMD move. */
3262 void
3263 aarch64_split_simd_move (rtx dst, rtx src)
3265 machine_mode src_mode = GET_MODE (src);
3266 machine_mode dst_mode = GET_MODE (dst);
3268 gcc_assert (VECTOR_MODE_P (dst_mode));
3270 if (REG_P (dst) && REG_P (src))
3272 gcc_assert (VECTOR_MODE_P (src_mode));
3273 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3277 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3278 The semantics are those of svreinterpret rather than those of subregs;
3279 see the comment at the head of aarch64-sve.md for details about the
3280 difference. */
3283 aarch64_sve_reinterpret (machine_mode mode, rtx x)
3285 if (GET_MODE (x) == mode)
3286 return x;
3288 /* can_change_mode_class must only return true if subregs and svreinterprets
3289 have the same semantics. */
3290 if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
3291 return force_lowpart_subreg (mode, x, GET_MODE (x));
3293 rtx res = gen_reg_rtx (mode);
3294 x = force_reg (GET_MODE (x), x);
3295 emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
3296 return res;
3299 bool
3300 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3301 machine_mode ymode, rtx y)
3303 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3304 gcc_assert (r != NULL);
3305 return rtx_equal_p (x, r);
3308 /* Return TARGET if it is nonnull and a register of mode MODE.
3309 Otherwise, return a fresh register of mode MODE if we can,
3310 or TARGET reinterpreted as MODE if we can't. */
3312 static rtx
3313 aarch64_target_reg (rtx target, machine_mode mode)
3315 if (target && REG_P (target) && GET_MODE (target) == mode)
3316 return target;
3317 if (!can_create_pseudo_p ())
3319 gcc_assert (target);
3320 return gen_lowpart (mode, target);
3322 return gen_reg_rtx (mode);
3325 /* Return a register that contains the constant in BUILDER, given that
3326 the constant is a legitimate move operand. Use TARGET as the register
3327 if it is nonnull and convenient. */
3329 static rtx
3330 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3332 rtx src = builder.build ();
3333 target = aarch64_target_reg (target, GET_MODE (src));
3334 emit_insn (gen_rtx_SET (target, src));
3335 return target;
3338 static rtx
3339 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3341 if (can_create_pseudo_p ())
3342 return force_reg (mode, value);
3343 else
3345 gcc_assert (x);
3346 aarch64_emit_move (x, value);
3347 return x;
3351 /* Return true if predicate value X is a constant in which every element
3352 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3353 value, i.e. as a predicate in which all bits are significant. */
3355 static bool
3356 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3358 if (!CONST_VECTOR_P (x))
3359 return false;
3361 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3362 GET_MODE_NUNITS (GET_MODE (x)));
3363 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3364 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3365 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3367 unsigned int nelts = const_vector_encoded_nelts (x);
3368 for (unsigned int i = 0; i < nelts; ++i)
3370 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3371 if (!CONST_INT_P (elt))
3372 return false;
3374 builder.quick_push (elt);
3375 for (unsigned int j = 1; j < factor; ++j)
3376 builder.quick_push (const0_rtx);
3378 builder.finalize ();
3379 return true;
3382 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3383 widest predicate element size it can have (that is, the largest size
3384 for which each element would still be 0 or 1). */
3386 unsigned int
3387 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3389 /* Start with the most optimistic assumption: that we only need
3390 one bit per pattern. This is what we will use if only the first
3391 bit in each pattern is ever set. */
3392 unsigned int mask = GET_MODE_SIZE (DImode);
3393 mask |= builder.npatterns ();
3395 /* Look for set bits. */
3396 unsigned int nelts = builder.encoded_nelts ();
3397 for (unsigned int i = 1; i < nelts; ++i)
3398 if (INTVAL (builder.elt (i)) != 0)
3400 if (i & 1)
3401 return 1;
3402 mask |= i;
3404 return mask & -mask;
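/* For example, a constant in which only every fourth bit is set (the .S
   PTRUE built by aarch64_ptrue_all (4) below) gives 4, whereas any
   constant with a set bit at an odd index immediately gives 1.  */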
3407 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3408 return that predicate mode, otherwise return opt_machine_mode (). */
3410 opt_machine_mode
3411 aarch64_ptrue_all_mode (rtx x)
3413 gcc_assert (GET_MODE (x) == VNx16BImode);
3414 if (!CONST_VECTOR_P (x)
3415 || !CONST_VECTOR_DUPLICATE_P (x)
3416 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3417 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3418 return opt_machine_mode ();
3420 unsigned int nelts = const_vector_encoded_nelts (x);
3421 for (unsigned int i = 1; i < nelts; ++i)
3422 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3423 return opt_machine_mode ();
3425 return aarch64_sve_pred_mode (nelts);
3428 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3429 that the constant would have with predicate element size ELT_SIZE
3430 (ignoring the upper bits in each element) and return:
3432 * -1 if all bits are set
3433 * N if the predicate has N leading set bits followed by all clear bits
3434 * 0 if the predicate does not have any of these forms. */
3437 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3438 unsigned int elt_size)
3440 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3441 followed by set bits. */
3442 if (builder.nelts_per_pattern () == 3)
3443 return 0;
3445 /* Skip over leading set bits. */
3446 unsigned int nelts = builder.encoded_nelts ();
3447 unsigned int i = 0;
3448 for (; i < nelts; i += elt_size)
3449 if (INTVAL (builder.elt (i)) == 0)
3450 break;
3451 unsigned int vl = i / elt_size;
3453 /* Check for the all-true case. */
3454 if (i == nelts)
3455 return -1;
3457 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3458 repeating pattern of set bits followed by clear bits. */
3459 if (builder.nelts_per_pattern () != 2)
3460 return 0;
3462 /* We have a "foreground" value and a duplicated "background" value.
3463 If the background might repeat and the last set bit belongs to it,
3464 we might have set bits followed by clear bits followed by set bits. */
3465 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3466 return 0;
3468 /* Make sure that the rest are all clear. */
3469 for (; i < nelts; i += elt_size)
3470 if (INTVAL (builder.elt (i)) != 0)
3471 return 0;
3473 return vl;
3476 /* See if there is an svpattern that encodes an SVE predicate of mode
3477 PRED_MODE in which the first VL bits are set and the rest are clear.
3478 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3479 A VL of -1 indicates an all-true vector. */
3481 aarch64_svpattern
3482 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3484 if (vl < 0)
3485 return AARCH64_SV_ALL;
3487 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3488 return AARCH64_NUM_SVPATTERNS;
3490 if (vl >= 1 && vl <= 8)
3491 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3493 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3494 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3496 int max_vl;
3497 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3499 if (vl == (max_vl / 3) * 3)
3500 return AARCH64_SV_MUL3;
3501 /* These would only trigger for non-power-of-2 lengths. */
3502 if (vl == (max_vl & -4))
3503 return AARCH64_SV_MUL4;
3504 if (vl == (1 << floor_log2 (max_vl)))
3505 return AARCH64_SV_POW2;
3506 if (vl == max_vl)
3507 return AARCH64_SV_ALL;
3509 return AARCH64_NUM_SVPATTERNS;
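/* For example, a VL of 3 gives AARCH64_SV_VL3 and a VL of 64 gives
   AARCH64_SV_VL64 (provided the predicate mode has at least that many
   elements), a VL of -1 gives AARCH64_SV_ALL, and a VL known to exceed
   the number of elements gives AARCH64_NUM_SVPATTERNS.  */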
3512 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3513 bits has the lowest bit set and the upper bits clear. This is the
3514 VNx16BImode equivalent of a PTRUE for controlling elements of
3515 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3516 all bits are significant, even the upper zeros. */
3519 aarch64_ptrue_all (unsigned int elt_size)
3521 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3522 builder.quick_push (const1_rtx);
3523 for (unsigned int i = 1; i < elt_size; ++i)
3524 builder.quick_push (const0_rtx);
3525 return builder.build ();
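/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BImode
   constant { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the canonical PTRUE for
   .S elements viewed as a predicate of bytes.  */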
3528 /* Return an all-true predicate register of mode MODE. */
3531 aarch64_ptrue_reg (machine_mode mode)
3533 gcc_assert (aarch64_sve_pred_mode_p (mode));
3534 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3535 return gen_lowpart (mode, reg);
3538 /* Return an all-false predicate register of mode MODE. */
3541 aarch64_pfalse_reg (machine_mode mode)
3543 gcc_assert (aarch64_sve_pred_mode_p (mode));
3544 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3545 return gen_lowpart (mode, reg);
3548 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3549 for it. PRED2[0] is the predicate for the instruction whose result
3550 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3551 for it. Return true if we can prove that the two predicates are
3552 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3553 with PRED1[0] without changing behavior. */
3555 bool
3556 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3558 machine_mode mode = GET_MODE (pred1[0]);
3559 gcc_assert (aarch64_sve_pred_mode_p (mode)
3560 && mode == GET_MODE (pred2[0])
3561 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3562 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3564 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3565 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3566 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3567 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3568 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3571 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3572 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3573 Use TARGET as the target register if nonnull and convenient. */
3575 static rtx
3576 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3577 machine_mode data_mode, rtx op1, rtx op2)
3579 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3580 expand_operand ops[5];
3581 create_output_operand (&ops[0], target, pred_mode);
3582 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3583 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3584 create_input_operand (&ops[3], op1, data_mode);
3585 create_input_operand (&ops[4], op2, data_mode);
3586 expand_insn (icode, 5, ops);
3587 return ops[0].value;
3590 /* Use a comparison to convert integer vector SRC into MODE, which is
3591 the corresponding SVE predicate mode. Use TARGET for the result
3592 if it's nonnull and convenient. */
3595 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3597 machine_mode src_mode = GET_MODE (src);
3598 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3599 src, CONST0_RTX (src_mode));
3602 /* Return the assembly token for svprfop value PRFOP. */
3604 static const char *
3605 svprfop_token (enum aarch64_svprfop prfop)
3607 switch (prfop)
3609 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3610 AARCH64_FOR_SVPRFOP (CASE)
3611 #undef CASE
3612 case AARCH64_NUM_SVPRFOPS:
3613 break;
3615 gcc_unreachable ();
3618 /* Return the assembly string for an SVE prefetch operation with
3619 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3620 and that SUFFIX is the format for the remaining operands. */
3622 char *
3623 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3624 const char *suffix)
3626 static char buffer[128];
3627 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3628 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3629 mnemonic, svprfop_token (prfop), suffix);
3630 gcc_assert (written < sizeof (buffer));
3631 return buffer;
3634 /* Check whether we can calculate the number of elements in PATTERN
3635 at compile time, given that there are NELTS_PER_VQ elements per
3636 128-bit block. Return the value if so, otherwise return -1. */
3638 HOST_WIDE_INT
3639 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3641 unsigned int vl, const_vg;
3642 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3643 vl = 1 + (pattern - AARCH64_SV_VL1);
3644 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3645 vl = 16 << (pattern - AARCH64_SV_VL16);
3646 else if (aarch64_sve_vg.is_constant (&const_vg))
3648 /* There are two vector granules per quadword. */
3649 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3650 switch (pattern)
3652 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3653 case AARCH64_SV_MUL4: return nelts & -4;
3654 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3655 case AARCH64_SV_ALL: return nelts;
3656 default: gcc_unreachable ();
3659 else
3660 return -1;
3662 /* There are two vector granules per quadword. */
3663 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3664 if (known_le (vl, nelts_all))
3665 return vl;
3667 /* Requesting more elements than are available results in a PFALSE. */
3668 if (known_gt (vl, nelts_all))
3669 return 0;
3671 return -1;
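/* Worked example, assuming -msve-vector-bits=512 and .S elements
   (NELTS_PER_VQ == 4): const_vg == 8, so nelts == 16.  POW2 then folds
   to 16, MUL3 to 15, MUL4 to 16 and VL8 to 8, while VL32 asks for more
   elements than exist and folds to 0.  */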
3674 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3675 by the number of 128-bit quadwords in an SVE vector. */
3677 static bool
3678 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
3680 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3681 return (IN_RANGE (factor, 2, 16 * 16)
3682 && (factor & 1) == 0
3683 && factor <= 16 * (factor & -factor));
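/* For example, FACTOR == 48 passes: it is even, in range, and
   48 <= 16 * (48 & -48) == 256, corresponding to CNTB with MUL #3
   (or equivalently CNTW with MUL #12).  FACTOR == 34 fails because it
   would need a multiplier of 17 with .D elements.  */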
3686 /* Return true if we can move VALUE into a register using a single
3687 CNT[BHWD] instruction. */
3689 static bool
3690 aarch64_sve_cnt_immediate_p (poly_int64 value)
3692 HOST_WIDE_INT factor = value.coeffs[0];
3693 return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
3696 /* Likewise for rtx X. */
3698 bool
3699 aarch64_sve_cnt_immediate_p (rtx x)
3701 poly_int64 value;
3702 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3705 /* Return the asm string for an instruction with a CNT-like vector size
3706 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3707 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3708 first part of the operands template (the part that comes before the
3709 vector size itself). PATTERN is the pattern to use. FACTOR is the
3710 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3711 in each quadword. If it is zero, we can use any element size. */
3713 static char *
3714 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3715 aarch64_svpattern pattern,
3716 unsigned int factor,
3717 unsigned int nelts_per_vq)
3719 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3721 if (nelts_per_vq == 0)
3722 /* There is some overlap in the ranges of the four CNT instructions.
3723 Here we always use the smallest possible element size, so that the
3724 multiplier is 1 wherever possible. */
3725 nelts_per_vq = factor & -factor;
3726 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3727 gcc_assert (IN_RANGE (shift, 1, 4));
3728 char suffix = "dwhb"[shift - 1];
3730 factor >>= shift;
3731 unsigned int written;
3732 if (pattern == AARCH64_SV_ALL && factor == 1)
3733 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3734 prefix, suffix, operands);
3735 else if (factor == 1)
3736 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3737 prefix, suffix, operands, svpattern_token (pattern));
3738 else
3739 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3740 prefix, suffix, operands, svpattern_token (pattern),
3741 factor);
3742 gcc_assert (written < sizeof (buffer));
3743 return buffer;
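/* For instance, with PREFIX "cnt", OPERANDS "%x0" and PATTERN
   AARCH64_SV_ALL: FACTOR == 2 and NELTS_PER_VQ == 0 give "cntd\t%x0",
   while FACTOR == 32 gives "cntb\t%x0, all, mul #2" (the element size
   is chosen so that the multiplier stays as small as possible).  */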
3746 /* Return the asm string for an instruction with a CNT-like vector size
3747 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3748 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3749 first part of the operands template (the part that comes before the
3750 vector size itself). X is the value of the vector size operand,
3751 as a polynomial integer rtx; we need to convert this into an "all"
3752 pattern with a multiplier. */
3754 char *
3755 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3756 rtx x)
3758 poly_int64 value = rtx_to_poly_int64 (x);
3759 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3760 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3761 value.coeffs[1], 0);
3764 /* Return the asm string for an instruction with a CNT-like vector size
3765 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3766 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3767 first part of the operands template (the part that comes before the
3768 vector size itself). CNT_PAT[0..2] are the operands of the
3769 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3771 char *
3772 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3773 const char *operands, rtx *cnt_pat)
3775 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3776 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3777 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3778 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3779 factor, nelts_per_vq);
3782 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3784 bool
3785 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3787 poly_int64 value;
3788 return (poly_int_rtx_p (x, &value)
3789 && (aarch64_sve_cnt_immediate_p (value)
3790 || aarch64_sve_cnt_immediate_p (-value)));
3793 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3794 operand 0. */
3796 char *
3797 aarch64_output_sve_scalar_inc_dec (rtx offset)
3799 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3800 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3801 if (offset_value.coeffs[1] > 0)
3802 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3803 offset_value.coeffs[1], 0);
3804 else
3805 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3806 -offset_value.coeffs[1], 0);
3809 /* Return true if a single RDVL instruction can multiply FACTOR by the
3810 number of 128-bit quadwords in an SVE vector. This is also the
3811 range of ADDVL. */
3813 static bool
3814 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
3816 return (multiple_p (factor, 16)
3817 && IN_RANGE (factor, -32 * 16, 31 * 16));
3820 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
3821 of quadwords in an SVE vector. */
3823 static bool
3824 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
3826 return (multiple_p (factor, 2)
3827 && IN_RANGE (factor, -32 * 2, 31 * 2));
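/* For example, FACTOR == 48 is in ADDVL range (48 / 16 == 3),
   FACTOR == 10 is only in ADDPL range (10 / 2 == 5), and
   FACTOR == 1000 is out of range for both.  */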
3830 /* Return true if we can move VALUE into a register using a single
3831 RDVL instruction. */
3833 static bool
3834 aarch64_sve_rdvl_immediate_p (poly_int64 value)
3836 HOST_WIDE_INT factor = value.coeffs[0];
3837 return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
3840 /* Likewise for rtx X. */
3842 bool
3843 aarch64_sve_rdvl_immediate_p (rtx x)
3845 poly_int64 value;
3846 return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
3849 /* Return the asm string for moving RDVL immediate OFFSET into register
3850 operand 0. */
3852 char *
3853 aarch64_output_sve_rdvl (rtx offset)
3855 static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
3856 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3857 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
3859 int factor = offset_value.coeffs[1];
3860 snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
3861 return buffer;
3864 /* Return true if we can add VALUE to a register using a single ADDVL
3865 or ADDPL instruction. */
3867 static bool
3868 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3870 HOST_WIDE_INT factor = value.coeffs[0];
3871 if (factor == 0 || value.coeffs[1] != factor)
3872 return false;
3873 return (aarch64_sve_rdvl_addvl_factor_p (factor)
3874 || aarch64_sve_addpl_factor_p (factor));
3877 /* Likewise for rtx X. */
3879 bool
3880 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3882 poly_int64 value;
3883 return (poly_int_rtx_p (x, &value)
3884 && aarch64_sve_addvl_addpl_immediate_p (value));
3887 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3888 to operand 1 and storing the result in operand 0. */
3890 char *
3891 aarch64_output_sve_addvl_addpl (rtx offset)
3893 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3894 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3895 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3897 int factor = offset_value.coeffs[1];
3898 if ((factor & 15) == 0)
3899 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3900 else
3901 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3902 return buffer;
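/* For instance, a factor of 48 (three vector lengths) produces
   "addvl\t%x0, %x1, #3", while a factor of 10 (five predicate lengths)
   produces "addpl\t%x0, %x1, #5".  */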
3905 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3906 instruction. If it is, store the number of elements in each vector
3907 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3908 factor in *FACTOR_OUT (if nonnull). */
3910 bool
3911 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3912 unsigned int *nelts_per_vq_out)
3914 rtx elt;
3915 poly_int64 value;
3917 if (!const_vec_duplicate_p (x, &elt)
3918 || !poly_int_rtx_p (elt, &value))
3919 return false;
3921 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3922 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3923 /* There's no vector INCB. */
3924 return false;
3926 HOST_WIDE_INT factor = value.coeffs[0];
3927 if (value.coeffs[1] != factor)
3928 return false;
3930 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3931 if ((factor % nelts_per_vq) != 0
3932 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3933 return false;
3935 if (factor_out)
3936 *factor_out = factor;
3937 if (nelts_per_vq_out)
3938 *nelts_per_vq_out = nelts_per_vq;
3939 return true;
3942 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3943 instruction. */
3945 bool
3946 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3948 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3951 /* Return the asm template for an SVE vector INC or DEC instruction.
3952 OPERANDS gives the operands before the vector count and X is the
3953 value of the vector count operand itself. */
3955 char *
3956 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3958 int factor;
3959 unsigned int nelts_per_vq;
3960 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3961 gcc_unreachable ();
3962 if (factor < 0)
3963 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3964 -factor, nelts_per_vq);
3965 else
3966 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3967 factor, nelts_per_vq);
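/* For example, a VNx4SI duplicate of the value 2 * CNTW (FACTOR == 8,
   NELTS_PER_VQ == 4) produces "incw\t<operands>, all, mul #2", and its
   negation produces the corresponding "decw" form.  */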
3970 /* Return a constant that represents FACTOR multiplied by the
3971 number of 128-bit quadwords in an SME vector. ISA_MODE is the
3972 ISA mode in which the calculation is being performed. */
3974 rtx
3975 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
3976 aarch64_feature_flags isa_mode)
3978 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
3979 if (isa_mode & AARCH64_FL_SM_ON)
3980 /* We're in streaming mode, so we can use normal poly-int values. */
3981 return gen_int_mode ({ factor, factor }, mode);
3983 rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
3984 rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
3985 return gen_rtx_CONST (mode, unspec);
3988 /* Return true if X is a constant that represents some number Y
3989 multiplied by the number of quadwords in an SME vector. Store this Y
3990 in *FACTOR if so. */
3992 static bool
3993 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
3995 if (!TARGET_SME || GET_CODE (x) != CONST)
3996 return false;
3998 x = XEXP (x, 0);
3999 if (GET_CODE (x) != UNSPEC
4000 || XINT (x, 1) != UNSPEC_SME_VQ
4001 || XVECLEN (x, 0) != 1)
4002 return false;
4004 x = XVECEXP (x, 0, 0);
4005 if (!CONST_INT_P (x))
4006 return false;
4008 *factor = INTVAL (x);
4009 return true;
4012 /* Return true if X is a constant that represents some number Y
4013 multiplied by the number of quadwords in an SME vector, and if
4014 that Y is in the range of RDSVL. */
4016 bool
4017 aarch64_rdsvl_immediate_p (const_rtx x)
4019 HOST_WIDE_INT factor;
4020 return (aarch64_sme_vq_unspec_p (x, &factor)
4021 && aarch64_sve_rdvl_addvl_factor_p (factor));
4024 /* Return the asm string for an RDSVL instruction that calculates X,
4025 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
4027 char *
4028 aarch64_output_rdsvl (const_rtx x)
4030 gcc_assert (aarch64_rdsvl_immediate_p (x));
4031 static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4032 x = XVECEXP (XEXP (x, 0), 0, 0);
4033 snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
4034 (int) INTVAL (x) / 16);
4035 return buffer;
4038 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
4040 bool
4041 aarch64_addsvl_addspl_immediate_p (const_rtx x)
4043 HOST_WIDE_INT factor;
4044 return (aarch64_sme_vq_unspec_p (x, &factor)
4045 && (aarch64_sve_rdvl_addvl_factor_p (factor)
4046 || aarch64_sve_addpl_factor_p (factor)));
4049 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4050 Return the asm string for the associated instruction. */
4052 char *
4053 aarch64_output_addsvl_addspl (rtx x)
4055 static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4056 HOST_WIDE_INT factor;
4057 if (!aarch64_sme_vq_unspec_p (x, &factor))
4058 gcc_unreachable ();
4059 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4060 snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4061 (int) factor / 16);
4062 else if (aarch64_sve_addpl_factor_p (factor))
4063 snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4064 (int) factor / 2);
4065 else
4066 gcc_unreachable ();
4067 return buffer;
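/* For example, a factor of 32 gives "addsvl\t%x0, %x1, #2" and a factor
   of 10 gives "addspl\t%x0, %x1, #5".  */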
4070 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4072 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4074 0x0000000100000001ull,
4075 0x0001000100010001ull,
4076 0x0101010101010101ull,
4077 0x1111111111111111ull,
4078 0x5555555555555555ull,
4083 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4084 static bool
4085 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4087 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4088 int bits;
4090 /* Check for a single sequence of one bits and return quickly if so.
4091 The special cases of all ones and all zeroes return false. */
4092 tmp = val + (val & -val);
4094 if (tmp == (tmp & -tmp))
4095 return (val + 1) > 1;
4097 /* Invert if the immediate doesn't start with a zero bit - this means we
4098 only need to search for sequences of one bits. */
4099 if (val & 1)
4100 val = ~val;
4102 /* Find the first set bit and set tmp to val with the first sequence of one
4103 bits removed. Return success if there is a single sequence of ones. */
4104 first_one = val & -val;
4105 tmp = val & (val + first_one);
4107 if (tmp == 0)
4108 return true;
4110 /* Find the next set bit and compute the difference in bit position. */
4111 next_one = tmp & -tmp;
4112 bits = clz_hwi (first_one) - clz_hwi (next_one);
4113 mask = val ^ tmp;
4115 /* Check the bit position difference is a power of 2, and that the first
4116 sequence of one bits fits within 'bits' bits. */
4117 if ((mask >> bits) != 0 || bits != (bits & -bits))
4118 return false;
4120 /* Check the sequence of one bits is repeated 64/bits times. */
4121 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
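/* Worked example: VAL == 0xff00ff00ff00ff00.  There is more than one
   run of ones and the value already starts with a zero bit, so no
   inversion is needed.  The first run covers bits [8, 15] and the next
   run starts at bit 24, giving BITS == 16 and MASK == 0xff00.  Since
   0xff00 * 0x0001000100010001 == VAL, the value is a valid bitmask
   immediate and the function returns true.  */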
4125 /* Return true if VAL is a valid bitmask immediate for MODE. */
4126 bool
4127 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4129 if (mode == DImode)
4130 return aarch64_bitmask_imm (val);
4132 if (mode == SImode)
4133 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4135 /* Replicate small immediates to fit 64 bits. */
4136 int size = GET_MODE_UNIT_PRECISION (mode);
4137 val &= (HOST_WIDE_INT_1U << size) - 1;
4138 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4140 return aarch64_bitmask_imm (val);
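/* For instance, an HImode VAL of 0x00f0 is replicated to
   0x00f000f000f000f0 before the 64-bit check, which accepts it
   (a run of four ones repeated every 16 bits).  */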
4144 /* Return true if the immediate VAL can be a bitmask immediate
4145 by changing the given MASK bits in VAL to zeroes, ones or bits
4146 from the other half of VAL. Return the new immediate in VAL2. */
4147 static inline bool
4148 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4149 unsigned HOST_WIDE_INT &val2,
4150 unsigned HOST_WIDE_INT mask)
4152 val2 = val & ~mask;
4153 if (val2 != val && aarch64_bitmask_imm (val2))
4154 return true;
4155 val2 = val | mask;
4156 if (val2 != val && aarch64_bitmask_imm (val2))
4157 return true;
4158 val = val & ~mask;
4159 val2 = val | (((val >> 32) | (val << 32)) & mask);
4160 if (val2 != val && aarch64_bitmask_imm (val2))
4161 return true;
4162 val2 = val | (((val >> 16) | (val << 48)) & mask);
4163 if (val2 != val && aarch64_bitmask_imm (val2))
4164 return true;
4165 return false;
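/* For example, VAL == 0x00000000ffff1234 with MASK == 0xffff: clearing
   the masked bits gives 0xffff0000, a single run of ones and therefore
   a valid bitmask immediate, so VAL2 is set to 0xffff0000 and the
   function returns true.  */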
4169 /* Return true if VAL is a valid MOVZ immediate. */
4170 static inline bool
4171 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4173 return (val >> (ctz_hwi (val) & 48)) < 65536;
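/* For example, 0x0000123400000000 is a valid MOVZ immediate: the
   trailing zero count is 34, which is clamped down to the hword
   boundary 32, and 0x1234 < 65536 (MOVZ x0, #0x1234, lsl #32).  */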
4177 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4178 bool
4179 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4181 return aarch64_is_movz (val) || aarch64_is_movz (~val)
4182 || aarch64_bitmask_imm (val);
4186 /* Return true if VAL is an immediate that can be created by a single
4187 MOV instruction. */
4188 bool
4189 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4191 gcc_assert (mode == SImode || mode == DImode);
4193 if (val < 65536)
4194 return true;
4196 unsigned HOST_WIDE_INT mask =
4197 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4199 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4200 return true;
4202 val = (val & mask) | ((val << 32) & ~mask);
4203 return aarch64_bitmask_imm (val);
4207 static int
4208 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4209 machine_mode mode)
4211 int i;
4212 unsigned HOST_WIDE_INT val, val2, val3, mask;
4213 int one_match, zero_match;
4214 int num_insns;
4216 gcc_assert (mode == SImode || mode == DImode);
4218 val = INTVAL (imm);
4220 if (aarch64_move_imm (val, mode))
4222 if (generate)
4223 emit_insn (gen_rtx_SET (dest, imm));
4224 return 1;
4227 if ((val >> 32) == 0 || mode == SImode)
4229 if (generate)
4231 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4232 if (mode == SImode)
4233 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4234 GEN_INT ((val >> 16) & 0xffff)));
4235 else
4236 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4237 GEN_INT ((val >> 16) & 0xffff)));
4239 return 2;
4242 /* Remaining cases are all for DImode. */
4244 mask = 0xffff;
4245 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4246 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4247 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4248 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4250 /* Try a bitmask immediate and a movk to generate the immediate
4251 in 2 instructions. */
4253 if (zero_match < 2 && one_match < 2)
4255 for (i = 0; i < 64; i += 16)
4257 if (aarch64_check_bitmask (val, val2, mask << i))
4258 break;
4260 val2 = val & ~(mask << i);
4261 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4262 break;
4265 if (i != 64)
4267 if (generate)
4269 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4270 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4271 GEN_INT ((val >> i) & 0xffff)));
4273 return 2;
4276 /* Try 2 bitmask immediates which are xor'd together. */
4277 for (i = 0; i < 64; i += 16)
4279 val2 = (val >> i) & mask;
4280 val2 |= val2 << 16;
4281 val2 |= val2 << 32;
4282 if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4283 break;
4286 if (i != 64)
4288 if (generate)
4290 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4291 emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4293 return 2;
4297 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4298 if (zero_match + one_match == 0)
4300 for (i = 0; i < 48; i += 16)
4301 for (int j = i + 16; j < 64; j += 16)
4302 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4304 if (generate)
4306 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4307 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4308 GEN_INT ((val >> i) & 0xffff)));
4309 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4310 GEN_INT ((val >> j) & 0xffff)));
4312 return 3;
4315 /* Try shifting and inserting the bottom 32-bits into the top bits. */
4316 val2 = val & 0xffffffff;
4317 val3 = 0xffffffff;
4318 val3 = val2 | (val3 << 32);
4319 for (i = 17; i < 48; i++)
4320 if ((val2 | (val2 << i)) == val)
4322 if (generate)
4324 emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4325 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4326 GEN_INT (val2 >> 16)));
4327 emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4329 return 3;
4331 else if ((val3 & ~(val3 << i)) == val)
4333 if (generate)
4335 emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4336 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4337 GEN_INT (val2 >> 16)));
4338 emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4339 dest));
4341 return 3;
4345 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4346 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4347 otherwise skip zero bits. */
4349 num_insns = 1;
4350 mask = 0xffff;
4351 val2 = one_match > zero_match ? ~val : val;
4352 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4354 if (generate)
4355 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4356 ? (val | ~(mask << i))
4357 : (val & (mask << i)))));
4358 for (i += 16; i < 64; i += 16)
4360 if ((val2 & (mask << i)) == 0)
4361 continue;
4362 if (generate)
4363 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4364 GEN_INT ((val >> i) & 0xffff)));
4365 num_insns ++;
4368 return num_insns;
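/* As a rough example of the two-instruction path above: for
   0xff00ff00ff001234, aarch64_check_bitmask finds the bitmask immediate
   0xff00ff00ff00ff00, so the expansion is a MOV of that value followed
   by a MOVK of #0x1234 into the bottom 16 bits.  */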
4371 /* Return whether imm is a 128-bit immediate which is simple enough to
4372 expand inline. */
4373 bool
4374 aarch64_mov128_immediate (rtx imm)
4376 if (CONST_INT_P (imm))
4377 return true;
4379 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4381 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4382 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4384 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4385 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4389 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4390 a left shift of 0 or 12 bits. */
4391 bool
4392 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4394 return val < 4096 || (val & 0xfff000) == val;
4397 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
4398 that can be created with a left shift of 0 or 12. */
4399 static HOST_WIDE_INT
4400 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4402 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4403 handle correctly. */
4404 gcc_assert (val < 0x1000000);
4406 if (val < 4096)
4407 return val;
4409 return val & 0xfff000;
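/* For example, 0x123 and 0x123000 are valid as-is, 0x1001 is not (it
   has set bits both below bit 12 and at or above it), and
   aarch64_clamp_to_uimm12_shift (0x123456) returns 0x123000.  */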
4413 /* Test whether:
4415 X = (X & AND_VAL) | IOR_VAL;
4417 can be implemented using:
4419 MOVK X, #(IOR_VAL >> shift), LSL #shift
4421 Return the shift if so, otherwise return -1. */
4422 int
4423 aarch64_movk_shift (const wide_int_ref &and_val,
4424 const wide_int_ref &ior_val)
4426 unsigned int precision = and_val.get_precision ();
4427 unsigned HOST_WIDE_INT mask = 0xffff;
4428 for (unsigned int shift = 0; shift < precision; shift += 16)
4430 if (and_val == ~mask && (ior_val & mask) == ior_val)
4431 return shift;
4432 mask <<= 16;
4434 return -1;
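/* For example, AND_VAL == 0xffffffff0000ffff and IOR_VAL == 0x12340000
   return 16, matching "movk\tx0, #0x1234, lsl #16".  */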
4437 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4438 Assumed precondition: VAL_IN is not zero. */
4440 unsigned HOST_WIDE_INT
4441 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4443 int lowest_bit_set = ctz_hwi (val_in);
4444 int highest_bit_set = floor_log2 (val_in);
4445 gcc_assert (val_in != 0);
4447 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4448 (HOST_WIDE_INT_1U << lowest_bit_set));
4451 /* Create constant where bits outside of lowest bit set to highest bit set
4452 are set to 1. */
4454 unsigned HOST_WIDE_INT
4455 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4457 return val_in | ~aarch64_and_split_imm1 (val_in);
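/* Illustration: for VAL_IN == 0x990, aarch64_and_split_imm1 returns the
   contiguous hull 0xff0 and aarch64_and_split_imm2 returns
   0xfffffffffffff99f, so (x & 0xff0) & 0xfffffffffffff99f == x & 0x990.
   Whether the split is actually usable depends on the second constant
   being a valid bitmask immediate, which the function below checks.  */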
4460 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4462 bool
4463 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4465 scalar_int_mode int_mode;
4466 if (!is_a <scalar_int_mode> (mode, &int_mode))
4467 return false;
4469 if (aarch64_bitmask_imm (val_in, int_mode))
4470 return false;
4472 if (aarch64_move_imm (val_in, int_mode))
4473 return false;
4475 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4477 return aarch64_bitmask_imm (imm2, int_mode);
4480 /* Return the number of temporary registers that aarch64_add_offset_1
4481 would need to add OFFSET to a register. */
4483 static unsigned int
4484 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4486 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4489 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4490 a non-polynomial OFFSET. MODE is the mode of the addition.
4491 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4492 be set and CFA adjustments added to the generated instructions.
4494 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4495 temporary if register allocation is already complete. This temporary
4496 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4497 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4498 the immediate again.
4500 Since this function may be used to adjust the stack pointer, we must
4501 ensure that it cannot cause transient stack deallocation (for example
4502 by first incrementing SP and then decrementing when adjusting by a
4503 large immediate). */
4505 static void
4506 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4507 rtx src, HOST_WIDE_INT offset, rtx temp1,
4508 bool frame_related_p, bool emit_move_imm)
4510 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4511 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4513 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4514 rtx_insn *insn;
4516 if (!moffset)
4518 if (!rtx_equal_p (dest, src))
4520 insn = emit_insn (gen_rtx_SET (dest, src));
4521 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4523 return;
4526 /* Single instruction adjustment. */
4527 if (aarch64_uimm12_shift (moffset))
4529 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4530 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4531 return;
4534 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4535 and either:
4537 a) the offset cannot be loaded by a 16-bit move or
4538 b) there is no spare register into which we can move it. */
4539 if (moffset < 0x1000000
4540 && ((!temp1 && !can_create_pseudo_p ())
4541 || !aarch64_move_imm (moffset, mode)))
4543 HOST_WIDE_INT low_off = moffset & 0xfff;
4545 low_off = offset < 0 ? -low_off : low_off;
4546 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4547 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4548 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4549 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4550 return;
4553 /* Emit a move immediate if required and an addition/subtraction. */
4554 if (emit_move_imm)
4556 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4557 temp1 = aarch64_force_temporary (mode, temp1,
4558 gen_int_mode (moffset, mode));
4560 insn = emit_insn (offset < 0
4561 ? gen_sub3_insn (dest, src, temp1)
4562 : gen_add3_insn (dest, src, temp1));
4563 if (frame_related_p)
4565 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4566 rtx adj = plus_constant (mode, src, offset);
4567 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
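/* For example, adding OFFSET == 0x123456 (which is not a single MOV
   immediate) uses the two-addition path above: ADD #0x456 followed by
   ADD #0x123000.  */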
4571 /* Return the number of temporary registers that aarch64_add_offset
4572 would need to move OFFSET into a register or add OFFSET to a register;
4573 ADD_P is true if we want the latter rather than the former. */
4575 static unsigned int
4576 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4578 /* This follows the same structure as aarch64_add_offset. */
4579 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4580 return 0;
4582 unsigned int count = 0;
4583 HOST_WIDE_INT factor = offset.coeffs[1];
4584 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4585 poly_int64 poly_offset (factor, factor);
4586 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4587 /* Need one register for the ADDVL/ADDPL result. */
4588 count += 1;
4589 else if (factor != 0)
4591 factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4592 if (!IN_RANGE (factor, -32, 31))
4593 /* Need one register for the CNT or RDVL result and one for the
4594 multiplication factor. If necessary, the second temporary
4595 can be reused for the constant part of the offset. */
4596 return 2;
4597 /* Need one register for the CNT or RDVL result (which might then
4598 be shifted). */
4599 count += 1;
4601 return count + aarch64_add_offset_1_temporaries (constant);
4604 /* If X can be represented as a poly_int64, return the number
4605 of temporaries that are required to add it to a register.
4606 Return -1 otherwise. */
4608 int
4609 aarch64_add_offset_temporaries (rtx x)
4611 poly_int64 offset;
4612 if (!poly_int_rtx_p (x, &offset))
4613 return -1;
4614 return aarch64_offset_temporaries (true, offset);
4617 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4618 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4619 be set and CFA adjustments added to the generated instructions.
4621 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4622 temporary if register allocation is already complete. This temporary
4623 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4624 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4625 false to avoid emitting the immediate again.
4627 TEMP2, if nonnull, is a second temporary register that doesn't
4628 overlap either DEST or SRC.
4630 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
4631 is measured relative to the SME vector length instead of the current
4632 prevailing vector length. It is 0 otherwise.
4634 Since this function may be used to adjust the stack pointer, we must
4635 ensure that it cannot cause transient stack deallocation (for example
4636 by first incrementing SP and then decrementing when adjusting by a
4637 large immediate). */
4639 static void
4640 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4641 poly_int64 offset, rtx temp1, rtx temp2,
4642 aarch64_feature_flags force_isa_mode,
4643 bool frame_related_p, bool emit_move_imm = true)
4645 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4646 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4647 gcc_assert (temp1 == NULL_RTX
4648 || !frame_related_p
4649 || !reg_overlap_mentioned_p (temp1, dest));
4650 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4652 /* Try using ADDVL or ADDPL to add the whole value. */
4653 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4655 gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
4656 rtx offset_rtx;
4657 if (force_isa_mode == 0)
4658 offset_rtx = gen_int_mode (offset, mode);
4659 else
4660 offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
4661 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4662 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4663 if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
4664 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4665 gen_rtx_SET (dest, plus_constant (Pmode, src,
4666 offset)));
4667 return;
4670 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4671 SVE vector register, over and above the minimum size of 128 bits.
4672 This is equivalent to half the value returned by CNTD with a
4673 vector shape of ALL. */
4674 HOST_WIDE_INT factor = offset.coeffs[1];
4675 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4677 /* Try using ADDVL or ADDPL to add the VG-based part. */
4678 poly_int64 poly_offset (factor, factor);
4679 if (src != const0_rtx
4680 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4682 rtx offset_rtx;
4683 if (force_isa_mode == 0)
4684 offset_rtx = gen_int_mode (poly_offset, mode);
4685 else
4686 offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
4687 if (frame_related_p)
4689 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4690 RTX_FRAME_RELATED_P (insn) = true;
4691 if (force_isa_mode & AARCH64_FL_SM_ON)
4692 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4693 gen_rtx_SET (dest, plus_constant (Pmode, src,
4694 poly_offset)));
4695 src = dest;
4697 else
4699 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4700 src = aarch64_force_temporary (mode, temp1, addr);
4701 temp1 = temp2;
4702 temp2 = NULL_RTX;
4705 /* Otherwise use a CNT-based sequence. */
4706 else if (factor != 0)
4708 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4709 with negative shifts indicating a shift right. */
4710 HOST_WIDE_INT low_bit = least_bit_hwi (factor);
4711 HOST_WIDE_INT rel_factor = factor / low_bit;
4712 int shift = exact_log2 (low_bit) - 4;
4713 gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
4715 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4716 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4718 We can avoid a multiplication if REL_FACTOR is in the range
4719 of RDVL, although there are then various optimizations that
4720 we can try on top. */
4721 rtx_code code = PLUS;
4722 rtx val;
4723 if (IN_RANGE (rel_factor, -32, 31))
4725 if (force_isa_mode & AARCH64_FL_SM_ON)
4727 /* Try to use an unshifted RDSVL, otherwise fall back on
4728 a shifted RDSVL #1. */
4729 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4730 shift = 0;
4731 else
4732 factor = rel_factor * 16;
4733 val = aarch64_sme_vq_immediate (mode, factor, 0);
4735 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4736 else if (aarch64_sve_cnt_factor_p (factor)
4737 || aarch64_sve_rdvl_addvl_factor_p (factor))
4739 val = gen_int_mode (poly_int64 (factor, factor), mode);
4740 shift = 0;
4742 /* Try to subtract an unshifted CNT[BHWD]. */
4743 else if (aarch64_sve_cnt_factor_p (-factor))
4745 code = MINUS;
4746 val = gen_int_mode (poly_int64 (-factor, -factor), mode);
4747 shift = 0;
4749 /* If subtraction is free, prefer to load a positive constant.
4750 In the best case this will fit a shifted CNTB. */
4751 else if (src != const0_rtx && rel_factor < 0)
4753 code = MINUS;
4754 val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
4756 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4757 else
4758 val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
4760 else
4762 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4763 since it should increase the chances of being able to use
4764 a shift and add sequence for the multiplication.
4765 If CNTB << SHIFT is out of range, stick with the current
4766 shift factor. */
4767 if (force_isa_mode == 0
4768 && IN_RANGE (low_bit, 2, 16 * 16))
4770 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
4771 shift = 0;
4773 else if ((force_isa_mode & AARCH64_FL_SM_ON)
4774 && aarch64_sve_rdvl_addvl_factor_p (low_bit))
4776 val = aarch64_sme_vq_immediate (mode, low_bit, 0);
4777 shift = 0;
4779 else
4780 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
4782 val = aarch64_force_temporary (mode, temp1, val);
4784 /* Prefer to multiply by a positive factor and subtract rather
4785 than multiply by a negative factor and add, since positive
4786 values are usually easier to move. */
4787 if (rel_factor < 0 && src != const0_rtx)
4789 rel_factor = -rel_factor;
4790 code = MINUS;
4793 if (can_create_pseudo_p ())
4795 rtx coeff1 = gen_int_mode (rel_factor, mode);
4796 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
4798 else
4800 rtx coeff1 = gen_int_mode (rel_factor, mode);
4801 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4802 val = gen_rtx_MULT (mode, val, coeff1);
4806 /* Multiply by 2 ** SHIFT. */
4807 if (shift > 0)
4809 val = aarch64_force_temporary (mode, temp1, val);
4810 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4812 else if (shift < 0)
4814 val = aarch64_force_temporary (mode, temp1, val);
4815 val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
4818 /* Add the result to SRC or subtract the result from SRC. */
4819 if (src != const0_rtx)
4821 val = aarch64_force_temporary (mode, temp1, val);
4822 val = gen_rtx_fmt_ee (code, mode, src, val);
4824 else if (code == MINUS)
4826 val = aarch64_force_temporary (mode, temp1, val);
4827 val = gen_rtx_NEG (mode, val);
4830 if (constant == 0 || frame_related_p)
4832 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4833 if (frame_related_p)
4835 RTX_FRAME_RELATED_P (insn) = true;
4836 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4837 gen_rtx_SET (dest, plus_constant (Pmode, src,
4838 poly_offset)));
4840 src = dest;
4841 if (constant == 0)
4842 return;
4844 else
4846 src = aarch64_force_temporary (mode, temp1, val);
4847 temp1 = temp2;
4848 temp2 = NULL_RTX;
4851 emit_move_imm = true;
4854 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4855 frame_related_p, emit_move_imm);
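/* For example, if OFFSET is one SVE vector plus one byte and SRC is a
   register, the VG-based part is handled by "addvl\tdest, src, #1" and
   the remaining constant 1 by a final "add\tdest, dest, #1".  */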
4858 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4859 than a poly_int64. */
4861 void
4862 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4863 rtx offset_rtx, rtx temp1, rtx temp2)
4865 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4866 temp1, temp2, 0, false);
4869 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4870 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4871 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4872 contains abs (DELTA). */
4874 static inline void
4875 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
4876 aarch64_feature_flags force_isa_mode, bool emit_move_imm)
4878 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4879 temp1, temp2, force_isa_mode, true, emit_move_imm);
4882 /* Subtract DELTA from the stack pointer, marking the instructions
4883 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4884 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4886 static inline void
4887 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
4888 aarch64_feature_flags force_isa_mode,
4889 bool frame_related_p, bool emit_move_imm = true)
4891 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4892 temp1, temp2, force_isa_mode, frame_related_p,
4893 emit_move_imm);
4896 /* A streaming-compatible function needs to switch temporarily to the known
4897 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4898 the runtime state of PSTATE.SM in the streaming-compatible code, before
4899 the start of the switch to LOCAL_MODE.
4901 Emit instructions to branch around the mode switch if PSTATE.SM already
4902 matches LOCAL_MODE. Return the label that the branch jumps to. */
4904 static rtx_insn *
4905 aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
4907 local_mode &= AARCH64_FL_SM_STATE;
4908 gcc_assert (local_mode != 0);
4909 auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
4910 auto *label = gen_label_rtx ();
4911 auto branch = aarch64_gen_test_and_branch (already_ok_cond, old_svcr, 0,
4912 label);
4913 auto *jump = emit_jump_insn (branch);
4914 JUMP_LABEL (jump) = label;
4915 return label;
4918 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
4919 state in NEW_MODE. This is known to involve either an SMSTART SM or
4920 an SMSTOP SM. */
4922 static void
4923 aarch64_switch_pstate_sm (aarch64_feature_flags old_mode,
4924 aarch64_feature_flags new_mode)
4926 old_mode &= AARCH64_FL_SM_STATE;
4927 new_mode &= AARCH64_FL_SM_STATE;
4928 gcc_assert (old_mode != new_mode);
4930 if ((new_mode & AARCH64_FL_SM_ON)
4931 || (new_mode == 0 && (old_mode & AARCH64_FL_SM_OFF)))
4932 emit_insn (gen_aarch64_smstart_sm ());
4933 else
4934 emit_insn (gen_aarch64_smstop_sm ());
4937 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
4938 FP and predicate registers. This class emits code to preserve any
4939 necessary registers around the mode switch.
4941 The class uses four approaches to saving and restoring contents, enumerated
4942 by group_type:
4944 - GPR: save and restore the contents of FP registers using GPRs.
4945 This is used if the FP register contains no more than 64 significant
4946 bits. The registers used are FIRST_GPR onwards.
4948 - MEM_128: save and restore 128-bit SIMD registers using memory.
4950 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
4952 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
4954 The save slots within each memory group are consecutive, with the
4955 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
4957 There will only be two mode switches for each use of SME, so they should
4958 not be particularly performance-sensitive. It's also rare for SIMD, SVE
4959 or predicate registers to be live across mode switches. We therefore
4960 don't preallocate the save slots but instead allocate them locally on
4961 demand. This makes the code emitted by the class self-contained. */
4963 class aarch64_sme_mode_switch_regs
4965 public:
4966 static const unsigned int FIRST_GPR = R10_REGNUM;
4968 void add_reg (machine_mode, unsigned int);
4969 void add_call_args (rtx_call_insn *);
4970 void add_call_result (rtx_call_insn *);
4971 void add_call_preserved_reg (unsigned int);
4972 void add_call_preserved_regs (bitmap);
4974 void emit_prologue ();
4975 void emit_epilogue ();
4977 /* The number of GPRs needed to save FP registers, starting from
4978 FIRST_GPR. */
4979 unsigned int num_gprs () { return m_group_count[GPR]; }
4981 private:
4982 enum sequence { PROLOGUE, EPILOGUE };
4983 enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
4985 /* Information about the save location for one FP, SIMD, SVE data, or
4986 SVE predicate register. */
4987 struct save_location {
4988 /* The register to be saved. */
4989 rtx reg;
4991 /* Which group the save location belongs to. */
4992 group_type group;
4994 /* A zero-based index of the register within the group. */
4995 unsigned int index;
4998 unsigned int sve_data_headroom ();
4999 rtx get_slot_mem (machine_mode, poly_int64);
5000 void emit_stack_adjust (sequence, poly_int64);
5001 void emit_mem_move (sequence, const save_location &, poly_int64);
5003 void emit_gpr_moves (sequence);
5004 void emit_mem_128_moves (sequence);
5005 void emit_sve_sp_adjust (sequence);
5006 void emit_sve_pred_moves (sequence);
5007 void emit_sve_data_moves (sequence);
5009 /* All save locations, in no particular order. */
5010 auto_vec<save_location, 12> m_save_locations;
5012 /* The number of registers in each group. */
5013 unsigned int m_group_count[NUM_GROUPS] = {};
5016 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5017 switch. */
5019 void
5020 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
5022 if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
5023 return;
5025 unsigned int end_regno = end_hard_regno (mode, regno);
5026 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5027 gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
5028 for (; regno < end_regno; regno++)
5030 /* Force the mode of SVE saves and restores even for single registers.
5031 This is necessary because big-endian targets only allow LDR Z and
5032 STR Z to be used with byte modes. */
5033 machine_mode submode = mode;
5034 if (vec_flags & VEC_SVE_PRED)
5035 submode = VNx16BImode;
5036 else if (vec_flags & VEC_SVE_DATA)
5037 submode = SVE_BYTE_MODE;
5038 else if (vec_flags & VEC_STRUCT)
5040 if (vec_flags & VEC_PARTIAL)
5041 submode = V8QImode;
5042 else
5043 submode = V16QImode;
5045 save_location loc;
5046 loc.reg = gen_rtx_REG (submode, regno);
5047 if (vec_flags & VEC_SVE_PRED)
5049 gcc_assert (PR_REGNUM_P (regno));
5050 loc.group = MEM_SVE_PRED;
5052 else
5054 gcc_assert (FP_REGNUM_P (regno));
5055 if (known_le (GET_MODE_SIZE (submode), 8))
5056 loc.group = GPR;
5057 else if (known_eq (GET_MODE_SIZE (submode), 16))
5058 loc.group = MEM_128;
5059 else
5060 loc.group = MEM_SVE_DATA;
5062 loc.index = m_group_count[loc.group]++;
5063 m_save_locations.quick_push (loc);
5067 /* Record that the arguments to CALL_INSN need to be preserved around
5068 the mode switch. */
5070 void
5071 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
5073 for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
5074 node; node = XEXP (node, 1))
5076 rtx item = XEXP (node, 0);
5077 if (GET_CODE (item) != USE)
5078 continue;
5079 item = XEXP (item, 0);
5080 if (!REG_P (item))
5081 continue;
5082 add_reg (GET_MODE (item), REGNO (item));
5086 /* Record that the return value from CALL_INSN (if any) needs to be
5087 preserved around the mode switch. */
5089 void
5090 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
5092 rtx pat = PATTERN (call_insn);
5093 gcc_assert (GET_CODE (pat) == PARALLEL);
5094 pat = XVECEXP (pat, 0, 0);
5095 if (GET_CODE (pat) == CALL)
5096 return;
5097 rtx dest = SET_DEST (pat);
5098 if (GET_CODE (dest) == PARALLEL)
5099 for (int i = 0; i < XVECLEN (dest, 0); ++i)
5101 rtx x = XVECEXP (dest, 0, i);
5102 gcc_assert (GET_CODE (x) == EXPR_LIST);
5103 rtx reg = XEXP (x, 0);
5104 add_reg (GET_MODE (reg), REGNO (reg));
5106 else
5107 add_reg (GET_MODE (dest), REGNO (dest));
5110 /* REGNO is a register that is call-preserved under the current function's ABI.
5111 Record that it must be preserved around the mode switch. */
5113 void
5114 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno)
5116 if (FP_REGNUM_P (regno))
5117 switch (crtl->abi->id ())
5119 case ARM_PCS_SVE:
5120 add_reg (VNx16QImode, regno);
5121 break;
5122 case ARM_PCS_SIMD:
5123 add_reg (V16QImode, regno);
5124 break;
5125 case ARM_PCS_AAPCS64:
5126 add_reg (DImode, regno);
5127 break;
5128 default:
5129 gcc_unreachable ();
5131 else if (PR_REGNUM_P (regno))
5132 add_reg (VNx16BImode, regno);
5135 /* The hard registers in REGS are call-preserved under the current function's
5136 ABI. Record that they must be preserved around the mode switch. */
5138 void
5139 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs)
5141 bitmap_iterator bi;
5142 unsigned int regno;
5143 EXECUTE_IF_SET_IN_BITMAP (regs, 0, regno, bi)
5144 if (HARD_REGISTER_NUM_P (regno))
5145 add_call_preserved_reg (regno);
5146 else
5147 break;
5150 /* Emit code to save registers before the mode switch. */
5152 void
5153 aarch64_sme_mode_switch_regs::emit_prologue ()
5155 emit_sve_sp_adjust (PROLOGUE);
5156 emit_sve_pred_moves (PROLOGUE);
5157 emit_sve_data_moves (PROLOGUE);
5158 emit_mem_128_moves (PROLOGUE);
5159 emit_gpr_moves (PROLOGUE);
5162 /* Emit code to restore registers after the mode switch. */
5164 void
5165 aarch64_sme_mode_switch_regs::emit_epilogue ()
5167 emit_gpr_moves (EPILOGUE);
5168 emit_mem_128_moves (EPILOGUE);
5169 emit_sve_pred_moves (EPILOGUE);
5170 emit_sve_data_moves (EPILOGUE);
5171 emit_sve_sp_adjust (EPILOGUE);
5174 /* The SVE predicate registers are stored below the SVE data registers,
5175 with the predicate save area being padded to a data-register-sized
5176 boundary. Return the size of this padded area as a whole number
5177 of data register slots. */
5179 unsigned int
5180 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5182 return CEIL (m_group_count[MEM_SVE_PRED], 8);
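/* For example, three predicate saves need three PL-sized slots, which
   round up to CEIL (3, 8) == 1 vector-sized slot of headroom.  */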
5185 /* Return a memory reference of mode MODE to OFFSET bytes from the
5186 stack pointer. */
5188 rtx
5189 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
5190 poly_int64 offset)
5192 rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
5193 return gen_rtx_MEM (mode, addr);
5196 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5198 void
5199 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
5200 poly_int64 size)
5202 if (seq == PROLOGUE)
5203 size = -size;
5204 emit_insn (gen_rtx_SET (stack_pointer_rtx,
5205 plus_constant (Pmode, stack_pointer_rtx, size)));
5208 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5209 the stack pointer. SEQ chooses between saving and restoring. */
5211 void
5212 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
5213 const save_location &loc,
5214 poly_int64 offset)
5216 rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
5217 if (seq == PROLOGUE)
5218 emit_move_insn (mem, loc.reg);
5219 else
5220 emit_move_insn (loc.reg, mem);
5223 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5224 saving and restoring. */
5226 void
5227 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
5229 for (auto &loc : m_save_locations)
5230 if (loc.group == GPR)
5232 gcc_assert (loc.index < 8);
5233 rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
5234 if (seq == PROLOGUE)
5235 emit_move_insn (gpr, loc.reg);
5236 else
5237 emit_move_insn (loc.reg, gpr);
5241 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5242 between saving and restoring. */
5244 void
5245 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
5247 HOST_WIDE_INT count = m_group_count[MEM_128];
5248 if (count == 0)
5249 return;
5251 auto sp = stack_pointer_rtx;
5252 auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
5254 /* Pick a common mode that supports LDR & STR with pre/post-modification
5255 and LDP & STP with pre/post-modification. */
5256 auto mode = TFmode;
5258 /* An instruction pattern that should be emitted at the end. */
5259 rtx last_pat = NULL_RTX;
5261 /* A previous MEM_128 location that hasn't been handled yet. */
5262 save_location *prev_loc = nullptr;
5264 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5265 for (auto &loc : m_save_locations)
5266 if (loc.group == MEM_128)
5268 if (!prev_loc)
5270 prev_loc = &loc;
5271 continue;
5273 gcc_assert (loc.index == prev_loc->index + 1);
5275 /* The offset of the base of the save area from the current
5276 stack pointer. */
5277 HOST_WIDE_INT bias = 0;
5278 if (prev_loc->index == 0 && seq == PROLOGUE)
5279 bias = sp_adjust;
5281 /* Get the two sets in the LDP/STP. */
5282 rtx ops[] = {
5283 gen_rtx_REG (mode, REGNO (prev_loc->reg)),
5284 get_slot_mem (mode, prev_loc->index * 16 + bias),
5285 gen_rtx_REG (mode, REGNO (loc.reg)),
5286 get_slot_mem (mode, loc.index * 16 + bias)
5288 unsigned int lhs = (seq == PROLOGUE);
5289 rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
5290 rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
5292 /* Combine the sets with any stack allocation/deallocation. */
5293 rtx pat;
5294 if (prev_loc->index == 0)
5296 rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
5297 rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
5298 pat = gen_rtx_PARALLEL (VOIDmode, vec);
5300 else if (seq == PROLOGUE)
5301 pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
5302 else
5303 pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
5305 /* Queue a deallocation to the end, otherwise emit the
5306 instruction now. */
5307 if (seq == EPILOGUE && prev_loc->index == 0)
5308 last_pat = pat;
5309 else
5310 emit_insn (pat);
5311 prev_loc = nullptr;
5314 /* Handle any leftover LDR/STR. */
5315 if (prev_loc)
5317 rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
5318 rtx addr;
5319 if (prev_loc->index != 0)
5320 addr = plus_constant (Pmode, sp, prev_loc->index * 16);
5321 else if (seq == PROLOGUE)
5323 rtx allocate = plus_constant (Pmode, sp, -count * 16);
5324 addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
5326 else
5328 rtx deallocate = plus_constant (Pmode, sp, count * 16);
5329 addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
5331 rtx mem = gen_rtx_MEM (mode, addr);
5332 if (seq == PROLOGUE)
5333 emit_move_insn (mem, reg);
5334 else
5335 emit_move_insn (reg, mem);
5338 if (last_pat)
5339 emit_insn (last_pat);
5342 /* Allocate or deallocate the stack space needed by the SVE groups.
5343 SEQ chooses between allocating and deallocating. */
5345 void
5346 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
5348 if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
5349 emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
5352 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5353 and restoring. */
5355 void
5356 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
5358 for (auto &loc : m_save_locations)
5359 if (loc.group == MEM_SVE_DATA)
5361 auto index = loc.index + sve_data_headroom ();
5362 emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
5366 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5367 and restoring. */
5369 void
5370 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
5372 for (auto &loc : m_save_locations)
5373 if (loc.group == MEM_SVE_PRED)
5374 emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
5377 /* Set DEST to (vec_series BASE STEP). */
5379 static void
5380 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5382 machine_mode mode = GET_MODE (dest);
5383 scalar_mode inner = GET_MODE_INNER (mode);
5385 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5386 if (!aarch64_sve_index_immediate_p (base))
5387 base = force_reg (inner, base);
5388 if (!aarch64_sve_index_immediate_p (step))
5389 step = force_reg (inner, step);
5391 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
5394 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5395 register of mode MODE. Use TARGET for the result if it's nonnull
5396 and convenient.
5398 The two vector modes must have the same element mode. The behavior
5399 is to duplicate architectural lane N of SRC into architectural lanes
5400 N + I * STEP of the result. On big-endian targets, architectural
5401 lane 0 of an Advanced SIMD vector is the last element of the vector
5402 in memory layout, so for big-endian targets this operation has the
5403 effect of reversing SRC before duplicating it. Callers need to
5404 account for this. */
5406 rtx
5407 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5409 machine_mode src_mode = GET_MODE (src);
5410 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5411 insn_code icode = (BYTES_BIG_ENDIAN
5412 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5413 : code_for_aarch64_vec_duplicate_vq_le (mode));
5415 unsigned int i = 0;
5416 expand_operand ops[3];
5417 create_output_operand (&ops[i++], target, mode);
5418 create_input_operand (&ops[i++], src, src_mode);
5419 if (BYTES_BIG_ENDIAN)
5421 /* Create a PARALLEL describing the reversal of SRC. */
5422 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5423 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5424 nelts_per_vq - 1, -1);
5425 create_fixed_operand (&ops[i++], sel);
5427 expand_insn (icode, i, ops);
5428 return ops[0].value;
5431 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5432 the memory image into DEST. Return true on success. */
5434 static bool
5435 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5437 src = force_const_mem (GET_MODE (src), src);
5438 if (!src)
5439 return false;
5441 /* Make sure that the address is legitimate. */
5442 if (!aarch64_sve_ld1rq_operand_p (src))
5444 rtx addr = force_reg (Pmode, XEXP (src, 0));
5445 src = replace_equiv_address (src, addr);
5448 machine_mode mode = GET_MODE (dest);
5449 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5450 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5451 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5452 return true;
5455 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5456 by N "background" values. Try to move it into TARGET using:
5458 PTRUE PRED.<T>, VL<N>
5459 MOV TRUE.<T>, #<foreground>
5460 MOV FALSE.<T>, #<background>
5461 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5463 The PTRUE is always a single instruction but the MOVs might need a
5464 longer sequence. If the background value is zero (as it often is),
5465 the sequence can sometimes collapse to a PTRUE followed by a
5466 zero-predicated move.
5468 Return the target on success, otherwise return null. */
5470 static rtx
5471 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5473 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5475 /* Make sure that the PTRUE is valid. */
5476 machine_mode mode = GET_MODE (src);
5477 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5478 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5479 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5480 == AARCH64_NUM_SVPATTERNS)
5481 return NULL_RTX;
5483 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5484 rtx_vector_builder true_builder (mode, npatterns, 1);
5485 rtx_vector_builder false_builder (mode, npatterns, 1);
5486 for (unsigned int i = 0; i < npatterns; ++i)
5488 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5489 pred_builder.quick_push (CONST1_RTX (BImode));
5491 for (unsigned int i = 0; i < npatterns; ++i)
5493 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5494 pred_builder.quick_push (CONST0_RTX (BImode));
5496 expand_operand ops[4];
5497 create_output_operand (&ops[0], target, mode);
5498 create_input_operand (&ops[1], true_builder.build (), mode);
5499 create_input_operand (&ops[2], false_builder.build (), mode);
5500 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5501 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5502 return target;
5505 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5506 SVE data mode and isn't a legitimate constant. Use TARGET for the
5507 result if convenient.
5509 The returned register can have whatever mode seems most natural
5510 given the contents of SRC. */
5512 static rtx
5513 aarch64_expand_sve_const_vector (rtx target, rtx src)
5515 machine_mode mode = GET_MODE (src);
5516 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5517 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5518 scalar_mode elt_mode = GET_MODE_INNER (mode);
5519 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5520 unsigned int container_bits = aarch64_sve_container_bits (mode);
5521 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5523 if (nelts_per_pattern == 1
5524 && encoded_bits <= 128
5525 && container_bits != elt_bits)
5527 /* We have a partial vector mode and a constant whose full-vector
5528 equivalent would occupy a repeating 128-bit sequence. Build that
5529 full-vector equivalent instead, so that we have the option of
5530 using LD1RQ and Advanced SIMD operations. */
5531 unsigned int repeat = container_bits / elt_bits;
5532 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5533 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5534 for (unsigned int i = 0; i < npatterns; ++i)
5535 for (unsigned int j = 0; j < repeat; ++j)
5536 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5537 target = aarch64_target_reg (target, full_mode);
5538 return aarch64_expand_sve_const_vector (target, builder.build ());
5541 if (nelts_per_pattern == 1 && encoded_bits == 128)
5543 /* The constant is a duplicated quadword but can't be narrowed
5544 beyond a quadword. Get the memory image of the first quadword
5545 as a 128-bit vector and try using LD1RQ to load it from memory.
5547 The effect for both endiannesses is to load memory lane N into
5548 architectural lanes N + I * STEP of the result. On big-endian
5549 targets, the layout of the 128-bit vector in an Advanced SIMD
5550 register would be different from its layout in an SVE register,
5551 but this 128-bit vector is a memory value only. */
5552 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5553 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5554 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5555 return target;
5558 if (nelts_per_pattern == 1 && encoded_bits < 128)
5560 /* The vector is a repeating sequence of 64 bits or fewer.
5561 See if we can load them using an Advanced SIMD move and then
5562 duplicate it to fill a vector. This is better than using a GPR
5563 move because it keeps everything in the same register file. */
5564 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5565 rtx_vector_builder builder (vq_mode, npatterns, 1);
5566 for (unsigned int i = 0; i < npatterns; ++i)
5568 /* We want memory lane N to go into architectural lane N,
5569 so reverse for big-endian targets. The DUP .Q pattern
5570 has a compensating reverse built-in. */
5571 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5572 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5574 rtx vq_src = builder.build ();
5575 if (aarch64_simd_valid_immediate (vq_src, NULL))
5577 vq_src = force_reg (vq_mode, vq_src);
5578 return aarch64_expand_sve_dupq (target, mode, vq_src);
5581 /* Get an integer representation of the repeating part of Advanced
5582 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5583 which for big-endian targets is lane-swapped wrt a normal
5584 Advanced SIMD vector. This means that for both endiannesses,
5585 memory lane N of SVE vector SRC corresponds to architectural
5586 lane N of a register holding VQ_SRC. This in turn means that
5587 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5588 as a single 128-bit value) and thus that memory lane 0 of SRC is
5589 in the lsb of the integer. Duplicating the integer therefore
5590 ensures that memory lane N of SRC goes into architectural lane
5591 N + I * INDEX of the SVE register. */
5592 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5593 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5594 if (elt_value)
5596 /* Pretend that we had a vector of INT_MODE to start with. */
5597 elt_mode = int_mode;
5598 mode = aarch64_full_sve_mode (int_mode).require ();
5600 /* If the integer can be moved into a general register by a
5601 single instruction, do that and duplicate the result. */
5602 if (CONST_INT_P (elt_value)
5603 && aarch64_move_imm (INTVAL (elt_value),
5604 encoded_bits <= 32 ? SImode : DImode))
5606 elt_value = force_reg (elt_mode, elt_value);
5607 return expand_vector_broadcast (mode, elt_value);
5610 else if (npatterns == 1)
5611 /* We're duplicating a single value, but can't do better than
5612 force it to memory and load from there. This handles things
5613 like symbolic constants. */
5614 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5616 if (elt_value)
5618 /* Load the element from memory if we can, otherwise move it into
5619 a register and use a DUP. */
5620 rtx op = force_const_mem (elt_mode, elt_value);
5621 if (!op)
5622 op = force_reg (elt_mode, elt_value);
5623 return expand_vector_broadcast (mode, op);
5627 /* Try using INDEX. */
5628 rtx base, step;
5629 if (const_vec_series_p (src, &base, &step))
5631 aarch64_expand_vec_series (target, base, step);
5632 return target;
5635 /* From here on, it's better to force the whole constant to memory
5636 if we can. */
5637 if (GET_MODE_NUNITS (mode).is_constant ())
5638 return NULL_RTX;
5640 if (nelts_per_pattern == 2)
5641 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5642 return res;
5644 /* Expand each pattern individually. */
5645 gcc_assert (npatterns > 1);
5646 rtx_vector_builder builder;
5647 auto_vec<rtx, 16> vectors (npatterns);
5648 for (unsigned int i = 0; i < npatterns; ++i)
5650 builder.new_vector (mode, 1, nelts_per_pattern);
5651 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5652 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5653 vectors.quick_push (force_reg (mode, builder.build ()));
5656 /* Use permutes to interleave the separate vectors. */
5657 while (npatterns > 1)
5659 npatterns /= 2;
5660 for (unsigned int i = 0; i < npatterns; ++i)
5662 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5663 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5664 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5665 vectors[i] = tmp;
5668 gcc_assert (vectors[0] == target);
5669 return target;
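/* Sketch of the interleaving step, as an idealized host-side model only
   (an assumed helper, not used by the compiler).  ZIP1 interleaves the
   leading elements of its two inputs, so for NPATTERNS == 2:

	pattern 0: { a0, a1, a2, ... }
	pattern 1: { b0, b1, b2, ... }
	ZIP1:      { a0, b0, a1, b1, ... }   <-- the original encoding

	#include <cstddef>
	#include <vector>

	// Idealized model of interleaving the element streams of two
	// pattern vectors A and B, as one ZIP1 round does above.
	static std::vector<int>
	interleave (const std::vector<int> &a, const std::vector<int> &b)
	{
	  std::vector<int> r;
	  for (std::size_t i = 0; i < a.size (); ++i)
	    {
	      r.push_back (a[i]);
	      r.push_back (b[i]);
	    }
	  return r;
	}

   Halving NPATTERNS each round therefore terminates with VECTORS[0]
   holding the full constant.  */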
5672 /* Use WHILE to set a predicate register of mode MODE in which the first
5673 VL bits are set and the rest are clear. Use TARGET for the register
5674 if it's nonnull and convenient. */
5676 static rtx
5677 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5678 unsigned int vl)
5680 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5681 target = aarch64_target_reg (target, mode);
5682 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5683 target, const0_rtx, limit));
5684 return target;
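/* A hedged source-level analogue: the ACLE "while" intrinsics produce
   exactly this kind of leading-VL predicate, e.g.

	#include <arm_sve.h>

	svbool_t
	first_three (void)
	{
	  // First three .S lanes active, the rest clear; typically a
	  // single WHILELO (or a PTRUE with an explicit VL pattern).
	  return svwhilelt_b32 (0, 3);
	}

   The expander above is the constant-VL version of the same idea.  */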
5687 static rtx
5688 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5690 /* BUILDER is a constant predicate in which the index of every set bit
5691 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5692 by inverting every element at a multiple of ELT_SIZE and EORing the
5693 result with an ELT_SIZE PTRUE.
5695 Return a register that contains the constant on success, otherwise
5696 return null. Use TARGET as the register if it is nonnull and
5697 convenient. */
5699 static rtx
5700 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5701 unsigned int elt_size)
5703 /* Invert every element at a multiple of ELT_SIZE, keeping the
5704 other bits zero. */
5705 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5706 builder.nelts_per_pattern ());
5707 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5708 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5709 inv_builder.quick_push (const1_rtx);
5710 else
5711 inv_builder.quick_push (const0_rtx);
5712 inv_builder.finalize ();
5714 /* See if we can load the constant cheaply. */
5715 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5716 if (!inv)
5717 return NULL_RTX;
5719 /* EOR the result with an ELT_SIZE PTRUE. */
5720 rtx mask = aarch64_ptrue_all (elt_size);
5721 mask = force_reg (VNx16BImode, mask);
5722 inv = gen_lowpart (VNx16BImode, inv);
5723 target = aarch64_target_reg (target, VNx16BImode);
5724 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5725 return target;
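/* Worked example (assuming ELT_SIZE == 1): to build the predicate
   { 0, 1, 1, 1, ... } the code above first builds its inverse
   { 1, 0, 0, 0, ... }, which is simply PTRUE VL1, and then EORs it with
   PTRUE ALL:

	{ 1, 0, 0, 0, ... } ^ { 1, 1, 1, 1, ... } = { 0, 1, 1, 1, ... }

   so the whole constant costs two PTRUEs and an EOR.  */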
5728 /* BUILDER is a constant predicate in which the index of every set bit
5729 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5730 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5731 register on success, otherwise return null. Use TARGET as the register
5732 if nonnull and convenient. */
5734 static rtx
5735 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5736 unsigned int elt_size,
5737 unsigned int permute_size)
5739 /* We're going to split the constant into two new constants A and B,
5740 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5741 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5743 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5744 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5746 where _ indicates elements that will be discarded by the permute.
5748 First calculate the ELT_SIZEs for A and B. */
5749 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5750 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5751 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5752 if (INTVAL (builder.elt (i)) != 0)
5754 if (i & permute_size)
5755 b_elt_size |= i - permute_size;
5756 else
5757 a_elt_size |= i;
5759 a_elt_size &= -a_elt_size;
5760 b_elt_size &= -b_elt_size;
5762 /* Now construct the vectors themselves. */
5763 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5764 builder.nelts_per_pattern ());
5765 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5766 builder.nelts_per_pattern ());
5767 unsigned int nelts = builder.encoded_nelts ();
5768 for (unsigned int i = 0; i < nelts; ++i)
5769 if (i & (elt_size - 1))
5771 a_builder.quick_push (const0_rtx);
5772 b_builder.quick_push (const0_rtx);
5774 else if ((i & permute_size) == 0)
5776 /* The A and B elements are significant. */
5777 a_builder.quick_push (builder.elt (i));
5778 b_builder.quick_push (builder.elt (i + permute_size));
5780 else
5782 /* The A and B elements are going to be discarded, so pick whatever
5783 is likely to give a nice constant. We are targeting element
5784 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5785 with the aim of each being a sequence of ones followed by
5786 a sequence of zeros. So:
5788 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5789 duplicate the last X_ELT_SIZE element, to extend the
5790 current sequence of ones or zeros.
5792 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5793 zero, so that the constant really does have X_ELT_SIZE and
5794 not a smaller size. */
5795 if (a_elt_size > permute_size)
5796 a_builder.quick_push (const0_rtx);
5797 else
5798 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5799 if (b_elt_size > permute_size)
5800 b_builder.quick_push (const0_rtx);
5801 else
5802 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5804 a_builder.finalize ();
5805 b_builder.finalize ();
5807 /* Try loading A into a register. */
5808 rtx_insn *last = get_last_insn ();
5809 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5810 if (!a)
5811 return NULL_RTX;
5813 /* Try loading B into a register. */
5814 rtx b = a;
5815 if (a_builder != b_builder)
5817 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5818 if (!b)
5820 delete_insns_since (last);
5821 return NULL_RTX;
5825 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5826 operands but permutes them as though they had mode MODE. */
5827 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5828 target = aarch64_target_reg (target, GET_MODE (a));
5829 rtx type_reg = CONST0_RTX (mode);
5830 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5831 return target;
5834 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5835 constant in BUILDER into an SVE predicate register. Return the register
5836 on success, otherwise return null. Use TARGET for the register if
5837 nonnull and convenient.
5839 ALLOW_RECURSE_P is true if we can use methods that would call this
5840 function recursively. */
5842 static rtx
5843 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5844 bool allow_recurse_p)
5846 if (builder.encoded_nelts () == 1)
5847 /* A PFALSE or a PTRUE .B ALL. */
5848 return aarch64_emit_set_immediate (target, builder);
5850 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5851 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5853 /* If we can load the constant using PTRUE, use it as-is. */
5854 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5855 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5856 return aarch64_emit_set_immediate (target, builder);
5858 /* Otherwise use WHILE to set the first VL bits. */
5859 return aarch64_sve_move_pred_via_while (target, mode, vl);
5862 if (!allow_recurse_p)
5863 return NULL_RTX;
5865 /* Try inverting the vector in element size ELT_SIZE and then EORing
5866 the result with an ELT_SIZE PTRUE. */
5867 if (INTVAL (builder.elt (0)) == 0)
5868 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5869 elt_size))
5870 return res;
5872 /* Try using TRN1 to permute two simpler constants. */
5873 for (unsigned int i = elt_size; i <= 8; i *= 2)
5874 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5875 elt_size, i))
5876 return res;
5878 return NULL_RTX;
5881 /* Return an SVE predicate register that contains the VNx16BImode
5882 constant in BUILDER, without going through the move expanders.
5884 The returned register can have whatever mode seems most natural
5885 given the contents of BUILDER. Use TARGET for the result if
5886 convenient. */
5888 static rtx
5889 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5891 /* Try loading the constant using pure predicate operations. */
5892 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5893 return res;
5895 /* Try forcing the constant to memory. */
5896 if (builder.full_nelts ().is_constant ())
5897 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5899 target = aarch64_target_reg (target, VNx16BImode);
5900 emit_move_insn (target, mem);
5901 return target;
5904 /* The last resort is to load the constant as an integer and then
5905 compare it against zero. Use -1 for set bits in order to increase
5906 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5907 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5908 builder.nelts_per_pattern ());
5909 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5910 int_builder.quick_push (INTVAL (builder.elt (i))
5911 ? constm1_rtx : const0_rtx);
5912 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5913 int_builder.build ());
5916 /* Set DEST to immediate IMM. */
5918 void
5919 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5921 machine_mode mode = GET_MODE (dest);
5923 /* Check on what type of symbol it is. */
5924 scalar_int_mode int_mode;
5925 if ((SYMBOL_REF_P (imm)
5926 || LABEL_REF_P (imm)
5927 || GET_CODE (imm) == CONST
5928 || GET_CODE (imm) == CONST_POLY_INT)
5929 && is_a <scalar_int_mode> (mode, &int_mode))
5931 rtx mem;
5932 poly_int64 offset;
5933 HOST_WIDE_INT const_offset;
5934 enum aarch64_symbol_type sty;
5936 /* If we have (const (plus symbol offset)), separate out the offset
5937 before we start classifying the symbol. */
5938 rtx base = strip_offset (imm, &offset);
5940 /* We must always add an offset involving VL separately, rather than
5941 folding it into the relocation. */
5942 if (!offset.is_constant (&const_offset))
5944 if (!TARGET_SVE)
5946 aarch64_report_sve_required ();
5947 return;
5949 if (base == const0_rtx
5950 && (aarch64_sve_cnt_immediate_p (offset)
5951 || aarch64_sve_rdvl_immediate_p (offset)))
5952 emit_insn (gen_rtx_SET (dest, imm));
5953 else
5955 /* Do arithmetic on 32-bit values if the result is smaller
5956 than that. */
5957 if (partial_subreg_p (int_mode, SImode))
5959 /* It is invalid to do symbol calculations in modes
5960 narrower than SImode. */
5961 gcc_assert (base == const0_rtx);
5962 dest = gen_lowpart (SImode, dest);
5963 int_mode = SImode;
5965 if (base != const0_rtx)
5967 base = aarch64_force_temporary (int_mode, dest, base);
5968 aarch64_add_offset (int_mode, dest, base, offset,
5969 NULL_RTX, NULL_RTX, 0, false);
5971 else
5972 aarch64_add_offset (int_mode, dest, base, offset,
5973 dest, NULL_RTX, 0, false);
5975 return;
5978 if (aarch64_rdsvl_immediate_p (base))
5980 /* We could handle non-constant offsets if they are ever
5981 generated. */
5982 gcc_assert (const_offset == 0);
5983 emit_insn (gen_rtx_SET (dest, imm));
5984 return;
5987 sty = aarch64_classify_symbol (base, const_offset);
5988 switch (sty)
5990 case SYMBOL_FORCE_TO_MEM:
5991 if (int_mode != ptr_mode)
5992 imm = convert_memory_address (ptr_mode, imm);
5994 if (const_offset != 0
5995 && targetm.cannot_force_const_mem (ptr_mode, imm))
5997 gcc_assert (can_create_pseudo_p ());
5998 base = aarch64_force_temporary (int_mode, dest, base);
5999 aarch64_add_offset (int_mode, dest, base, const_offset,
6000 NULL_RTX, NULL_RTX, 0, false);
6001 return;
6004 mem = force_const_mem (ptr_mode, imm);
6005 gcc_assert (mem);
6007 /* If we aren't generating PC relative literals, then
6008 we need to expand the literal pool access carefully.
6009 This is something that needs to be done in a number
6010 of places, so could well live as a separate function. */
6011 if (!aarch64_pcrelative_literal_loads)
6013 gcc_assert (can_create_pseudo_p ());
6014 base = gen_reg_rtx (ptr_mode);
6015 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6016 if (ptr_mode != Pmode)
6017 base = convert_memory_address (Pmode, base);
6018 mem = gen_rtx_MEM (ptr_mode, base);
6021 if (int_mode != ptr_mode)
6022 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6024 emit_insn (gen_rtx_SET (dest, mem));
6026 return;
6028 case SYMBOL_SMALL_TLSGD:
6029 case SYMBOL_SMALL_TLSDESC:
6030 case SYMBOL_SMALL_TLSIE:
6031 case SYMBOL_SMALL_GOT_28K:
6032 case SYMBOL_SMALL_GOT_4G:
6033 case SYMBOL_TINY_GOT:
6034 case SYMBOL_TINY_TLSIE:
6035 if (const_offset != 0)
6037 gcc_assert (can_create_pseudo_p ());
6038 base = aarch64_force_temporary (int_mode, dest, base);
6039 aarch64_add_offset (int_mode, dest, base, const_offset,
6040 NULL_RTX, NULL_RTX, 0, false);
6041 return;
6043 /* FALLTHRU */
6045 case SYMBOL_SMALL_ABSOLUTE:
6046 case SYMBOL_TINY_ABSOLUTE:
6047 case SYMBOL_TLSLE12:
6048 case SYMBOL_TLSLE24:
6049 case SYMBOL_TLSLE32:
6050 case SYMBOL_TLSLE48:
6051 aarch64_load_symref_appropriately (dest, imm, sty);
6052 return;
6054 default:
6055 gcc_unreachable ();
6059 if (!CONST_INT_P (imm))
6061 if (aarch64_sve_pred_mode_p (mode))
6063 /* Only the low bit of each .H, .S and .D element is defined,
6064 so we can set the upper bits to whatever we like. If the
6065 predicate is all-true in MODE, prefer to set all the undefined
6066 bits as well, so that we can share a single .B predicate for
6067 all modes. */
6068 if (imm == CONSTM1_RTX (mode))
6069 imm = CONSTM1_RTX (VNx16BImode);
6071 /* All methods for constructing predicate modes wider than VNx16BI
6072 will set the upper bits of each element to zero. Expose this
6073 by moving such constants as a VNx16BI, so that all bits are
6074 significant and so that constants for different modes can be
6075 shared. The wider constant will still be available as a
6076 REG_EQUAL note. */
6077 rtx_vector_builder builder;
6078 if (aarch64_get_sve_pred_bits (builder, imm))
6080 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6081 if (dest != res)
6082 emit_move_insn (dest, gen_lowpart (mode, res));
6083 return;
6087 if (GET_CODE (imm) == HIGH
6088 || aarch64_simd_valid_immediate (imm, NULL))
6090 emit_insn (gen_rtx_SET (dest, imm));
6091 return;
6094 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6095 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6097 if (dest != res)
6098 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6099 return;
6102 rtx mem = force_const_mem (mode, imm);
6103 gcc_assert (mem);
6104 emit_move_insn (dest, mem);
6105 return;
6108 aarch64_internal_mov_immediate (dest, imm, true, mode);
6111 /* Return the MEM rtx that provides the canary value that should be used
6112 for stack-smashing protection. MODE is the mode of the memory.
6113 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6114 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6115 indicates whether the caller is performing a SET or a TEST operation. */
6118 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6119 aarch64_salt_type salt_type)
6121 rtx addr;
6122 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6124 gcc_assert (MEM_P (decl_rtl));
6125 addr = XEXP (decl_rtl, 0);
6126 poly_int64 offset;
6127 rtx base = strip_offset_and_salt (addr, &offset);
6128 if (!SYMBOL_REF_P (base))
6129 return decl_rtl;
6131 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6132 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6133 addr = gen_rtx_CONST (Pmode, addr);
6134 addr = plus_constant (Pmode, addr, offset);
6136 else
6138 /* Calculate the address from the system register. */
6139 rtx salt = GEN_INT (salt_type);
6140 addr = gen_reg_rtx (mode);
6141 if (mode == DImode)
6142 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6143 else
6145 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6146 addr = convert_memory_address (Pmode, addr);
6148 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6150 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
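/* For reference, a hedged usage sketch of the SSP_SYSREG path (option
   spellings as per the aarch64 -mstack-protector-guard* options):

	gcc -fstack-protector-strong \
	    -mstack-protector-guard=sysreg \
	    -mstack-protector-guard-reg=sp_el0 \
	    -mstack-protector-guard-offset=0

   With these options the canary is read from *(sp_el0 + offset) via the
   system-register patterns above, rather than from the global
   __stack_chk_guard variable.  */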
6153 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6154 that is known to contain PTRUE. */
6156 void
6157 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6159 expand_operand ops[3];
6160 machine_mode mode = GET_MODE (dest);
6161 create_output_operand (&ops[0], dest, mode);
6162 create_input_operand (&ops[1], pred, GET_MODE (pred));
6163 create_input_operand (&ops[2], src, mode);
6164 temporary_volatile_ok v (true);
6165 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6168 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6169 operand is in memory. In this case we need to use the predicated LD1
6170 and ST1 instead of LDR and STR, both for correctness on big-endian
6171 targets and because LD1 and ST1 support a wider range of addressing modes.
6172 PRED_MODE is the mode of the predicate.
6174 See the comment at the head of aarch64-sve.md for details about the
6175 big-endian handling. */
6177 void
6178 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6180 machine_mode mode = GET_MODE (dest);
6181 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6182 if (!register_operand (src, mode)
6183 && !register_operand (dest, mode))
6185 rtx tmp = gen_reg_rtx (mode);
6186 if (MEM_P (src))
6187 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6188 else
6189 emit_move_insn (tmp, src);
6190 src = tmp;
6192 aarch64_emit_sve_pred_move (dest, ptrue, src);
6195 /* Called only on big-endian targets. See whether an SVE vector move
6196 from SRC to DEST is effectively a REV[BHW] instruction, because at
6197 least one operand is a subreg of an SVE vector that has wider or
6198 narrower elements. Return true and emit the instruction if so.
6200 For example:
6202 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6204 represents a VIEW_CONVERT between the following vectors, viewed
6205 in memory order:
6207 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6208 R1: { [0], [1], [2], [3], ... }
6210 The high part of lane X in R2 should therefore correspond to lane X*2
6211 of R1, but the register representations are:
6213 msb lsb
6214 R2: ...... [1].high [1].low [0].high [0].low
6215 R1: ...... [3] [2] [1] [0]
6217 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6218 We therefore need a reverse operation to swap the high and low values
6219 around.
6221 This is purely an optimization. Without it we would spill the
6222 subreg operand to the stack in one mode and reload it in the
6223 other mode, which has the same effect as the REV. */
6225 bool
6226 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6228 gcc_assert (BYTES_BIG_ENDIAN);
6230 /* Do not try to optimize subregs that LRA has created for matched
6231 reloads. These subregs only exist as a temporary measure to make
6232 the RTL well-formed, but they are exempt from the usual
6233 TARGET_CAN_CHANGE_MODE_CLASS rules.
6235 For example, if we have:
6237 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6239 and the constraints require R1 and R2 to be in the same register,
6240 LRA may need to create RTL such as:
6242 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6243 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6244 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6246 which forces both the input and output of the original instruction
6247 to use the same hard register. But for this to work, the normal
6248 rules have to be suppressed on the subreg input, otherwise LRA
6249 would need to reload that input too, meaning that the process
6250 would never terminate. To compensate for this, the normal rules
6251 are also suppressed for the subreg output of the first move.
6252 Ignoring the special case and handling the first move normally
6253 would therefore generate wrong code: we would reverse the elements
6254 for the first subreg but not reverse them back for the second subreg. */
6255 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6256 dest = SUBREG_REG (dest);
6257 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6258 src = SUBREG_REG (src);
6260 /* The optimization handles two single SVE REGs with different element
6261 sizes. */
6262 if (!REG_P (dest)
6263 || !REG_P (src)
6264 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6265 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6266 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6267 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6268 return false;
6270 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6271 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6272 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6273 UNSPEC_REV_SUBREG);
6274 emit_insn (gen_rtx_SET (dest, unspec));
6275 return true;
6278 /* Return a copy of X with mode MODE, without changing its other
6279 attributes. Unlike gen_lowpart, this doesn't care whether the
6280 mode change is valid. */
6283 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6285 if (GET_MODE (x) == mode)
6286 return x;
6288 x = shallow_copy_rtx (x);
6289 set_mode_and_regno (x, mode, REGNO (x));
6290 return x;
6293 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6294 stored in wider integer containers. */
6296 static unsigned int
6297 aarch64_sve_rev_unspec (machine_mode mode)
6299 switch (GET_MODE_UNIT_SIZE (mode))
6301 case 1: return UNSPEC_REVB;
6302 case 2: return UNSPEC_REVH;
6303 case 4: return UNSPEC_REVW;
6305 gcc_unreachable ();
6308 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6309 operands. */
6311 void
6312 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6314 /* Decide which REV operation we need. The mode with wider elements
6315 determines the mode of the operands and the mode with the narrower
6316 elements determines the reverse width. */
6317 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6318 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6319 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6320 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6321 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6323 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6324 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6326 /* Get the operands in the appropriate modes and emit the instruction. */
6327 ptrue = gen_lowpart (pred_mode, ptrue);
6328 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6329 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6330 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6331 dest, ptrue, src));
6334 static bool
6335 aarch64_function_ok_for_sibcall (tree, tree exp)
6337 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6338 return false;
6340 tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
6341 if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
6342 return false;
6343 for (auto state : { "za", "zt0" })
6344 if (bool (aarch64_cfun_shared_flags (state))
6345 != bool (aarch64_fntype_shared_flags (fntype, state)))
6346 return false;
6347 return true;
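/* Hedged illustration of the PSTATE.SM check above: a non-streaming
   caller cannot sibcall a streaming callee, because the call would need
   an SMSTART/SMSTOP transition around it.  Using the ACLE keyword
   attribute (with SME enabled, e.g. -march=armv9-a+sme):

	void callee (void) __arm_streaming;

	void
	caller (void)
	{
	  callee ();   // a normal call, never a tail call
	}

   Mismatches in shared ZA or ZT0 state rule out sibcalls in the same
   way.  */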
6350 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6351 passed in SVE registers. */
6353 static bool
6354 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6355 const function_arg_info &arg)
6357 HOST_WIDE_INT size;
6358 machine_mode dummymode;
6359 int nregs;
6361 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6362 if (arg.mode == BLKmode && arg.type)
6363 size = int_size_in_bytes (arg.type);
6364 else
6365 /* No frontends can create types with variable-sized modes, so we
6366 shouldn't be asked to pass or return them. */
6367 size = GET_MODE_SIZE (arg.mode).to_constant ();
6369 /* Aggregates are passed by reference based on their size. */
6370 if (arg.aggregate_type_p ())
6371 size = int_size_in_bytes (arg.type);
6373 /* Variable sized arguments are always passed by reference. */
6374 if (size < 0)
6375 return true;
6377 /* Can this be a candidate to be passed in fp/simd register(s)? */
6378 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6379 &dummymode, &nregs, NULL,
6380 !pcum || pcum->silent_p))
6381 return false;
6383 /* Arguments which are variable sized or larger than 2 registers are
6384 passed by reference unless they are a homogeneous floating point
6385 aggregate. */
6386 return size > 2 * UNITS_PER_WORD;
6389 /* Implement TARGET_PASS_BY_REFERENCE. */
6391 static bool
6392 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6393 const function_arg_info &arg)
6395 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6397 if (!arg.type)
6398 return aarch64_pass_by_reference_1 (pcum, arg);
6400 pure_scalable_type_info pst_info;
6401 switch (pst_info.analyze (arg.type))
6403 case pure_scalable_type_info::IS_PST:
6404 if (pcum && !pcum->silent_p && !TARGET_SVE)
6405 /* We can't gracefully recover at this point, so make this a
6406 fatal error. */
6407 fatal_error (input_location, "arguments of type %qT require"
6408 " the SVE ISA extension", arg.type);
6410 /* Variadic SVE types are passed by reference. Normal non-variadic
6411 arguments are too if we've run out of registers. */
6412 return (!arg.named
6413 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6414 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6416 case pure_scalable_type_info::DOESNT_MATTER:
6417 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6418 return true;
6420 case pure_scalable_type_info::NO_ABI_IDENTITY:
6421 case pure_scalable_type_info::ISNT_PST:
6422 return aarch64_pass_by_reference_1 (pcum, arg);
6424 gcc_unreachable ();
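/* Hedged AAPCS64 examples of the size-based rule above:

	struct two { long x[2]; };	// 16 bytes: passed by value in x-regs
	struct big { long x[3]; };	// 24 bytes: copied, pointer passed
	struct hfa { double d[4]; };	// HFA: passed by value in d0-d3

   Only BIG takes the pass-by-reference path; the HFA stays by value even
   though it is 32 bytes, because it is an fp/simd candidate.  */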
6427 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6428 static bool
6429 aarch64_return_in_msb (const_tree valtype)
6431 machine_mode dummy_mode;
6432 int dummy_int;
6434 /* Never happens in little-endian mode. */
6435 if (!BYTES_BIG_ENDIAN)
6436 return false;
6438 /* Only composite types smaller than or equal to 16 bytes can
6439 potentially be returned in registers. */
6440 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6441 || int_size_in_bytes (valtype) <= 0
6442 || int_size_in_bytes (valtype) > 16)
6443 return false;
6445 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6446 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6447 is always passed/returned in the least significant bits of fp/simd
6448 register(s). */
6449 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6450 &dummy_mode, &dummy_int, NULL,
6451 false))
6452 return false;
6454 /* Likewise pure scalable types for SVE vector and predicate registers. */
6455 pure_scalable_type_info pst_info;
6456 if (pst_info.analyze_registers (valtype))
6457 return false;
6459 return true;
6462 /* Implement TARGET_FUNCTION_VALUE.
6463 Define how to find the value returned by a function. */
6465 static rtx
6466 aarch64_function_value (const_tree type, const_tree func,
6467 bool outgoing ATTRIBUTE_UNUSED)
6469 machine_mode mode;
6470 int unsignedp;
6472 mode = TYPE_MODE (type);
6473 if (INTEGRAL_TYPE_P (type))
6474 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6476 pure_scalable_type_info pst_info;
6477 if (type && pst_info.analyze_registers (type))
6478 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6480 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6481 are returned in memory, not by value. */
6482 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6483 bool sve_p = (vec_flags & VEC_ANY_SVE);
6485 if (aarch64_return_in_msb (type))
6487 HOST_WIDE_INT size = int_size_in_bytes (type);
6489 if (size % UNITS_PER_WORD != 0)
6491 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6492 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6496 int count;
6497 machine_mode ag_mode;
6498 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6499 NULL, false))
6501 gcc_assert (!sve_p);
6502 if (!aarch64_composite_type_p (type, mode))
6504 gcc_assert (count == 1 && mode == ag_mode);
6505 return gen_rtx_REG (mode, V0_REGNUM);
6507 else if (aarch64_advsimd_full_struct_mode_p (mode)
6508 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6509 return gen_rtx_REG (mode, V0_REGNUM);
6510 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6511 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6512 return gen_rtx_REG (mode, V0_REGNUM);
6513 else
6515 int i;
6516 rtx par;
6518 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6519 for (i = 0; i < count; i++)
6521 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6522 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6523 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6524 XVECEXP (par, 0, i) = tmp;
6526 return par;
6529 else
6531 if (sve_p)
6533 /* Vector types can acquire a partial SVE mode using things like
6534 __attribute__((vector_size(N))), and this is potentially useful.
6535 However, the choice of mode doesn't affect the type's ABI
6536 identity, so we should treat the types as though they had
6537 the associated integer mode, just like they did before SVE
6538 was introduced.
6540 We know that the vector must be 128 bits or smaller,
6541 otherwise we'd have returned it in memory instead. */
6542 gcc_assert (type
6543 && (aarch64_some_values_include_pst_objects_p (type)
6544 || (vec_flags & VEC_PARTIAL)));
6546 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6547 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6548 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6549 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6551 return gen_rtx_REG (mode, R0_REGNUM);
6555 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6556 Return true if REGNO is the number of a hard register in which the values
6557 of called function may come back. */
6559 static bool
6560 aarch64_function_value_regno_p (const unsigned int regno)
6562 /* Maximum of 16 bytes can be returned in the general registers. Examples
6563 of 16-byte return values are: 128-bit integers and 16-byte small
6564 structures (excluding homogeneous floating-point aggregates). */
6565 if (regno == R0_REGNUM || regno == R1_REGNUM)
6566 return true;
6568 /* Up to four fp/simd registers can return a function value, e.g. a
6569 homogeneous floating-point aggregate having four members. */
6570 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6571 return TARGET_FLOAT;
6573 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
6574 return TARGET_SVE;
6576 return false;
6579 /* Subroutine for aarch64_return_in_memory for types that are not returned
6580 in SVE registers. */
6582 static bool
6583 aarch64_return_in_memory_1 (const_tree type)
6585 HOST_WIDE_INT size;
6586 machine_mode ag_mode;
6587 int count;
6589 if (!AGGREGATE_TYPE_P (type)
6590 && TREE_CODE (type) != BITINT_TYPE
6591 && TREE_CODE (type) != COMPLEX_TYPE
6592 && TREE_CODE (type) != VECTOR_TYPE)
6593 /* Simple scalar types are always returned in registers. */
6594 return false;
6596 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6597 &ag_mode, &count, NULL, false))
6598 return false;
6600 /* Types larger than 2 registers are returned in memory. */
6601 size = int_size_in_bytes (type);
6602 return (size < 0 || size > 2 * UNITS_PER_WORD);
6605 /* Implement TARGET_RETURN_IN_MEMORY.
6607 If the type T of the result of a function is such that
6608 void func (T arg)
6609 would require that arg be passed as a value in a register (or set of
6610 registers) according to the parameter passing rules, then the result
6611 is returned in the same registers as would be used for such an
6612 argument. */
6614 static bool
6615 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6617 pure_scalable_type_info pst_info;
6618 switch (pst_info.analyze (type))
6620 case pure_scalable_type_info::IS_PST:
6621 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6622 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6624 case pure_scalable_type_info::DOESNT_MATTER:
6625 gcc_assert (aarch64_return_in_memory_1 (type));
6626 return true;
6628 case pure_scalable_type_info::NO_ABI_IDENTITY:
6629 case pure_scalable_type_info::ISNT_PST:
6630 return aarch64_return_in_memory_1 (type);
6632 gcc_unreachable ();
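/* Hedged examples of the return rules implemented above:

	struct s16 { long a, b; };	// 16 bytes: returned in x0/x1
	struct s24 { long a, b, c; };	// 24 bytes: returned in memory via x8
	struct hfa { float f[4]; };	// HFA: returned in s0-s3

   The x8 (indirect result) case corresponds to aarch64_return_in_memory_1
   returning true.  */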
6635 static bool
6636 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6637 const_tree type, int *nregs)
6639 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6640 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6641 &pcum->aapcs_vfp_rmode,
6642 nregs, NULL, pcum->silent_p);
6645 /* Given MODE and TYPE of a function argument, return the alignment in
6646 bits. The idea is to suppress any stronger alignment requested by
6647 the user and opt for the natural alignment (specified in AAPCS64 \S
6648 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6649 was incorrectly calculated in versions of GCC prior to GCC 9.
6650 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6651 calculated in versions between GCC 9 and GCC 13. If the alignment
6652 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6653 is the old GCC 13 alignment, otherwise it is zero.
6655 This is a helper function for local use only. */
6657 static unsigned int
6658 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6659 unsigned int *abi_break_gcc_9,
6660 unsigned int *abi_break_gcc_13,
6661 unsigned int *abi_break_gcc_14)
6663 *abi_break_gcc_9 = 0;
6664 *abi_break_gcc_13 = 0;
6665 *abi_break_gcc_14 = 0;
6666 if (!type)
6667 return GET_MODE_ALIGNMENT (mode);
6669 if (integer_zerop (TYPE_SIZE (type)))
6670 return 0;
6672 gcc_assert (TYPE_MODE (type) == mode);
6674 if (!AGGREGATE_TYPE_P (type))
6676 /* The ABI alignment is the natural alignment of the type, without
6677 any attributes applied. Normally this is the alignment of the
6678 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6679 For now we just handle the known exceptions explicitly. */
6680 type = TYPE_MAIN_VARIANT (type);
6681 if (POINTER_TYPE_P (type))
6683 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
6684 return POINTER_SIZE;
6686 if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
6688 *abi_break_gcc_14 = TYPE_ALIGN (type);
6689 type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
6691 gcc_assert (!TYPE_USER_ALIGN (type));
6692 return TYPE_ALIGN (type);
6695 if (TREE_CODE (type) == ARRAY_TYPE)
6696 return TYPE_ALIGN (TREE_TYPE (type));
6698 unsigned int alignment = 0;
6699 unsigned int bitfield_alignment_with_packed = 0;
6700 unsigned int bitfield_alignment = 0;
6701 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6702 if (TREE_CODE (field) == FIELD_DECL)
6704 /* Note that we explicitly consider zero-sized fields here,
6705 even though they don't map to AAPCS64 machine types.
6706 For example, in:
6708 struct __attribute__((aligned(8))) empty {};
6710 struct s {
6711 [[no_unique_address]] empty e;
6712 int x;
6715 "s" contains only one Fundamental Data Type (the int field)
6716 but gains 8-byte alignment and size thanks to "e". */
6717 alignment = std::max (alignment, DECL_ALIGN (field));
6718 if (DECL_BIT_FIELD_TYPE (field))
6720 /* Take the bit-field type's alignment into account only
6721 if the user didn't reduce this field's alignment with
6722 the packed attribute. */
6723 if (!DECL_PACKED (field))
6724 bitfield_alignment
6725 = std::max (bitfield_alignment,
6726 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6728 /* Compute the alignment even if the bit-field is
6729 packed, so that we can emit a warning in case the
6730 alignment changed between GCC versions. */
6731 bitfield_alignment_with_packed
6732 = std::max (bitfield_alignment_with_packed,
6733 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6737 /* Emit a warning if the alignment is different when taking the
6738 'packed' attribute into account. */
6739 if (bitfield_alignment != bitfield_alignment_with_packed
6740 && bitfield_alignment_with_packed > alignment)
6741 *abi_break_gcc_13 = bitfield_alignment_with_packed;
6743 if (bitfield_alignment > alignment)
6745 *abi_break_gcc_9 = alignment;
6746 return bitfield_alignment;
6749 return alignment;
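/* Hedged sketch of the cases the ABI_BREAK_* outputs track (the exact
   layouts are what -Wpsabi reports, so treat this only as orientation):

	struct s { __int128 x : 1; };

   Releases before GCC 9.1 ignored the declared __int128 type of X and so
   could compute too small an alignment for S (ABI_BREAK_GCC_9).  If X is
   additionally marked __attribute__((packed)), GCC 9 to 12 still counted
   the underlying type despite the packing, which GCC 13.1 stopped doing
   (ABI_BREAK_GCC_13).  ABI_BREAK_GCC_14 covers the ENUMERAL_TYPE handling
   above, where GCC 14.1 switched to using the alignment of the enum's
   underlying type.  */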
6752 /* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
6753 _BitInt(N) type. These include ARRAY_TYPE's with an element that is a
6754 _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
6755 with a field member that is a _BitInt(N) or an aggregate that uses it.
6756 Return false otherwise. */
6758 static bool
6759 bitint_or_aggr_of_bitint_p (tree type)
6761 if (!type)
6762 return false;
6764 if (TREE_CODE (type) == BITINT_TYPE)
6765 return true;
6767 /* If ARRAY_TYPE, check its element type. */
6768 if (TREE_CODE (type) == ARRAY_TYPE)
6769 return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));
6771 /* If RECORD_TYPE or UNION_TYPE, check the fields' types. */
6772 if (RECORD_OR_UNION_TYPE_P (type))
6773 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
6775 if (TREE_CODE (field) != FIELD_DECL)
6776 continue;
6777 if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
6778 return true;
6780 return false;
6783 /* Layout a function argument according to the AAPCS64 rules. The rule
6784 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6785 mode that was originally given to us by the target hook, whereas the
6786 mode in ARG might be the result of replacing partial SVE modes with
6787 the equivalent integer mode. */
6789 static void
6790 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6792 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6793 tree type = arg.type;
6794 machine_mode mode = arg.mode;
6795 int ncrn, nvrn, nregs;
6796 bool allocate_ncrn, allocate_nvrn;
6797 HOST_WIDE_INT size;
6798 unsigned int abi_break_gcc_9;
6799 unsigned int abi_break_gcc_13;
6800 unsigned int abi_break_gcc_14;
6802 /* We need to do this once per argument. */
6803 if (pcum->aapcs_arg_processed)
6804 return;
6806 bool warn_pcs_change
6807 = (warn_psabi
6808 && !pcum->silent_p
6809 && (currently_expanding_function_start
6810 || currently_expanding_gimple_stmt));
6812 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6814 typedef struct foo {
6815 __Int8x16_t foo[2] __attribute__((aligned(32)));
6816 } foo;
6818 is still a HVA despite its larger-than-normal alignment.
6819 However, such over-aligned HFAs and HVAs are guaranteed to have
6820 no padding.
6822 If we exclude HFAs and HVAs from the discussion below, then there
6823 are several things to note:
6825 - Both the C and AAPCS64 interpretations of a type's alignment should
6826 give a value that is no greater than the type's size.
6828 - Types bigger than 16 bytes are passed indirectly.
6830 - If an argument of type T is passed indirectly, TYPE and MODE describe
6831 a pointer to T rather than T itself.
6833 It follows that the AAPCS64 alignment of TYPE must be no greater
6834 than 16 bytes.
6836 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6837 and so could calculate an alignment that was too small. If this
6838 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6840 Although GCC 9.1 fixed that bug, it introduced a different one:
6841 it would consider the alignment of a bitfield's underlying type even
6842 if the field was packed (which should have the effect of overriding
6843 the alignment of the underlying type). This was fixed in GCC 13.1.
6845 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6846 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6847 this older, too-big alignment.
6849 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6850 alignments meant they could calculate type alignments that were
6851 bigger than the type's size, contrary to the assumption above.
6852 The handling of register arguments was nevertheless (and justifiably)
6853 written to follow the assumption that the alignment can never be
6854 greater than the size. The same was not true for stack arguments;
6855 their alignment was instead handled by MIN bounds in
6856 aarch64_function_arg_boundary.
6858 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6859 an alignment of more than 16 bytes for TYPE then:
6861 - If the argument was passed in registers, these GCC versions
6862 would treat the alignment as though it was *less than* 16 bytes.
6864 - If the argument was passed on the stack, these GCC versions
6865 would treat the alignment as though it was *equal to* 16 bytes.
6867 Both behaviors were wrong, but in different cases. */
6869 pcum->aapcs_arg_processed = true;
6871 pure_scalable_type_info pst_info;
6872 if (type && pst_info.analyze_registers (type))
6874 /* aarch64_function_arg_alignment has never had an effect on
6875 this case. */
6877 /* The PCS says that it is invalid to pass an SVE value to an
6878 unprototyped function. There is no ABI-defined location we
6879 can return in this case, so we have no real choice but to raise
6880 an error immediately, even though this is only a query function. */
6881 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6883 gcc_assert (!pcum->silent_p);
6884 error ("SVE type %qT cannot be passed to an unprototyped function",
6885 arg.type);
6886 /* Avoid repeating the message, and avoid tripping the assert
6887 below. */
6888 pcum->pcs_variant = ARM_PCS_SVE;
6891 /* We would have converted the argument into pass-by-reference
6892 form if it didn't fit in registers. */
6893 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6894 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6895 gcc_assert (arg.named
6896 && pcum->pcs_variant == ARM_PCS_SVE
6897 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6898 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6899 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6900 P0_REGNUM + pcum->aapcs_nprn);
6901 return;
6904 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6905 are passed by reference, not by value. */
6906 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6907 bool sve_p = (vec_flags & VEC_ANY_SVE);
6908 if (sve_p)
6909 /* Vector types can acquire a partial SVE mode using things like
6910 __attribute__((vector_size(N))), and this is potentially useful.
6911 However, the choice of mode doesn't affect the type's ABI
6912 identity, so we should treat the types as though they had
6913 the associated integer mode, just like they did before SVE
6914 was introduced.
6916 We know that the vector must be 128 bits or smaller,
6917 otherwise we'd have passed it in memory instead. */
6918 gcc_assert (type
6919 && (aarch64_some_values_include_pst_objects_p (type)
6920 || (vec_flags & VEC_PARTIAL)));
6922 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
6923 if (type)
6924 size = int_size_in_bytes (type);
6925 else
6926 /* No frontends can create types with variable-sized modes, so we
6927 shouldn't be asked to pass or return them. */
6928 size = GET_MODE_SIZE (mode).to_constant ();
6929 size = ROUND_UP (size, UNITS_PER_WORD);
6931 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6932 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6933 mode,
6934 type,
6935 &nregs);
6936 gcc_assert (!sve_p || !allocate_nvrn);
6938 unsigned int alignment
6939 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
6940 &abi_break_gcc_13, &abi_break_gcc_14);
6942 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
6943 && (!alignment || abi_break_gcc_9 < alignment)
6944 && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
6946 /* _BitInt(N) was only added in GCC 14. */
6947 bool warn_pcs_change_le_gcc14
6948 = warn_pcs_change && !bitint_or_aggr_of_bitint_p (type);
6950 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6951 The following code thus handles passing by SIMD/FP registers first. */
6953 nvrn = pcum->aapcs_nvrn;
6955 /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
6956 and homogeneous short-vector aggregates (HVA). */
6957 if (allocate_nvrn)
6959 /* aarch64_function_arg_alignment has never had an effect on
6960 this case. */
6961 if (!pcum->silent_p && !TARGET_FLOAT)
6962 aarch64_err_no_fpadvsimd (mode);
6964 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6966 pcum->aapcs_nextnvrn = nvrn + nregs;
6967 if (!aarch64_composite_type_p (type, mode))
6969 gcc_assert (nregs == 1);
6970 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6972 else if (aarch64_advsimd_full_struct_mode_p (mode)
6973 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
6974 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6975 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6976 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
6977 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6978 else
6980 rtx par;
6981 int i;
6982 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6983 for (i = 0; i < nregs; i++)
6985 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6986 V0_REGNUM + nvrn + i);
6987 rtx offset = gen_int_mode
6988 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6989 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6990 XVECEXP (par, 0, i) = tmp;
6992 pcum->aapcs_reg = par;
6994 return;
6996 else
6998 /* C.3 NSRN is set to 8. */
6999 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7000 goto on_stack;
7004 ncrn = pcum->aapcs_ncrn;
7005 nregs = size / UNITS_PER_WORD;
7007 /* C6 - C9, though the sign and zero extension semantics are
7008 handled elsewhere. This is the case where the argument fits
7009 entirely in general registers. */
7010 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7012 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7014 /* C.8 if the argument has an alignment of 16 then the NGRN is
7015 rounded up to the next even number. */
7016 if (nregs == 2
7017 && ncrn % 2)
7019 /* Emit a warning if the alignment changed when taking the
7020 'packed' attribute into account. */
7021 if (warn_pcs_change_le_gcc14
7022 && abi_break_gcc_13
7023 && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
7024 != (alignment == 16 * BITS_PER_UNIT)))
7025 inform (input_location, "parameter passing for argument of type "
7026 "%qT changed in GCC 13.1", type);
7028 if (warn_pcs_change_le_gcc14
7029 && abi_break_gcc_14
7030 && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
7031 != (alignment == 16 * BITS_PER_UNIT)))
7032 inform (input_location, "parameter passing for argument of type "
7033 "%qT changed in GCC 14.1", type);
7035 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7036 comparison is there because for > 16 * BITS_PER_UNIT
7037 alignment nregs should be > 2 and therefore it should be
7038 passed by reference rather than value. */
7039 if (alignment == 16 * BITS_PER_UNIT)
7041 if (warn_pcs_change_le_gcc14
7042 && abi_break_gcc_9)
7043 inform (input_location, "parameter passing for argument of type "
7044 "%qT changed in GCC 9.1", type);
7045 ++ncrn;
7046 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7050 /* If an argument with an SVE mode needs to be shifted up to the
7051 high part of the register, treat it as though it had an integer mode.
7052 Using the normal (parallel [...]) would suppress the shifting. */
7053 if (sve_p
7054 && BYTES_BIG_ENDIAN
7055 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7056 && aarch64_pad_reg_upward (mode, type, false))
7058 mode = int_mode_for_mode (mode).require ();
7059 sve_p = false;
7062 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7063 A reg is still generated for it, but the caller should be smart
7064 enough not to use it. */
7065 if (nregs == 0
7066 || (nregs == 1 && !sve_p)
7067 || GET_MODE_CLASS (mode) == MODE_INT)
7068 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7069 else
7071 rtx par;
7072 int i;
7074 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7075 for (i = 0; i < nregs; i++)
7077 scalar_int_mode reg_mode = word_mode;
7078 if (nregs == 1)
7079 reg_mode = int_mode_for_mode (mode).require ();
7080 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7081 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7082 GEN_INT (i * UNITS_PER_WORD));
7083 XVECEXP (par, 0, i) = tmp;
7085 pcum->aapcs_reg = par;
7088 pcum->aapcs_nextncrn = ncrn + nregs;
7089 return;
7092 /* C.11 */
7093 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7095 /* The argument is passed on stack; record the needed number of words for
7096 this argument and align the total size if necessary. */
7097 on_stack:
7098 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7100 if (warn_pcs_change_le_gcc14
7101 && abi_break_gcc_13
7102 && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
7103 != (alignment >= 16 * BITS_PER_UNIT)))
7104 inform (input_location, "parameter passing for argument of type "
7105 "%qT changed in GCC 13.1", type);
7107 if (warn_pcs_change_le_gcc14
7108 && abi_break_gcc_14
7109 && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
7110 != (alignment >= 16 * BITS_PER_UNIT)))
7111 inform (input_location, "parameter passing for argument of type "
7112 "%qT changed in GCC 14.1", type);
7114 if (alignment == 16 * BITS_PER_UNIT)
7116 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7117 if (pcum->aapcs_stack_size != new_size)
7119 if (warn_pcs_change_le_gcc14
7120 && abi_break_gcc_9)
7121 inform (input_location, "parameter passing for argument of type "
7122 "%qT changed in GCC 9.1", type);
7123 pcum->aapcs_stack_size = new_size;
7126 return;
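/* Rough arithmetic for the rounding above: aapcs_stack_size is counted
   in words, and with UNITS_PER_WORD == 8 the ROUND_UP to
   16 / UNITS_PER_WORD == 2 words gives 16-byte alignment.  For example,
   if three words (24 bytes) of stack arguments have already been
   allocated and the next argument needs 16-byte alignment, the count is
   rounded up to four words before the argument is placed.  */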
7129 /* Add the current argument register to the set of those that need
7130 to be saved and restored around a change to PSTATE.SM. */
7132 static void
7133 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7135 subrtx_var_iterator::array_type array;
7136 FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
7138 rtx x = *iter;
7139 if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
7141 unsigned int i = pcum->num_sme_mode_switch_args++;
7142 gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
7143 pcum->sme_mode_switch_args[i] = x;
7148 /* Return a parallel that contains all the registers that need to be
7149 saved around a change to PSTATE.SM. Return const0_rtx if there is
7150 no such mode switch, or if no registers need to be saved. */
7152 static rtx
7153 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7155 if (!pcum->num_sme_mode_switch_args)
7156 return const0_rtx;
7158 auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
7159 pcum->sme_mode_switch_args);
7160 return gen_rtx_PARALLEL (VOIDmode, argvec);
7163 /* Implement TARGET_FUNCTION_ARG. */
7165 static rtx
7166 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7168 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7169 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7170 || pcum->pcs_variant == ARM_PCS_SIMD
7171 || pcum->pcs_variant == ARM_PCS_SVE);
7173 if (arg.end_marker_p ())
7175 rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
7176 pcum->pcs_variant);
7177 rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
7178 rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
7179 rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
7180 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
7181 sme_mode_switch_args,
7182 shared_za_flags,
7183 shared_zt0_flags));
7186 aarch64_layout_arg (pcum_v, arg);
7187 return pcum->aapcs_reg;
7190 void
7191 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7192 const_tree fntype,
7193 rtx libname ATTRIBUTE_UNUSED,
7194 const_tree fndecl,
7195 unsigned n_named ATTRIBUTE_UNUSED,
7196 bool silent_p)
7198 pcum->aapcs_ncrn = 0;
7199 pcum->aapcs_nvrn = 0;
7200 pcum->aapcs_nprn = 0;
7201 pcum->aapcs_nextncrn = 0;
7202 pcum->aapcs_nextnvrn = 0;
7203 pcum->aapcs_nextnprn = 0;
7204 if (fntype)
7206 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7207 pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
7209 else
7211 pcum->pcs_variant = ARM_PCS_AAPCS64;
7212 pcum->isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
7214 pcum->aapcs_reg = NULL_RTX;
7215 pcum->aapcs_arg_processed = false;
7216 pcum->aapcs_stack_words = 0;
7217 pcum->aapcs_stack_size = 0;
7218 pcum->silent_p = silent_p;
7219 pcum->shared_za_flags
7220 = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
7221 pcum->shared_zt0_flags
7222 = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
7223 pcum->num_sme_mode_switch_args = 0;
7225 if (!silent_p
7226 && !TARGET_FLOAT
7227 && fntype && fntype != error_mark_node)
7229 const_tree type = TREE_TYPE (fntype);
7230 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7231 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7232 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7233 &mode, &nregs, NULL, false))
7234 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7237 if (!silent_p
7238 && !TARGET_SVE
7239 && pcum->pcs_variant == ARM_PCS_SVE)
7241 /* We can't gracefully recover at this point, so make this a
7242 fatal error. */
7243 if (fndecl)
7244 fatal_error (input_location, "%qE requires the SVE ISA extension",
7245 fndecl);
7246 else
7247 fatal_error (input_location, "calls to functions of type %qT require"
7248 " the SVE ISA extension", fntype);
7252 static void
7253 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7254 const function_arg_info &arg)
7256 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7257 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7258 || pcum->pcs_variant == ARM_PCS_SIMD
7259 || pcum->pcs_variant == ARM_PCS_SVE)
7261 aarch64_layout_arg (pcum_v, arg);
7262 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7263 != (pcum->aapcs_stack_words != 0));
7264 if (pcum->aapcs_reg
7265 && aarch64_call_switches_pstate_sm (pcum->isa_mode))
7266 aarch64_record_sme_mode_switch_args (pcum);
7268 pcum->aapcs_arg_processed = false;
7269 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7270 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7271 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7272 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7273 pcum->aapcs_stack_words = 0;
7274 pcum->aapcs_reg = NULL_RTX;
7278 bool
7279 aarch64_function_arg_regno_p (unsigned regno)
7281 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7282 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7283 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7286 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7287 PARM_BOUNDARY bits of alignment, but will be given anything up
7288 to STACK_BOUNDARY bits if the type requires it. This makes sure
7289 that both before and after the layout of each argument, the Next
7290 Stacked Argument Address (NSAA) will have a minimum alignment of
7291 8 bytes. */
7293 static unsigned int
7294 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7296 unsigned int abi_break_gcc_9;
7297 unsigned int abi_break_gcc_13;
7298 unsigned int abi_break_gcc_14;
7299 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7300 &abi_break_gcc_9,
7301 &abi_break_gcc_13,
7302 &abi_break_gcc_14);
7303 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7304 to emit warnings about ABI incompatibility. */
7305 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7306 return alignment;
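/* A small example of the clamping above, assuming the usual values of
   PARM_BOUNDARY (64 bits) and STACK_BOUNDARY (128 bits) for this
   target: a plain 'int' argument is raised to 64-bit alignment, while
   an over-aligned type requesting, say, 256-bit alignment is capped at
   128 bits.  */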
7309 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7311 static fixed_size_mode
7312 aarch64_get_reg_raw_mode (int regno)
7314 /* Don't use any non-GP registers for __builtin_apply and
7315 __builtin_return if general-registers-only mode is requested. */
7316 if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
7317 return as_a <fixed_size_mode> (VOIDmode);
7318 if (TARGET_SVE && FP_REGNUM_P (regno))
7319 /* Don't use the SVE part of the register for __builtin_apply and
7320 __builtin_return. The SVE registers aren't used by the normal PCS,
7321 so using them there would be a waste of time. The PCS extensions
7322 for SVE types are fundamentally incompatible with the
7323 __builtin_return/__builtin_apply interface. */
7324 return as_a <fixed_size_mode> (V16QImode);
7325 if (PR_REGNUM_P (regno))
7326 /* For SVE PR regs, indicate that they should be ignored for
7327 __builtin_apply/__builtin_return. */
7328 return as_a <fixed_size_mode> (VOIDmode);
7329 return default_get_reg_raw_mode (regno);
7332 /* Implement TARGET_FUNCTION_ARG_PADDING.
7334 Small aggregate types are placed in the lowest memory address.
7336 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7338 static pad_direction
7339 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7341 /* On little-endian targets, the least significant byte of every stack
7342 argument is passed at the lowest byte address of the stack slot. */
7343 if (!BYTES_BIG_ENDIAN)
7344 return PAD_UPWARD;
7346 /* Otherwise, integral, floating-point and pointer types are padded downward:
7347 the least significant byte of a stack argument is passed at the highest
7348 byte address of the stack slot. */
7349 if (type
7350 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7351 || POINTER_TYPE_P (type))
7352 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7353 return PAD_DOWNWARD;
7355 /* Everything else is padded upward, i.e. the data starts at the first byte of the stack slot. */
7356 return PAD_UPWARD;
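/* For example, on a big-endian target a 'short' passed on the stack is
   padded downward, so its two data bytes occupy the highest-addressed
   bytes of the slot, whereas a small structure is padded upward and
   starts at the lowest address of the slot.  On little-endian targets
   everything is padded upward.  */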
7359 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7361 It specifies the padding for the last (and possibly the only)
7362 element of a block move between registers and memory. Viewing
7363 the block as it appears in memory, padding upward means that
7364 the last element is padded after its most significant byte,
7365 while with downward padding the last element is padded on its
7366 least significant byte side.
7368 Small aggregates and small complex types are always padded
7369 upwards.
7371 We don't need to worry about homogeneous floating-point or
7372 short-vector aggregates; their move is not affected by the
7373 padding direction determined here. Regardless of endianness,
7374 each element of such an aggregate is put in the least
7375 significant bits of a fp/simd register.
7377 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7378 register has useful data, and return the opposite if the most
7379 significant byte does. */
7381 bool
7382 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7383 bool first ATTRIBUTE_UNUSED)
7386 /* Aside from pure scalable types, small composite types are always
7387 padded upward. */
7388 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7390 HOST_WIDE_INT size;
7391 if (type)
7392 size = int_size_in_bytes (type);
7393 else
7394 /* No frontends can create types with variable-sized modes, so we
7395 shouldn't be asked to pass or return them. */
7396 size = GET_MODE_SIZE (mode).to_constant ();
7397 if (size < 2 * UNITS_PER_WORD)
7399 pure_scalable_type_info pst_info;
7400 if (pst_info.analyze_registers (type))
7401 return false;
7402 return true;
7406 /* Otherwise, use the default padding. */
7407 return !BYTES_BIG_ENDIAN;
7410 static scalar_int_mode
7411 aarch64_libgcc_cmp_return_mode (void)
7413 return SImode;
7416 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7418 /* We use the 12-bit shifted immediate arithmetic instructions, so values
7419 must be a multiple of (1 << 12), i.e. 4096. */
7420 #define ARITH_FACTOR 4096
7422 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7423 #error Cannot use simple address calculation for stack probing
7424 #endif
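/* In other words, since ARITH_FACTOR is 4096, the check above requires
   PROBE_INTERVAL to be a multiple of 4096, i.e.
   STACK_CHECK_PROBE_INTERVAL_EXP must be at least 12; otherwise the
   probe offsets could not be encoded as 12-bit shifted immediates.  */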
7426 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7427 inclusive. These are offsets from the current stack pointer. */
7429 static void
7430 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7432 HOST_WIDE_INT size;
7433 if (!poly_size.is_constant (&size))
7435 sorry ("stack probes for SVE frames");
7436 return;
7439 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7441 /* See the same assertion on PROBE_INTERVAL above. */
7442 gcc_assert ((first % ARITH_FACTOR) == 0);
7444 /* See if we have a constant small number of probes to generate. If so,
7445 that's the easy case. */
7446 if (size <= PROBE_INTERVAL)
7448 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7450 emit_set_insn (reg1,
7451 plus_constant (Pmode,
7452 stack_pointer_rtx, -(first + base)));
7453 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
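/* Worked example (assuming, for illustration, PROBE_INTERVAL == 4096):
   for FIRST == 16 and SIZE == 4000, BASE is rounded up to 4096, REG1
   becomes SP - 4112, and the probe lands at REG1 + 96 == SP - 4016,
   i.e. exactly FIRST + SIZE bytes below the stack pointer.  */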
7456 /* The run-time loop is made up of 8 insns in the generic case while the
7457 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
7458 else if (size <= 4 * PROBE_INTERVAL)
7460 HOST_WIDE_INT i, rem;
7462 emit_set_insn (reg1,
7463 plus_constant (Pmode,
7464 stack_pointer_rtx,
7465 -(first + PROBE_INTERVAL)));
7466 emit_stack_probe (reg1);
7468 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7469 it exceeds SIZE. If only two probes are needed, this will not
7470 generate any code. Then probe at FIRST + SIZE. */
7471 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7473 emit_set_insn (reg1,
7474 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7475 emit_stack_probe (reg1);
7478 rem = size - (i - PROBE_INTERVAL);
7479 if (rem > 256)
7481 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7483 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7484 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7486 else
7487 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7490 /* Otherwise, do the same as above, but in a loop. Note that we must be
7491 extra careful with variables wrapping around because we might be at
7492 the very top (or the very bottom) of the address space and we have
7493 to be able to handle this case properly; in particular, we use an
7494 equality test for the loop condition. */
7495 else
7497 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7499 /* Step 1: round SIZE to the previous multiple of the interval. */
7501 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7504 /* Step 2: compute initial and final value of the loop counter. */
7506 /* TEST_ADDR = SP + FIRST. */
7507 emit_set_insn (reg1,
7508 plus_constant (Pmode, stack_pointer_rtx, -first));
7510 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7511 HOST_WIDE_INT adjustment = - (first + rounded_size);
7512 if (! aarch64_uimm12_shift (adjustment))
7514 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7515 true, Pmode);
7516 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7518 else
7519 emit_set_insn (reg2,
7520 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7522 /* Step 3: the loop
7526 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7527 probe at TEST_ADDR
7529 while (TEST_ADDR != LAST_ADDR)
7531 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7532 until it is equal to ROUNDED_SIZE. */
7534 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7537 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7538 that SIZE is equal to ROUNDED_SIZE. */
7540 if (size != rounded_size)
7542 HOST_WIDE_INT rem = size - rounded_size;
7544 if (rem > 256)
7546 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7548 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7549 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7551 else
7552 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7556 /* Make sure nothing is scheduled before we are done. */
7557 emit_insn (gen_blockage ());
7560 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7561 absolute addresses. */
7563 const char *
7564 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7566 static int labelno = 0;
7567 char loop_lab[32];
7568 rtx xops[2];
7570 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7572 /* Loop. */
7573 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7575 HOST_WIDE_INT stack_clash_probe_interval
7576 = 1 << param_stack_clash_protection_guard_size;
7578 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7579 xops[0] = reg1;
7580 HOST_WIDE_INT interval;
7581 if (flag_stack_clash_protection)
7582 interval = stack_clash_probe_interval;
7583 else
7584 interval = PROBE_INTERVAL;
7586 gcc_assert (aarch64_uimm12_shift (interval));
7587 xops[1] = GEN_INT (interval);
7589 output_asm_insn ("sub\t%0, %0, %1", xops);
7591 /* If doing stack clash protection then we probe up by the ABI-specified
7592 amount. We do this because we're dropping full pages at a time in the
7593 loop. But if we're doing non-stack-clash probing, probe at offset 0 instead. */
7594 if (flag_stack_clash_protection)
7595 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7596 else
7597 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7599 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7600 by this amount for each iteration. */
7601 output_asm_insn ("str\txzr, [%0, %1]", xops);
7603 /* Test if TEST_ADDR == LAST_ADDR. */
7604 xops[1] = reg2;
7605 output_asm_insn ("cmp\t%0, %1", xops);
7607 /* Branch. */
7608 fputs ("\tb.ne\t", asm_out_file);
7609 assemble_name_raw (asm_out_file, loop_lab);
7610 fputc ('\n', asm_out_file);
7612 return "";
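/* The emitted loop has roughly the following shape (register names,
   the label number and the probe offset are placeholders; the offset
   depends on whether stack-clash protection is enabled):

	.LPSRL0:
		sub	x9, x9, #interval
		str	xzr, [x9, #offset]
		cmp	x9, x10
		b.ne	.LPSRL0
*/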
7615 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7616 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7617 of GUARD_SIZE. When a probe is emitted it is done at most
7618 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7619 at most MIN_PROBE_THRESHOLD. By the end of this function
7620 BASE = BASE - ADJUSTMENT. */
7622 const char *
7623 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7624 rtx min_probe_threshold, rtx guard_size)
7626 /* This function is not allowed to use any instruction generation function
7627 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7628 so instead emit the code you want using output_asm_insn. */
7629 gcc_assert (flag_stack_clash_protection);
7630 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7631 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7633 /* The minimum required allocation before the residual requires probing. */
7634 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7636 /* Clamp the value down to the nearest value that can be used with a cmp. */
7637 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7638 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7640 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7641 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7643 static int labelno = 0;
7644 char loop_start_lab[32];
7645 char loop_end_lab[32];
7646 rtx xops[2];
7648 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7649 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7651 /* Emit loop start label. */
7652 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7654 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7655 xops[0] = adjustment;
7656 xops[1] = probe_offset_value_rtx;
7657 output_asm_insn ("cmp\t%0, %1", xops);
7659 /* Branch to end if not enough adjustment to probe. */
7660 fputs ("\tb.lt\t", asm_out_file);
7661 assemble_name_raw (asm_out_file, loop_end_lab);
7662 fputc ('\n', asm_out_file);
7664 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7665 xops[0] = base;
7666 xops[1] = probe_offset_value_rtx;
7667 output_asm_insn ("sub\t%0, %0, %1", xops);
7669 /* Probe at BASE. */
7670 xops[1] = const0_rtx;
7671 output_asm_insn ("str\txzr, [%0, %1]", xops);
7673 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7674 xops[0] = adjustment;
7675 xops[1] = probe_offset_value_rtx;
7676 output_asm_insn ("sub\t%0, %0, %1", xops);
7678 /* Branch to start if still more bytes to allocate. */
7679 fputs ("\tb\t", asm_out_file);
7680 assemble_name_raw (asm_out_file, loop_start_lab);
7681 fputc ('\n', asm_out_file);
7683 /* Exit label: the remaining adjustment is too small to need a probe. */
7684 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7686 /* BASE = BASE - ADJUSTMENT. */
7687 xops[0] = base;
7688 xops[1] = adjustment;
7689 output_asm_insn ("sub\t%0, %0, %1", xops);
7690 return "";
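/* The emitted sequence has roughly the following shape (operand names
   and label numbers are symbolic placeholders):

	.SVLPSPL0:
		cmp	adjustment, guard
		b.lt	.SVLPEND0
		sub	base, base, guard
		str	xzr, [base, #0]
		sub	adjustment, adjustment, guard
		b	.SVLPSPL0
	.SVLPEND0:
		sub	base, base, adjustment
*/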
7693 /* Determine whether a frame chain needs to be generated. */
7694 static bool
7695 aarch64_needs_frame_chain (void)
7697 if (frame_pointer_needed)
7698 return true;
7700 /* A leaf function cannot have calls or write LR. */
7701 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7703 /* Don't use a frame chain in leaf functions if leaf frame pointers
7704 are disabled. */
7705 if (flag_omit_leaf_frame_pointer && is_leaf)
7706 return false;
7708 return aarch64_use_frame_pointer;
7711 /* Return true if the current function should save registers above
7712 the locals area, rather than below it. */
7714 static bool
7715 aarch64_save_regs_above_locals_p ()
7717 /* When using stack smash protection, make sure that the canary slot
7718 comes between the locals and the saved registers. Otherwise,
7719 it would be possible for a carefully sized smash attack to change
7720 the saved registers (particularly LR and FP) without reaching the
7721 canary. */
7722 return crtl->stack_protect_guard;
7725 /* Return true if the current function needs to record the incoming
7726 value of PSTATE.SM. */
7727 static bool
7728 aarch64_need_old_pstate_sm ()
7730 /* Exit early if the incoming value of PSTATE.SM is known at
7731 compile time. */
7732 if (aarch64_cfun_incoming_pstate_sm () != 0)
7733 return false;
7735 if (aarch64_cfun_enables_pstate_sm ())
7736 return true;
7738 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
7739 but the function needs to return with PSTATE.SM unchanged. */
7740 if (nonlocal_goto_handler_labels)
7741 return true;
7743 /* Likewise for exception handlers. */
7744 eh_landing_pad lp;
7745 for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
7746 if (lp && lp->post_landing_pad)
7747 return true;
7749 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
7750 streaming-compatible functions without SME being available, so PSTATE.SM
7751 should only be changed if it is currently set to one. */
7752 if (crtl->has_nonlocal_goto)
7753 return true;
7755 if (cfun->machine->call_switches_pstate_sm)
7756 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
7757 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
7758 if (!SIBLING_CALL_P (call))
7760 /* Return true if there is a call to a non-streaming-compatible
7761 function. */
7762 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
7763 if (aarch64_call_switches_pstate_sm (callee_isa_mode))
7764 return true;
7766 return false;
7769 /* Mark the registers that need to be saved by the callee and calculate
7770 the size of the callee-saved registers area and frame record (both FP
7771 and LR may be omitted). */
7772 static void
7773 aarch64_layout_frame (void)
7775 unsigned regno, last_fp_reg = INVALID_REGNUM;
7776 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7777 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7778 bool frame_related_fp_reg_p = false;
7779 aarch64_frame &frame = cfun->machine->frame;
7780 poly_int64 top_of_locals = -1;
7781 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
7783 vec_safe_truncate (frame.saved_gprs, 0);
7784 vec_safe_truncate (frame.saved_fprs, 0);
7785 vec_safe_truncate (frame.saved_prs, 0);
7787 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7789 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7790 the mid-end is doing. */
7791 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7793 #define SLOT_NOT_REQUIRED (-2)
7794 #define SLOT_REQUIRED (-1)
7796 frame.wb_push_candidate1 = INVALID_REGNUM;
7797 frame.wb_push_candidate2 = INVALID_REGNUM;
7798 frame.spare_pred_reg = INVALID_REGNUM;
7800 /* First mark all the registers that really need to be saved... */
7801 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7802 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
7803 frame.old_svcr_offset = SLOT_NOT_REQUIRED;
7805 /* ... that includes the eh data registers (if needed)... */
7806 if (crtl->calls_eh_return)
7807 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
7808 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
7810 /* ... and any callee saved register that dataflow says is live. */
7811 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7812 if (df_regs_ever_live_p (regno)
7813 && !fixed_regs[regno]
7814 && (regno == R30_REGNUM
7815 || !crtl->abi->clobbers_full_reg_p (regno)))
7816 frame.reg_offset[regno] = SLOT_REQUIRED;
7818 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7819 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7820 && !fixed_regs[regno]
7821 && !crtl->abi->clobbers_full_reg_p (regno))
7823 frame.reg_offset[regno] = SLOT_REQUIRED;
7824 last_fp_reg = regno;
7825 if (aarch64_emit_cfi_for_reg_p (regno))
7826 frame_related_fp_reg_p = true;
7829 /* Big-endian SVE frames need a spare predicate register in order
7830 to save Z8-Z15. Decide which register they should use. Prefer
7831 an unused argument register if possible, so that we don't force P4
7832 to be saved unnecessarily. */
7833 if (frame_related_fp_reg_p
7834 && crtl->abi->id () == ARM_PCS_SVE
7835 && BYTES_BIG_ENDIAN)
7837 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7838 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7839 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7840 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7841 break;
7842 gcc_assert (regno <= P7_REGNUM);
7843 frame.spare_pred_reg = regno;
7844 df_set_regs_ever_live (regno, true);
7847 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7848 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7849 && !fixed_regs[regno]
7850 && !crtl->abi->clobbers_full_reg_p (regno))
7851 frame.reg_offset[regno] = SLOT_REQUIRED;
7853 bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
7855 poly_int64 offset = crtl->outgoing_args_size;
7856 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
7857 if (regs_at_top_p)
7859 offset += get_frame_size ();
7860 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7861 top_of_locals = offset;
7863 frame.bytes_below_saved_regs = offset;
7864 frame.sve_save_and_probe = INVALID_REGNUM;
7866 /* Now assign stack slots for the registers. Start with the predicate
7867 registers, since predicate LDR and STR have a relatively small
7868 offset range. These saves happen below the hard frame pointer. */
7869 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7870 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7872 vec_safe_push (frame.saved_prs, regno);
7873 if (frame.sve_save_and_probe == INVALID_REGNUM)
7874 frame.sve_save_and_probe = regno;
7875 frame.reg_offset[regno] = offset;
7876 offset += BYTES_PER_SVE_PRED;
7879 poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
7880 if (maybe_ne (saved_prs_size, 0))
7882 /* If we have any vector registers to save above the predicate registers,
7883 the offset of the vector register save slots needs to be a multiple
7884 of the vector size. This lets us use the immediate forms of LDR/STR
7885 (or LD1/ST1 for big-endian).
7887 A vector register is 8 times the size of a predicate register,
7888 and we need to save a maximum of 12 predicate registers, so the
7889 first vector register will be at either #1, MUL VL or #2, MUL VL.
7891 If we don't have any vector registers to save, and we know how
7892 big the predicate save area is, we can just round it up to the
7893 next 16-byte boundary. */
7894 if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
7895 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7896 else
7898 if (known_le (saved_prs_size, vector_save_size))
7899 offset = frame.bytes_below_saved_regs + vector_save_size;
7900 else if (known_le (saved_prs_size, vector_save_size * 2))
7901 offset = frame.bytes_below_saved_regs + vector_save_size * 2;
7902 else
7903 gcc_unreachable ();
7907 /* If we need to save any SVE vector registers, add them next. */
7908 if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7909 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7910 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7912 vec_safe_push (frame.saved_fprs, regno);
7913 if (frame.sve_save_and_probe == INVALID_REGNUM)
7914 frame.sve_save_and_probe = regno;
7915 frame.reg_offset[regno] = offset;
7916 offset += vector_save_size;
7919 /* OFFSET is now the offset of the hard frame pointer from the bottom
7920 of the callee save area. */
7921 auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
7922 bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
7923 gcc_assert (!saves_below_hard_fp_p
7924 || (frame.sve_save_and_probe != INVALID_REGNUM
7925 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
7926 frame.bytes_below_saved_regs)));
7928 frame.bytes_below_hard_fp = offset;
7929 frame.hard_fp_save_and_probe = INVALID_REGNUM;
7931 auto allocate_gpr_slot = [&](unsigned int regno)
7933 vec_safe_push (frame.saved_gprs, regno);
7934 frame.reg_offset[regno] = offset;
7935 offset += UNITS_PER_WORD;
7938 if (frame.emit_frame_chain)
7940 /* FP and LR are placed in the linkage record. */
7941 allocate_gpr_slot (R29_REGNUM);
7942 allocate_gpr_slot (R30_REGNUM);
7944 else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
7945 && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
7946 /* Put the LR save slot first, since it makes a good choice of probe
7947 for stack clash purposes. The idea is that the link register usually
7948 has to be saved before a call anyway, and so we lose little by
7949 stopping it from being individually shrink-wrapped. */
7950 allocate_gpr_slot (R30_REGNUM);
7952 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7953 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7954 allocate_gpr_slot (regno);
7956 if (aarch64_need_old_pstate_sm ())
7958 frame.old_svcr_offset = offset;
7959 offset += UNITS_PER_WORD;
7962 /* If the current function changes the SVE vector length, ensure that the
7963 old value of the DWARF VG register is saved and available in the CFI,
7964 so that outer frames with VL-sized offsets can be processed correctly. */
7965 if (cfun->machine->call_switches_pstate_sm
7966 || aarch64_cfun_enables_pstate_sm ())
7968 frame.reg_offset[VG_REGNUM] = offset;
7969 offset += UNITS_PER_WORD;
7972 poly_int64 max_int_offset = offset;
7973 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7974 bool has_align_gap = maybe_ne (offset, max_int_offset);
7976 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7977 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7979 vec_safe_push (frame.saved_fprs, regno);
7980 /* If there is an alignment gap between integer and fp callee-saves,
7981 allocate the last fp register to it if possible. */
7982 if (regno == last_fp_reg
7983 && has_align_gap
7984 && known_eq (vector_save_size, 8)
7985 && multiple_p (offset, 16))
7987 frame.reg_offset[regno] = max_int_offset;
7988 break;
7991 frame.reg_offset[regno] = offset;
7992 offset += vector_save_size;
7995 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7996 auto saved_regs_size = offset - frame.bytes_below_saved_regs;
7998 array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
7999 ? frame.saved_gprs
8000 : frame.saved_fprs);
8001 if (!push_regs.empty ()
8002 && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
8004 frame.hard_fp_save_and_probe = push_regs[0];
8005 frame.wb_push_candidate1 = push_regs[0];
8006 if (push_regs.size () > 1)
8007 frame.wb_push_candidate2 = push_regs[1];
8010 /* With stack-clash, a register must be saved in non-leaf functions.
8011 The saving of the bottommost register counts as an implicit probe,
8012 which allows us to maintain the invariant described in the comment
8013 at expand_prologue. */
8014 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
8016 if (!regs_at_top_p)
8018 offset += get_frame_size ();
8019 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8020 top_of_locals = offset;
8022 offset += frame.saved_varargs_size;
8023 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8024 frame.frame_size = offset;
8026 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
8027 gcc_assert (known_ge (top_of_locals, 0));
8028 frame.bytes_above_locals = frame.frame_size - top_of_locals;
8030 frame.initial_adjust = 0;
8031 frame.final_adjust = 0;
8032 frame.callee_adjust = 0;
8033 frame.sve_callee_adjust = 0;
8035 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8036 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8038 /* Shadow call stack is only used for functions that push the LR onto
8039 the stack and that do not specify the "no_sanitize" attribute with
8040 the argument "shadow-call-stack". */
8041 frame.is_scs_enabled
8042 = (!crtl->calls_eh_return
8043 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8044 && known_ge (frame.reg_offset[LR_REGNUM], 0));
8046 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8047 restore x30, and we don't need to pop x30 again in the traditional
8048 way. Pop candidates record the registers that need to be popped
8049 eventually. */
8050 if (frame.is_scs_enabled)
8052 if (frame.wb_pop_candidate2 == R30_REGNUM)
8053 frame.wb_pop_candidate2 = INVALID_REGNUM;
8054 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8055 frame.wb_pop_candidate1 = INVALID_REGNUM;
8058 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8059 256 to ensure that the offset meets the requirements of emit_move_insn.
8060 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8061 max_push_offset to 0, because no registers are popped at this time,
8062 so callee_adjust cannot be adjusted. */
8063 HOST_WIDE_INT max_push_offset = 0;
8064 if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8066 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8067 max_push_offset = 512;
8068 else
8069 max_push_offset = 256;
8072 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
8073 HOST_WIDE_INT const_saved_regs_size;
8074 if (known_eq (saved_regs_size, 0))
8075 frame.initial_adjust = frame.frame_size;
8076 else if (frame.frame_size.is_constant (&const_size)
8077 && const_size < max_push_offset
8078 && known_eq (frame.bytes_above_hard_fp, const_size))
8080 /* Simple, small frame with no data below the saved registers.
8082 stp reg1, reg2, [sp, -frame_size]!
8083 stp reg3, reg4, [sp, 16] */
8084 frame.callee_adjust = const_size;
8086 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
8087 && saved_regs_size.is_constant (&const_saved_regs_size)
8088 && const_below_saved_regs + const_saved_regs_size < 512
8089 /* We could handle this case even with data below the saved
8090 registers, provided that that data left us with valid offsets
8091 for all predicate and vector save slots. It's such a rare
8092 case that it hardly seems worth the effort though. */
8093 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8094 && !(cfun->calls_alloca
8095 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8096 && const_above_fp < max_push_offset))
8098 /* Frame with small area below the saved registers:
8100 sub sp, sp, frame_size
8101 stp reg1, reg2, [sp, bytes_below_saved_regs]
8102 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8103 frame.initial_adjust = frame.frame_size;
8105 else if (saves_below_hard_fp_p
8106 && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8108 /* Frame in which all saves are SVE saves:
8110 sub sp, sp, frame_size - bytes_below_saved_regs
8111 save SVE registers relative to SP
8112 sub sp, sp, bytes_below_saved_regs */
8113 frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8114 frame.final_adjust = frame.bytes_below_saved_regs;
8116 else if (frame.wb_push_candidate1 != INVALID_REGNUM
8117 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8118 && const_above_fp < max_push_offset)
8120 /* Frame with large area below the saved registers, or with SVE saves,
8121 but with a small area above:
8123 stp reg1, reg2, [sp, -hard_fp_offset]!
8124 stp reg3, reg4, [sp, 16]
8125 [sub sp, sp, below_hard_fp_saved_regs_size]
8126 [save SVE registers relative to SP]
8127 sub sp, sp, bytes_below_saved_regs */
8128 frame.callee_adjust = const_above_fp;
8129 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8130 frame.final_adjust = frame.bytes_below_saved_regs;
8132 else
8134 /* General case:
8136 sub sp, sp, hard_fp_offset
8137 stp x29, x30, [sp, 0]
8138 add x29, sp, 0
8139 stp reg3, reg4, [sp, 16]
8140 [sub sp, sp, below_hard_fp_saved_regs_size]
8141 [save SVE registers relative to SP]
8142 sub sp, sp, bytes_below_saved_regs */
8143 frame.initial_adjust = frame.bytes_above_hard_fp;
8144 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8145 frame.final_adjust = frame.bytes_below_saved_regs;
8148 /* The frame is allocated in pieces, with each non-final piece
8149 including a register save at offset 0 that acts as a probe for
8150 the following piece. In addition, the save of the bottommost register
8151 acts as a probe for callees and allocas. Roll back any probes that
8152 aren't needed.
8154 A probe isn't needed if it is associated with the final allocation
8155 (including callees and allocas) that happens before the epilogue is
8156 executed. */
8157 if (crtl->is_leaf
8158 && !cfun->calls_alloca
8159 && known_eq (frame.final_adjust, 0))
8161 if (maybe_ne (frame.sve_callee_adjust, 0))
8162 frame.sve_save_and_probe = INVALID_REGNUM;
8163 else
8164 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8167 /* Make sure the individual adjustments add up to the full frame size. */
8168 gcc_assert (known_eq (frame.initial_adjust
8169 + frame.callee_adjust
8170 + frame.sve_callee_adjust
8171 + frame.final_adjust, frame.frame_size));
8173 if (frame.callee_adjust == 0)
8175 /* We've decided not to do a "real" push and pop. However,
8176 setting up the frame chain is treated as being essentially
8177 a multi-instruction push. */
8178 frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
8179 if (!frame.emit_frame_chain)
8180 frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
8183 frame.laid_out = true;
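/* A rough sketch of the resulting layout, from low to high addresses
   (bracketed areas may be absent, and the locals appear either below
   or above the saved registers depending on
   aarch64_save_regs_above_locals_p):

	outgoing arguments
	[locals, if saved registers go above them]
	[SVE predicate saves]
	[SVE vector saves]
	[FP, LR and other GPR saves]   <- hard frame pointer
	[old SVCR and VG slots]
	[FP/SIMD saves]
	[locals, in the usual case]
	[varargs save area]            <- incoming stack pointer
*/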
8186 /* Return true if the register REGNO is saved on entry to
8187 the current function. */
8189 static bool
8190 aarch64_register_saved_on_entry (int regno)
8192 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8195 /* Push the register number REGNO of mode MODE to the stack with write-back
8196 adjusting the stack by ADJUSTMENT. */
8198 static void
8199 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8200 HOST_WIDE_INT adjustment)
8202 rtx base_rtx = stack_pointer_rtx;
8203 rtx insn, reg, mem;
8205 reg = gen_rtx_REG (mode, regno);
8206 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8207 plus_constant (Pmode, base_rtx, -adjustment));
8208 mem = gen_frame_mem (mode, mem);
8210 insn = emit_move_insn (mem, reg);
8211 RTX_FRAME_RELATED_P (insn) = 1;
8214 /* Generate and return an instruction to store the pair of registers
8215 REG and REG2 of mode MODE to location BASE with write-back adjusting
8216 the stack location BASE by ADJUSTMENT. */
8218 static rtx
8219 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8220 HOST_WIDE_INT adjustment)
8222 rtx new_base = plus_constant (Pmode, base, -adjustment);
8223 rtx mem = gen_frame_mem (mode, new_base);
8224 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8226 return gen_rtx_PARALLEL (VOIDmode,
8227 gen_rtvec (3,
8228 gen_rtx_SET (base, new_base),
8229 gen_rtx_SET (mem, reg),
8230 gen_rtx_SET (mem2, reg2)));
8233 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8234 stack pointer by ADJUSTMENT. */
8236 static void
8237 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8239 rtx_insn *insn;
8240 machine_mode mode = aarch64_reg_save_mode (regno1);
8242 if (regno2 == INVALID_REGNUM)
8243 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8245 rtx reg1 = gen_rtx_REG (mode, regno1);
8246 rtx reg2 = gen_rtx_REG (mode, regno2);
8248 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8249 reg2, adjustment));
8250 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8251 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8252 RTX_FRAME_RELATED_P (insn) = 1;
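/* The parallel built above corresponds to a single write-back store
   pair, along the lines of
	stp	reg1, reg2, [sp, #-adjustment]!
   with both stores marked frame-related so that the CFI records the
   register saves as well as the stack adjustment.  */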
8255 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8256 adjusting it by ADJUSTMENT afterwards. */
8258 static rtx
8259 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8260 HOST_WIDE_INT adjustment)
8262 rtx mem = gen_frame_mem (mode, base);
8263 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8264 rtx new_base = plus_constant (Pmode, base, adjustment);
8266 return gen_rtx_PARALLEL (VOIDmode,
8267 gen_rtvec (3,
8268 gen_rtx_SET (base, new_base),
8269 gen_rtx_SET (reg, mem),
8270 gen_rtx_SET (reg2, mem2)));
8273 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8274 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8275 into CFI_OPS. */
8277 static void
8278 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8279 rtx *cfi_ops)
8281 machine_mode mode = aarch64_reg_save_mode (regno1);
8282 rtx reg1 = gen_rtx_REG (mode, regno1);
8284 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8286 if (regno2 == INVALID_REGNUM)
8288 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8289 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8290 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8292 else
8294 rtx reg2 = gen_rtx_REG (mode, regno2);
8295 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8296 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8297 reg2, adjustment));
8301 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8302 for a mem rtx representing the entire pair. */
8304 static machine_mode
8305 aarch64_pair_mode_for_mode (machine_mode mode)
8307 if (known_eq (GET_MODE_SIZE (mode), 4))
8308 return V2x4QImode;
8309 else if (known_eq (GET_MODE_SIZE (mode), 8))
8310 return V2x8QImode;
8311 else if (known_eq (GET_MODE_SIZE (mode), 16))
8312 return V2x16QImode;
8313 else
8314 gcc_unreachable ();
8317 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8318 operand, return an rtx like MEM which instead represents the entire pair. */
8320 static rtx
8321 aarch64_pair_mem_from_base (rtx mem)
8323 auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
8324 mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
8325 gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
8326 return mem;
8329 /* Generate and return a store pair instruction to store REG1 and REG2
8330 into memory starting at BASE_MEM. All three rtxes should have modes of the
8331 same size. */
8334 aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
8336 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8338 return gen_rtx_SET (pair_mem,
8339 gen_rtx_UNSPEC (GET_MODE (pair_mem),
8340 gen_rtvec (2, reg1, reg2),
8341 UNSPEC_STP));
8344 /* Generate and return a load pair instruction to load a pair of
8345 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8346 UNKNOWN, all three rtxes should have modes of the same size.
8347 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8348 and REG{1,2} should be in DImode. */
8351 aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
8353 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8355 const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
8356 if (any_extend_p)
8357 gcc_checking_assert (GET_MODE (base_mem) == SImode
8358 && GET_MODE (reg1) == DImode
8359 && GET_MODE (reg2) == DImode);
8360 else
8361 gcc_assert (code == UNKNOWN);
8363 rtx unspecs[2] = {
8364 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
8365 gen_rtvec (1, pair_mem),
8366 UNSPEC_LDP_FST),
8367 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
8368 gen_rtvec (1, copy_rtx (pair_mem)),
8369 UNSPEC_LDP_SND)
8372 if (any_extend_p)
8373 for (int i = 0; i < 2; i++)
8374 unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
8376 return gen_rtx_PARALLEL (VOIDmode,
8377 gen_rtvec (2,
8378 gen_rtx_SET (reg1, unspecs[0]),
8379 gen_rtx_SET (reg2, unspecs[1])));
8382 /* Return TRUE if return address signing should be enabled for the current
8383 function, otherwise return FALSE. */
8385 bool
8386 aarch64_return_address_signing_enabled (void)
8388 /* This function should only be called after the frame is laid out. */
8389 gcc_assert (cfun->machine->frame.laid_out);
8391 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8392 if its LR is pushed onto stack. */
8393 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8394 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8395 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8398 /* Only used by the arm backend. */
8399 void aarch_bti_arch_check (void)
8402 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8403 bool
8404 aarch_bti_enabled (void)
8406 return (aarch_enable_bti == 1);
8409 /* Check if INSN is a BTI J insn. */
8410 bool
8411 aarch_bti_j_insn_p (rtx_insn *insn)
8413 if (!insn || !INSN_P (insn))
8414 return false;
8416 rtx pat = PATTERN (insn);
8417 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8420 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8421 bool
8422 aarch_pac_insn_p (rtx x)
8424 if (!INSN_P (x))
8425 return false;
8427 subrtx_var_iterator::array_type array;
8428 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8430 rtx sub = *iter;
8431 if (sub && GET_CODE (sub) == UNSPEC)
8433 int unspec_val = XINT (sub, 1);
8434 switch (unspec_val)
8436 case UNSPEC_PACIASP:
8437 case UNSPEC_PACIBSP:
8438 return true;
8440 default:
8441 return false;
8443 iter.skip_subrtxes ();
8446 return false;
8449 rtx aarch_gen_bti_c (void)
8451 return gen_bti_c ();
8454 rtx aarch_gen_bti_j (void)
8456 return gen_bti_j ();
8459 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8460 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8461 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8463 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8464 or LD1D address
8466 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
8467 if the variable isn't already nonnull
8469 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8470 Handle this case using a temporary base register that is suitable for
8471 all offsets in that range. Use ANCHOR_REG as this base register if it
8472 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8474 static inline void
8475 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8476 rtx &anchor_reg, poly_int64 &offset,
8477 rtx &ptrue)
8479 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8481 /* This is the maximum valid offset of the anchor from the base.
8482 Lower values would be valid too. */
8483 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8484 if (!anchor_reg)
8486 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8487 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8488 gen_int_mode (anchor_offset, Pmode)));
8490 base_rtx = anchor_reg;
8491 offset -= anchor_offset;
8493 if (!ptrue)
8495 int pred_reg = cfun->machine->frame.spare_pred_reg;
8496 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8497 CONSTM1_RTX (VNx16BImode));
8498 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
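/* Sketch of the arithmetic above: the adjustment is only made when
   OFFSET is in [8, 16] * GET_MODE_SIZE (MODE), so after subtracting the
   anchor offset of 16 * GET_MODE_SIZE (MODE) the residual offset lies
   in [-8, 0] vector lengths, which fits the signed scaled immediate
   range of ST1D/LD1D addressing.  */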
8502 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8503 is saved at BASE + OFFSET. */
8505 static void
8506 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8507 rtx base, poly_int64 offset)
8509 rtx mem = gen_frame_mem (GET_MODE (reg),
8510 plus_constant (Pmode, base, offset));
8511 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8514 /* Emit code to save the callee-saved registers in REGS. Skip any
8515 write-back candidates if SKIP_WB is true, otherwise consider only
8516 write-back candidates.
8518 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8519 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8520 has been set up. */
8522 static void
8523 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8524 array_slice<unsigned int> regs, bool skip_wb,
8525 bool hard_fp_valid_p)
8527 aarch64_frame &frame = cfun->machine->frame;
8528 rtx_insn *insn;
8529 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8531 auto skip_save_p = [&](unsigned int regno)
8533 if (cfun->machine->reg_is_wrapped_separately[regno])
8534 return true;
8536 if (skip_wb == (regno == frame.wb_push_candidate1
8537 || regno == frame.wb_push_candidate2))
8538 return true;
8540 return false;
8543 for (unsigned int i = 0; i < regs.size (); ++i)
8545 unsigned int regno = regs[i];
8546 poly_int64 offset;
8547 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8549 if (skip_save_p (regno))
8550 continue;
8552 machine_mode mode = aarch64_reg_save_mode (regno);
8553 rtx reg = gen_rtx_REG (mode, regno);
8554 rtx move_src = reg;
8555 offset = frame.reg_offset[regno] - bytes_below_sp;
8556 if (regno == VG_REGNUM)
8558 move_src = gen_rtx_REG (DImode, IP0_REGNUM);
8559 emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
8561 rtx base_rtx = stack_pointer_rtx;
8562 poly_int64 sp_offset = offset;
8564 HOST_WIDE_INT const_offset;
8565 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8566 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8567 offset, ptrue);
8568 else if (GP_REGNUM_P (REGNO (reg))
8569 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8571 poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8572 if (hard_fp_valid_p)
8573 base_rtx = hard_frame_pointer_rtx;
8574 else
8576 if (!anchor_reg)
8578 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8579 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8580 gen_int_mode (fp_offset, Pmode)));
8582 base_rtx = anchor_reg;
8584 offset -= fp_offset;
8586 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8587 rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
8588 stack_pointer_rtx,
8589 sp_offset));
8590 rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
8591 bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);
8593 unsigned int regno2;
8594 if (!aarch64_sve_mode_p (mode)
8595 && reg == move_src
8596 && i + 1 < regs.size ()
8597 && (regno2 = regs[i + 1], !skip_save_p (regno2))
8598 && known_eq (GET_MODE_SIZE (mode),
8599 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8601 rtx reg2 = gen_rtx_REG (mode, regno2);
8603 offset += GET_MODE_SIZE (mode);
8604 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8606 rtx cfi_mem2
8607 = gen_frame_mem (mode,
8608 plus_constant (Pmode,
8609 stack_pointer_rtx,
8610 sp_offset + GET_MODE_SIZE (mode)));
8611 rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);
8613 /* The first part of a frame-related parallel insn is always
8614 assumed to be relevant to the frame calculations;
8615 subsequent parts are only frame-related if
8616 explicitly marked. */
8617 if (aarch64_emit_cfi_for_reg_p (regno2))
8618 RTX_FRAME_RELATED_P (cfi_set2) = 1;
8620 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8621 representation of stp cannot be understood directly by
8622 dwarf2cfi. */
8623 rtx par = gen_rtx_PARALLEL (VOIDmode,
8624 gen_rtvec (2, cfi_set, cfi_set2));
8625 add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);
8627 regno = regno2;
8628 ++i;
8630 else
8632 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8634 insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
8635 ptrue, move_src));
8636 need_cfi_note_p = true;
8638 else if (aarch64_sve_mode_p (mode))
8639 insn = emit_insn (gen_rtx_SET (mem, move_src));
8640 else
8641 insn = emit_move_insn (mem, move_src);
8643 if (frame_related_p && (need_cfi_note_p || move_src != reg))
8644 add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
8647 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8649 /* Emit a fake instruction to indicate that the VG save slot has
8650 been initialized. */
8651 if (regno == VG_REGNUM)
8652 emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
8656 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8657 and any other registers that are handled separately. Write the appropriate
8658 REG_CFA_RESTORE notes into CFI_OPS.
8660 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8661 of the static frame. */
8663 static void
8664 aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
8665 array_slice<unsigned int> regs, rtx *cfi_ops)
8667 aarch64_frame &frame = cfun->machine->frame;
8668 poly_int64 offset;
8669 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8671 auto skip_restore_p = [&](unsigned int regno)
8673 if (cfun->machine->reg_is_wrapped_separately[regno])
8674 return true;
8676 if (regno == frame.wb_pop_candidate1
8677 || regno == frame.wb_pop_candidate2)
8678 return true;
8680 /* The shadow call stack code restores LR separately. */
8681 if (frame.is_scs_enabled && regno == LR_REGNUM)
8682 return true;
8684 return false;
8687 for (unsigned int i = 0; i < regs.size (); ++i)
8689 unsigned int regno = regs[i];
8690 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8691 if (skip_restore_p (regno))
8692 continue;
8694 machine_mode mode = aarch64_reg_save_mode (regno);
8695 rtx reg = gen_rtx_REG (mode, regno);
8696 offset = frame.reg_offset[regno] - bytes_below_sp;
8697 rtx base_rtx = stack_pointer_rtx;
8698 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8699 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8700 offset, ptrue);
8701 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8703 unsigned int regno2;
8704 if (!aarch64_sve_mode_p (mode)
8705 && i + 1 < regs.size ()
8706 && (regno2 = regs[i + 1], !skip_restore_p (regno2))
8707 && known_eq (GET_MODE_SIZE (mode),
8708 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8710 rtx reg2 = gen_rtx_REG (mode, regno2);
8712 offset += GET_MODE_SIZE (mode);
8713 emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
8715 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8716 regno = regno2;
8717 ++i;
8719 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8720 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8721 else if (aarch64_sve_mode_p (mode))
8722 emit_insn (gen_rtx_SET (reg, mem));
8723 else
8724 emit_move_insn (reg, mem);
8725 if (frame_related_p)
8726 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8730 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8731 of MODE. */
8733 static inline bool
8734 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8736 HOST_WIDE_INT multiple;
8737 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8738 && IN_RANGE (multiple, -8, 7));
8741 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8742 of MODE. */
8744 static inline bool
8745 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8747 HOST_WIDE_INT multiple;
8748 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8749 && IN_RANGE (multiple, -32, 31));
8752 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8753 of MODE. */
8755 static inline bool
8756 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8758 HOST_WIDE_INT multiple;
8759 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8760 && IN_RANGE (multiple, 0, 63));
8763 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8764 of MODE. */
8766 bool
8767 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8769 HOST_WIDE_INT multiple;
8770 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8771 && IN_RANGE (multiple, -64, 63));
8774 /* Return true if OFFSET is a signed 9-bit value. */
8776 bool
8777 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8778 poly_int64 offset)
8780 HOST_WIDE_INT const_offset;
8781 return (offset.is_constant (&const_offset)
8782 && IN_RANGE (const_offset, -256, 255));
8785 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8786 of MODE. */
8788 static inline bool
8789 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8791 HOST_WIDE_INT multiple;
8792 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8793 && IN_RANGE (multiple, -256, 255));
8796 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8797 of MODE. */
8799 static inline bool
8800 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8802 HOST_WIDE_INT multiple;
8803 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8804 && IN_RANGE (multiple, 0, 4095));
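/* For example, with MODE == DImode (8 bytes) the predicate above
   accepts byte offsets 0, 8, ..., 32760, matching the unsigned scaled
   12-bit immediate form of LDR/STR.  */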
8807 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8809 static sbitmap
8810 aarch64_get_separate_components (void)
8812 aarch64_frame &frame = cfun->machine->frame;
8813 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8814 bitmap_clear (components);
8816 /* The registers we need saved to the frame. */
8817 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
8818 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8819 if (aarch64_register_saved_on_entry (regno))
8821 /* Disallow shrink wrapping for registers that will be clobbered
8822 by an SMSTART SM in the prologue. */
8823 if (enables_pstate_sm
8824 && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
8825 continue;
8827 /* Punt on saves and restores that use ST1D and LD1D. We could
8828 try to be smarter, but it would involve making sure that the
8829 spare predicate register itself is safe to use at the save
8830 and restore points. Also, when a frame pointer is being used,
8831 the slots are often out of reach of ST1D and LD1D anyway. */
8832 machine_mode mode = aarch64_reg_save_mode (regno);
8833 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8834 continue;
8836 poly_int64 offset = frame.reg_offset[regno];
8838 /* Get the offset relative to the register we'll use. */
8839 if (frame_pointer_needed)
8840 offset -= frame.bytes_below_hard_fp;
8842 /* Check that we can access the stack slot of the register with one
8843 direct load with no adjustments needed. */
8844 if (aarch64_sve_mode_p (mode)
8845 ? offset_9bit_signed_scaled_p (mode, offset)
8846 : offset_12bit_unsigned_scaled_p (mode, offset))
8847 bitmap_set_bit (components, regno);
8850 /* Don't mess with the hard frame pointer. */
8851 if (frame_pointer_needed)
8852 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8854 /* If the spare predicate register used by big-endian SVE code
8855 is call-preserved, it must be saved in the main prologue
8856 before any saves that use it. */
8857 if (frame.spare_pred_reg != INVALID_REGNUM)
8858 bitmap_clear_bit (components, frame.spare_pred_reg);
8860 unsigned reg1 = frame.wb_push_candidate1;
8861 unsigned reg2 = frame.wb_push_candidate2;
8862 /* If registers have been chosen to be stored/restored with
8863 writeback, don't interfere with them, to avoid having to output explicit
8864 stack adjustment instructions. */
8865 if (reg2 != INVALID_REGNUM)
8866 bitmap_clear_bit (components, reg2);
8867 if (reg1 != INVALID_REGNUM)
8868 bitmap_clear_bit (components, reg1);
8870 bitmap_clear_bit (components, LR_REGNUM);
8871 bitmap_clear_bit (components, SP_REGNUM);
8872 if (flag_stack_clash_protection)
8874 if (frame.sve_save_and_probe != INVALID_REGNUM)
8875 bitmap_clear_bit (components, frame.sve_save_and_probe);
8876 if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
8877 bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
8880 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8881 to find one. */
8882 bitmap_clear_bit (components, VG_REGNUM);
8884 return components;
8887 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8889 static sbitmap
8890 aarch64_components_for_bb (basic_block bb)
8892 bitmap in = DF_LIVE_IN (bb);
8893 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8894 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8896 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8897 bitmap_clear (components);
8899 /* Clobbered registers don't generate values in any meaningful sense,
8900 since nothing after the clobber can rely on their value. And we can't
8901 say that partially-clobbered registers are unconditionally killed,
8902 because whether they're killed or not depends on the mode of the
8903 value they're holding. Thus partially call-clobbered registers
8904 appear in neither the kill set nor the gen set.
8906 Check manually for any calls that clobber more of a register than the
8907 current function can. */
8908 function_abi_aggregator callee_abis;
8909 rtx_insn *insn;
8910 FOR_BB_INSNS (bb, insn)
8911 if (CALL_P (insn))
8912 callee_abis.note_callee_abi (insn_callee_abi (insn));
8913 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8915 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8916 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8917 if (!fixed_regs[regno]
8918 && !crtl->abi->clobbers_full_reg_p (regno)
8919 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8920 || bitmap_bit_p (in, regno)
8921 || bitmap_bit_p (gen, regno)
8922 || bitmap_bit_p (kill, regno)))
8924 bitmap_set_bit (components, regno);
8926 /* If there is a callee-save at an adjacent offset, add it too
8927 to increase the use of LDP/STP. */
8928 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8929 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
8931 if (regno2 <= LAST_SAVED_REGNUM)
8933 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8934 if (regno < regno2
8935 ? known_eq (offset + 8, offset2)
8936 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
8937 bitmap_set_bit (components, regno2);
8941 return components;
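/* Illustrative only, not part of GCC: a standalone sketch of the
   adjacent-slot pairing test used in the loop above, with plain integers
   standing in for poly_int64.  The helper name is an assumption made for
   the example; the block is kept under "#if 0" so it never affects the
   build.  */
#if 0
/* Given REGNO saved at OFFSET, return the register whose save slot would
   complete a 16-byte-aligned LDP/STP pair, assuming that candidate is
   saved at OFFSET2, or -1 if the slots don't line up.  */
static int
example_ldp_partner (int regno, long long offset, long long offset2)
{
  int regno2 = (offset % 16 == 0) ? regno + 1 : regno - 1;
  bool ok = (regno < regno2
	     ? offset + 8 == offset2
	     : offset2 % 16 == 0 && offset2 + 8 == offset);
  return ok ? regno2 : -1;
}
#endif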
8944 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8945 Nothing to do for aarch64. */
8947 static void
8948 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8952 /* Return the next set bit in BMP from START onwards. Return the total number
8953 of bits in BMP if no set bit is found at or after START. */
8955 static unsigned int
8956 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8958 unsigned int nbits = SBITMAP_SIZE (bmp);
8959 if (start == nbits)
8960 return start;
8962 gcc_assert (start < nbits);
8963 for (unsigned int i = start; i < nbits; i++)
8964 if (bitmap_bit_p (bmp, i))
8965 return i;
8967 return nbits;
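/* Illustrative only, not part of GCC: the same next-set-bit scan written
   against a plain bool array, with NBITS playing the role of
   SBITMAP_SIZE.  Kept under "#if 0" so it never affects the build.  */
#if 0
static unsigned int
example_next_set_bit (const bool *bits, unsigned int nbits,
		      unsigned int start)
{
  for (unsigned int i = start; i < nbits; i++)
    if (bits[i])
      return i;
  return nbits;
}
#endif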
8970 /* Do the work for aarch64_emit_prologue_components and
8971 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8972 to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
8973 for these components or the epilogue sequence. That is, it determines
8974 whether we should emit stores or loads and what kind of CFA notes to attach
8975 to the insns. Otherwise the logic for the two sequences is very
8976 similar. */
8978 static void
8979 aarch64_process_components (sbitmap components, bool prologue_p)
8981 aarch64_frame &frame = cfun->machine->frame;
8982 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8983 ? HARD_FRAME_POINTER_REGNUM
8984 : STACK_POINTER_REGNUM);
8986 unsigned last_regno = SBITMAP_SIZE (components);
8987 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
8988 rtx_insn *insn = NULL;
8990 while (regno != last_regno)
8992 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8993 machine_mode mode = aarch64_reg_save_mode (regno);
8995 rtx reg = gen_rtx_REG (mode, regno);
8996 poly_int64 offset = frame.reg_offset[regno];
8997 if (frame_pointer_needed)
8998 offset -= frame.bytes_below_hard_fp;
9000 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9001 rtx mem = gen_frame_mem (mode, addr);
9003 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9004 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9005 /* No more registers to handle after REGNO.
9006 Emit a single save/restore and exit. */
9007 if (regno2 == last_regno)
9009 insn = emit_insn (set);
9010 if (frame_related_p)
9012 RTX_FRAME_RELATED_P (insn) = 1;
9013 if (prologue_p)
9014 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9015 else
9016 add_reg_note (insn, REG_CFA_RESTORE, reg);
9018 break;
9021 poly_int64 offset2 = frame.reg_offset[regno2];
9022 /* The next register is not of the same class or its offset is not
9023 mergeable with the current one into a pair. */
9024 if (aarch64_sve_mode_p (mode)
9025 || !satisfies_constraint_Ump (mem)
9026 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9027 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9028 || maybe_ne ((offset2 - frame.reg_offset[regno]),
9029 GET_MODE_SIZE (mode)))
9031 insn = emit_insn (set);
9032 if (frame_related_p)
9034 RTX_FRAME_RELATED_P (insn) = 1;
9035 if (prologue_p)
9036 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9037 else
9038 add_reg_note (insn, REG_CFA_RESTORE, reg);
9041 regno = regno2;
9042 continue;
9045 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9047 /* REGNO2 can be saved/restored in a pair with REGNO. */
9048 rtx reg2 = gen_rtx_REG (mode, regno2);
9049 if (frame_pointer_needed)
9050 offset2 -= frame.bytes_below_hard_fp;
9051 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9052 rtx mem2 = gen_frame_mem (mode, addr2);
9053 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9054 : gen_rtx_SET (reg2, mem2);
9056 if (prologue_p)
9057 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
9058 else
9059 insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
9061 if (frame_related_p || frame_related2_p)
9063 RTX_FRAME_RELATED_P (insn) = 1;
9064 if (prologue_p)
9066 if (frame_related_p)
9067 add_reg_note (insn, REG_CFA_OFFSET, set);
9068 if (frame_related2_p)
9069 add_reg_note (insn, REG_CFA_OFFSET, set2);
9071 else
9073 if (frame_related_p)
9074 add_reg_note (insn, REG_CFA_RESTORE, reg);
9075 if (frame_related2_p)
9076 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9080 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9084 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9086 static void
9087 aarch64_emit_prologue_components (sbitmap components)
9089 aarch64_process_components (components, true);
9092 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9094 static void
9095 aarch64_emit_epilogue_components (sbitmap components)
9097 aarch64_process_components (components, false);
9100 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9102 static void
9103 aarch64_set_handled_components (sbitmap components)
9105 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9106 if (bitmap_bit_p (components, regno))
9107 cfun->machine->reg_is_wrapped_separately[regno] = true;
9110 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
9111 determine the probe offset for alloca. */
9113 static HOST_WIDE_INT
9114 aarch64_stack_clash_protection_alloca_probe_range (void)
9116 return STACK_CLASH_CALLER_GUARD;
9119 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9120 subsequent memory accesses and that requires the stack pointer and REG
9121 to have their current values. REG can be stack_pointer_rtx if no
9122 other register's value needs to be fixed. */
9124 static void
9125 aarch64_emit_stack_tie (rtx reg)
9127 emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
9130 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9131 registers. If POLY_SIZE is not large enough to require a probe this function
9132 will only adjust the stack. When allocating the stack space
9133 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9134 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9135 the saved registers. If we are, then we ensure that any allocation
9136 larger than the ABI-defined buffer needs a probe, so that the
9137 invariant of having a 1KB buffer is maintained.
9139 We emit barriers after each stack adjustment to prevent optimizations from
9140 breaking the invariant that we never drop the stack more than a page. This
9141 invariant is needed to make it easier to handle asynchronous events
9142 correctly: if we were to allow the stack to be dropped by more than a page
9143 and only then emit the probes, and a signal arrived somewhere in between,
9144 the signal handler would not know the state of the stack and could make no
9145 assumptions about which pages have been probed.
9147 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
9148 is measured relative to the SME vector length instead of the current
9149 prevailing vector length. It is 0 otherwise. */
9151 static void
9152 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9153 poly_int64 poly_size,
9154 aarch64_feature_flags force_isa_mode,
9155 bool frame_related_p,
9156 bool final_adjustment_p)
9158 aarch64_frame &frame = cfun->machine->frame;
9159 HOST_WIDE_INT guard_size
9160 = 1 << param_stack_clash_protection_guard_size;
9161 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9162 HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9163 gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9164 HOST_WIDE_INT min_probe_threshold
9165 = (final_adjustment_p
9166 ? guard_used_by_caller + byte_sp_alignment
9167 : guard_size - guard_used_by_caller);
9168 poly_int64 frame_size = frame.frame_size;
9170 /* We should always have a positive probe threshold. */
9171 gcc_assert (min_probe_threshold > 0);
9173 if (flag_stack_clash_protection && !final_adjustment_p)
9175 poly_int64 initial_adjust = frame.initial_adjust;
9176 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9177 poly_int64 final_adjust = frame.final_adjust;
9179 if (known_eq (frame_size, 0))
9181 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9183 else if (known_lt (initial_adjust + sve_callee_adjust,
9184 guard_size - guard_used_by_caller)
9185 && known_lt (final_adjust, guard_used_by_caller))
9187 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9191 /* If SIZE is not large enough to require probing, just adjust the stack and
9192 exit. */
9193 if (known_lt (poly_size, min_probe_threshold)
9194 || !flag_stack_clash_protection)
9196 aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
9197 frame_related_p);
9198 return;
9201 HOST_WIDE_INT size;
9202 /* Handle the SVE non-constant case first. */
9203 if (!poly_size.is_constant (&size))
9205 if (dump_file)
9207 fprintf (dump_file, "Stack clash SVE prologue: ");
9208 print_dec (poly_size, dump_file);
9209 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9212 /* First calculate the number of bytes we're actually spilling. */
9213 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9214 poly_size, temp1, temp2, force_isa_mode,
9215 false, true);
9217 rtx_insn *insn = get_last_insn ();
9219 if (frame_related_p)
9221 /* This is done to provide unwinding information for the stack
9222 adjustments we're about to do. However, to prevent the optimizers
9223 from removing the R11 move and leaving the CFA note (which would be
9224 very wrong), we tie the old and new stack pointers together.
9225 The tie will expand to nothing, but the optimizers will not touch
9226 the instruction. */
9227 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9228 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9229 aarch64_emit_stack_tie (stack_ptr_copy);
9231 /* We want the CFA independent of the stack pointer for the
9232 duration of the loop. */
9233 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9234 RTX_FRAME_RELATED_P (insn) = 1;
9237 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9238 rtx guard_const = gen_int_mode (guard_size, Pmode);
9240 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9241 stack_pointer_rtx, temp1,
9242 probe_const, guard_const));
9244 /* Now reset the CFA register if needed. */
9245 if (frame_related_p)
9247 add_reg_note (insn, REG_CFA_DEF_CFA,
9248 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9249 gen_int_mode (poly_size, Pmode)));
9250 RTX_FRAME_RELATED_P (insn) = 1;
9253 return;
9256 if (dump_file)
9257 fprintf (dump_file,
9258 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9259 " bytes, probing will be required.\n", size);
9261 /* Round size down to a multiple of guard_size, and calculate the
9262 residual as the difference between the original size and the rounded
9263 size. */
9264 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9265 HOST_WIDE_INT residual = size - rounded_size;
9267 /* We can handle a small number of allocations/probes inline. Otherwise
9268 punt to a loop. */
9269 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9271 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9273 aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
9274 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9275 guard_used_by_caller));
9276 emit_insn (gen_blockage ());
9278 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9280 else
9282 /* Compute the ending address. */
9283 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9284 temp1, NULL, force_isa_mode, false, true);
9285 rtx_insn *insn = get_last_insn ();
9287 /* For the initial allocation, we don't have a frame pointer
9288 set up, so we always need CFI notes. If we're doing the
9289 final allocation, then we may have a frame pointer, in which
9290 case it is the CFA, otherwise we need CFI notes.
9292 We can determine which allocation we are doing by looking at
9293 the value of FRAME_RELATED_P since the final allocations are not
9294 frame related. */
9295 if (frame_related_p)
9297 /* We want the CFA independent of the stack pointer for the
9298 duration of the loop. */
9299 add_reg_note (insn, REG_CFA_DEF_CFA,
9300 plus_constant (Pmode, temp1, rounded_size));
9301 RTX_FRAME_RELATED_P (insn) = 1;
9304 /* This allocates and probes the stack. Note that this re-uses some of
9305 the existing Ada stack protection code. However we are guaranteed not
9306 to enter the non-loop or residual branches of that code.
9308 The non-loop part won't be entered because if our allocation amount
9309 doesn't require a loop, the case above would handle it.
9311 The residual branch won't be entered because TEMP1 is a multiple of
9312 the allocation size, so the residual will always be 0. As such, the only
9313 part we are actually using from that code is the loop setup. The
9314 actual probing is done in aarch64_output_probe_stack_range. */
9315 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9316 stack_pointer_rtx, temp1));
9318 /* Now reset the CFA register if needed. */
9319 if (frame_related_p)
9321 add_reg_note (insn, REG_CFA_DEF_CFA,
9322 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9323 RTX_FRAME_RELATED_P (insn) = 1;
9326 emit_insn (gen_blockage ());
9327 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9330 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9331 be probed. This maintains the requirement that each page is probed at
9332 least once. For initial probing we probe only if the allocation is
9333 more than GUARD_SIZE - buffer, and below the saved registers we probe
9334 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9335 GUARD_SIZE. This ensures that for any allocation large enough to
9336 trigger a probe here, we'll have at least one; and if an allocation isn't
9337 large enough for this code to emit anything for it, the page will have been
9338 probed by the save of FP/LR, either by this function or by any callees. If
9339 we don't have any callees then we won't have more stack adjustments and so
9340 are still safe. */
9341 if (residual)
9343 gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9345 /* If we're doing final adjustments, and we've done any full page
9346 allocations then any residual needs to be probed. */
9347 if (final_adjustment_p && rounded_size != 0)
9348 min_probe_threshold = 0;
9350 aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
9351 if (residual >= min_probe_threshold)
9353 if (dump_file)
9354 fprintf (dump_file,
9355 "Stack clash AArch64 prologue residuals: "
9356 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9357 "\n", residual);
9359 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9360 guard_used_by_caller));
9361 emit_insn (gen_blockage ());
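/* Illustrative only, not part of GCC: a standalone sketch of how a constant
   allocation is split above into whole guard-sized pages plus a residual,
   and of when that residual itself needs a probe.  The structure and helper
   names are assumptions made for the example, the 16-byte value stands in
   for STACK_BOUNDARY / BITS_PER_UNIT, and the block is kept under "#if 0"
   so it never affects the build.  */
#if 0
struct example_probe_plan
{
  long long rounded_size;	/* Bytes allocated in whole guard pages.  */
  long long residual;		/* Remaining bytes allocated afterwards.  */
  bool probe_residual;		/* Whether the residual needs a probe.  */
};

static example_probe_plan
example_split_allocation (long long size, long long guard_size,
			  long long guard_used_by_caller,
			  bool final_adjustment_p)
{
  example_probe_plan plan;
  plan.rounded_size = size - size % guard_size;
  plan.residual = size - plan.rounded_size;
  long long min_probe_threshold
    = (final_adjustment_p
       ? guard_used_by_caller + 16
       : guard_size - guard_used_by_caller);
  /* As above: once a full page has been allocated, any residual of the
     final adjustment must be probed.  */
  if (final_adjustment_p && plan.rounded_size != 0)
    min_probe_threshold = 0;
  plan.probe_residual = plan.residual >= min_probe_threshold;
  return plan;
}

/* E.g. with a 64KB guard and 1KB caller buffer, a 135000-byte initial
   allocation splits into two probed 65536-byte pages plus a 3928-byte
   residual, which is below the 64512-byte threshold and so not probed.  */
#endif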
9366 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9368 void
9369 aarch64_extra_live_on_entry (bitmap regs)
9371 if (TARGET_ZA)
9373 bitmap_set_bit (regs, LOWERING_REGNUM);
9374 bitmap_set_bit (regs, SME_STATE_REGNUM);
9375 bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
9376 bitmap_set_bit (regs, ZA_FREE_REGNUM);
9377 bitmap_set_bit (regs, ZA_SAVED_REGNUM);
9379 /* The only time ZA can't have live contents on entry is when
9380 the function explicitly treats it as a pure output. */
9381 auto za_flags = aarch64_cfun_shared_flags ("za");
9382 if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9383 bitmap_set_bit (regs, ZA_REGNUM);
9385 /* Since ZT0 is call-clobbered, it is only live on input if
9386 it is explicitly shared, and is not a pure output. */
9387 auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
9388 if (zt0_flags != 0
9389 && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9390 bitmap_set_bit (regs, ZT0_REGNUM);
9394 /* Return 1 if the register is used by the epilogue. We need to say the
9395 return register is used, but only after epilogue generation is complete.
9396 Note that in the case of sibcalls, the values "used by the epilogue" are
9397 considered live at the start of the called function. */
9400 aarch64_epilogue_uses (int regno)
9402 if (epilogue_completed)
9404 if (regno == LR_REGNUM)
9405 return 1;
9407 if (regno == LOWERING_REGNUM && TARGET_ZA)
9408 return 1;
9409 if (regno == SME_STATE_REGNUM && TARGET_ZA)
9410 return 1;
9411 if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
9412 return 1;
9413 /* If the function shares SME state with its caller, ensure that that
9414 data is not in the lazy save buffer on exit. */
9415 if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
9416 return 1;
9417 if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
9418 return 1;
9419 if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
9420 return 1;
9421 return 0;
9424 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9426 static bool
9427 aarch64_use_late_prologue_epilogue ()
9429 return aarch64_cfun_enables_pstate_sm ();
9432 /* The current function's frame has a save slot for the incoming state
9433 of SVCR. Return a legitimate memory for the slot, based on the hard
9434 frame pointer. */
9436 static rtx
9437 aarch64_old_svcr_mem ()
9439 gcc_assert (frame_pointer_needed
9440 && known_ge (cfun->machine->frame.old_svcr_offset, 0));
9441 rtx base = hard_frame_pointer_rtx;
9442 poly_int64 offset = (0
9443 /* hard fp -> bottom of frame. */
9444 - cfun->machine->frame.bytes_below_hard_fp
9445 /* bottom of frame -> save slot. */
9446 + cfun->machine->frame.old_svcr_offset);
9447 return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
9450 /* The current function's frame has a save slot for the incoming state
9451 of SVCR. Load the slot into register REGNO and return the register. */
9453 static rtx
9454 aarch64_read_old_svcr (unsigned int regno)
9456 rtx svcr = gen_rtx_REG (DImode, regno);
9457 emit_move_insn (svcr, aarch64_old_svcr_mem ());
9458 return svcr;
9461 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9462 load the incoming value of SVCR from its save slot into temporary
9463 register REGNO. */
9465 static rtx_insn *
9466 aarch64_guard_switch_pstate_sm (unsigned int regno,
9467 aarch64_feature_flags local_mode)
9469 rtx old_svcr = aarch64_read_old_svcr (regno);
9470 return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
9473 /* AArch64 stack frames generated by this compiler look like:
9475 +-------------------------------+
9477 | incoming stack arguments |
9479 +-------------------------------+
9480 | | <-- incoming stack pointer (aligned)
9481 | callee-allocated save area |
9482 | for register varargs |
9484 +-------------------------------+
9485 | local variables (1) | <-- frame_pointer_rtx
9487 +-------------------------------+
9488 | padding (1) |
9489 +-------------------------------+
9490 | callee-saved registers |
9491 +-------------------------------+
9492 | LR' |
9493 +-------------------------------+
9494 | FP' |
9495 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9496 | SVE vector registers |
9497 +-------------------------------+
9498 | SVE predicate registers |
9499 +-------------------------------+
9500 | local variables (2) |
9501 +-------------------------------+
9502 | padding (2) |
9503 +-------------------------------+
9504 | dynamic allocation |
9505 +-------------------------------+
9506 | padding |
9507 +-------------------------------+
9508 | outgoing stack arguments | <-- arg_pointer
9510 +-------------------------------+
9511 | | <-- stack_pointer_rtx (aligned)
9513 The regions marked (1) and (2) are mutually exclusive. (2) is used
9514 when aarch64_save_regs_above_locals_p is true.
9516 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9517 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9518 unchanged.
9520 By default for stack-clash we assume the guard is at least 64KB, but this
9521 value is configurable to either 4KB or 64KB. We also force the guard size to
9522 be the same as the probing interval and both values are kept in sync.
9524 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9525 on the guard size) of stack space without probing.
9527 When probing is needed, we emit a probe at the start of the prologue
9528 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9530 We can also use register saves as probes. These are stored in
9531 sve_save_and_probe and hard_fp_save_and_probe.
9533 For outgoing arguments we probe if the size is larger than 1KB, such that
9534 the ABI specified buffer is maintained for the next callee.
9536 The following registers are reserved during frame layout and should not be
9537 used for any other purpose:
9539 - r11: Used by stack clash protection when SVE is enabled, and also
9540 as an anchor register when saving and restoring registers
9541 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9542 - r14 and r15: Used for speculation tracking.
9543 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9544 - r30(LR), r29(FP): Used by standard frame layout.
9546 These registers must be avoided in frame layout related code unless the
9547 explicit intention is to interact with one of the features listed above. */
9549 /* Generate the prologue instructions for entry into a function.
9550 Establish the stack frame by decreasing the stack pointer with a
9551 properly calculated size and, if necessary, create a frame record
9552 filled with the values of LR and previous frame pointer. The
9553 current FP is also set up if it is in use. */
9555 void
9556 aarch64_expand_prologue (void)
9558 aarch64_frame &frame = cfun->machine->frame;
9559 poly_int64 frame_size = frame.frame_size;
9560 poly_int64 initial_adjust = frame.initial_adjust;
9561 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9562 poly_int64 final_adjust = frame.final_adjust;
9563 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9564 unsigned reg1 = frame.wb_push_candidate1;
9565 unsigned reg2 = frame.wb_push_candidate2;
9566 bool emit_frame_chain = frame.emit_frame_chain;
9567 rtx_insn *insn;
9568 aarch64_feature_flags force_isa_mode = 0;
9569 if (aarch64_cfun_enables_pstate_sm ())
9570 force_isa_mode = AARCH64_FL_SM_ON;
9572 if (flag_stack_clash_protection
9573 && known_eq (callee_adjust, 0)
9574 && known_lt (frame.reg_offset[VG_REGNUM], 0))
9576 /* Fold the SVE allocation into the initial allocation.
9577 We don't do this in aarch64_layout_frame to avoid pessimizing
9578 the epilogue code. */
9579 initial_adjust += sve_callee_adjust;
9580 sve_callee_adjust = 0;
9583 /* Sign return address for functions. */
9584 if (aarch64_return_address_signing_enabled ())
9586 switch (aarch64_ra_sign_key)
9588 case AARCH64_KEY_A:
9589 insn = emit_insn (gen_paciasp ());
9590 break;
9591 case AARCH64_KEY_B:
9592 insn = emit_insn (gen_pacibsp ());
9593 break;
9594 default:
9595 gcc_unreachable ();
9597 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9598 RTX_FRAME_RELATED_P (insn) = 1;
9601 /* Push return address to shadow call stack. */
9602 if (frame.is_scs_enabled)
9603 emit_insn (gen_scs_push ());
9605 if (flag_stack_usage_info)
9606 current_function_static_stack_size = constant_lower_bound (frame_size);
9608 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9610 if (crtl->is_leaf && !cfun->calls_alloca)
9612 if (maybe_gt (frame_size, PROBE_INTERVAL)
9613 && maybe_gt (frame_size, get_stack_check_protect ()))
9614 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9615 (frame_size
9616 - get_stack_check_protect ()));
9618 else if (maybe_gt (frame_size, 0))
9619 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9622 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9623 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9625 /* In theory we should never have both an initial adjustment
9626 and a callee save adjustment. Verify that is the case since the
9627 code below does not handle it for -fstack-clash-protection. */
9628 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9630 /* Will only probe if the initial adjustment is larger than the guard
9631 less the amount of the guard reserved for use by the caller's
9632 outgoing args. */
9633 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9634 force_isa_mode, true, false);
9636 if (callee_adjust != 0)
9637 aarch64_push_regs (reg1, reg2, callee_adjust);
9639 /* The offset of the current SP from the bottom of the static frame. */
9640 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9642 if (emit_frame_chain)
9644 /* The offset of the frame chain record (if any) from the current SP. */
9645 poly_int64 chain_offset = (initial_adjust + callee_adjust
9646 - frame.bytes_above_hard_fp);
9647 gcc_assert (known_ge (chain_offset, 0));
9649 gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
9650 if (callee_adjust == 0)
9651 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
9652 false, false);
9653 else
9654 gcc_assert (known_eq (chain_offset, 0));
9655 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9656 stack_pointer_rtx, chain_offset,
9657 tmp1_rtx, tmp0_rtx, force_isa_mode,
9658 frame_pointer_needed);
9659 if (frame_pointer_needed && !frame_size.is_constant ())
9661 /* Variable-sized frames need to describe the save slot
9662 address using DW_CFA_expression rather than DW_CFA_offset.
9663 This means that, without taking further action, the
9664 locations of the registers that we've already saved would
9665 remain based on the stack pointer even after we redefine
9666 the CFA based on the frame pointer. We therefore need new
9667 DW_CFA_expressions to re-express the save slots with addresses
9668 based on the frame pointer. */
9669 rtx_insn *insn = get_last_insn ();
9670 gcc_assert (RTX_FRAME_RELATED_P (insn));
9672 /* Add an explicit CFA definition if this was previously
9673 implicit. */
9674 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9676 rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9677 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9678 gen_rtx_SET (hard_frame_pointer_rtx, src));
9681 /* Change the save slot expressions for the registers that
9682 we've already saved. */
9683 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9684 hard_frame_pointer_rtx, UNITS_PER_WORD);
9685 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9686 hard_frame_pointer_rtx, 0);
9688 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9691 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
9692 emit_frame_chain);
9693 if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
9695 unsigned int saved_regs[] = { VG_REGNUM };
9696 aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
9697 emit_frame_chain);
9699 if (maybe_ne (sve_callee_adjust, 0))
9701 gcc_assert (!flag_stack_clash_protection
9702 || known_eq (initial_adjust, 0)
9703 /* The VG save isn't shrink-wrapped and so serves as
9704 a probe of the initial allocation. */
9705 || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
9706 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9707 sve_callee_adjust,
9708 force_isa_mode,
9709 !frame_pointer_needed, false);
9710 bytes_below_sp -= sve_callee_adjust;
9712 aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
9713 emit_frame_chain);
9714 aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
9715 emit_frame_chain);
9717 /* We may need to probe the final adjustment if it is larger than the guard
9718 that is assumed by the callee. */
9719 gcc_assert (known_eq (bytes_below_sp, final_adjust));
9720 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9721 force_isa_mode,
9722 !frame_pointer_needed, true);
9723 if (emit_frame_chain && maybe_ne (final_adjust, 0))
9724 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9726 /* Save the incoming value of PSTATE.SM, if required. Code further
9727 down does this for locally-streaming functions. */
9728 if (known_ge (frame.old_svcr_offset, 0)
9729 && !aarch64_cfun_enables_pstate_sm ())
9731 rtx mem = aarch64_old_svcr_mem ();
9732 MEM_VOLATILE_P (mem) = 1;
9733 if (TARGET_SME)
9735 rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
9736 emit_insn (gen_aarch64_read_svcr (reg));
9737 emit_move_insn (mem, reg);
9739 else
9741 rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
9742 auto &args = crtl->args.info;
9743 if (args.aapcs_ncrn > 0)
9745 old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
9746 emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
9748 if (args.aapcs_ncrn > 1)
9750 old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
9751 emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
9753 emit_insn (gen_aarch64_get_sme_state ());
9754 emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
9755 if (old_r0)
9756 emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
9757 if (old_r1)
9758 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
9762 /* Enable PSTATE.SM, if required. */
9763 if (aarch64_cfun_enables_pstate_sm ())
9765 rtx_insn *guard_label = nullptr;
9766 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9768 /* The current function is streaming-compatible. Save the
9769 original state of PSTATE.SM. */
9770 rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
9771 emit_insn (gen_aarch64_read_svcr (svcr));
9772 emit_move_insn (aarch64_old_svcr_mem (), svcr);
9773 guard_label = aarch64_guard_switch_pstate_sm (svcr,
9774 aarch64_isa_flags);
9776 aarch64_sme_mode_switch_regs args_switch;
9777 auto &args = crtl->args.info;
9778 for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
9780 rtx x = args.sme_mode_switch_args[i];
9781 args_switch.add_reg (GET_MODE (x), REGNO (x));
9783 args_switch.emit_prologue ();
9784 emit_insn (gen_aarch64_smstart_sm ());
9785 args_switch.emit_epilogue ();
9786 if (guard_label)
9787 emit_label (guard_label);
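/* Illustrative only, not part of GCC: a standalone check of the running
   BYTES_BELOW_SP bookkeeping in aarch64_expand_prologue, using made-up
   constant adjustments.  The function name and the sample numbers are
   assumptions for the example; the block is kept under "#if 0" so it never
   affects the build.  */
#if 0
#include <cassert>

static void
example_prologue_accounting (void)
{
  /* A hypothetical frame: 96 bytes of callee saves pushed with writeback
     and 112 bytes of outgoing arguments, no SVE saves.  */
  long long frame_size = 208, initial_adjust = 0, callee_adjust = 96;
  long long sve_callee_adjust = 0, final_adjust = 112;

  long long bytes_below_sp = frame_size - initial_adjust - callee_adjust;
  bytes_below_sp -= sve_callee_adjust;

  /* Mirrors the gcc_assert just before the final allocation above.  */
  assert (bytes_below_sp == final_adjust);
}
#endif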
9791 /* Return TRUE if we can use a simple_return insn.
9793 This function checks whether the callee-saved stack is empty, which
9794 means no restore actions are needed. The pro_and_epilogue pass will use
9795 this to check whether the shrink-wrapping optimization is feasible. */
9797 bool
9798 aarch64_use_return_insn_p (void)
9800 if (!reload_completed)
9801 return false;
9803 if (crtl->profile)
9804 return false;
9806 return known_eq (cfun->machine->frame.frame_size, 0);
9809 /* Generate the epilogue instructions for returning from a function.
9810 This is almost exactly the reverse of the prologue sequence, except
9811 that we need to insert barriers to avoid scheduling loads that read
9812 from a deallocated stack, and we optimize the unwind records by
9813 emitting them all together if possible. */
9814 void
9815 aarch64_expand_epilogue (rtx_call_insn *sibcall)
9817 aarch64_frame &frame = cfun->machine->frame;
9818 poly_int64 initial_adjust = frame.initial_adjust;
9819 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9820 poly_int64 final_adjust = frame.final_adjust;
9821 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9822 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
9823 unsigned reg1 = frame.wb_pop_candidate1;
9824 unsigned reg2 = frame.wb_pop_candidate2;
9825 rtx cfi_ops = NULL;
9826 rtx_insn *insn;
9827 /* A stack clash protection prologue may not have left EP0_REGNUM or
9828 EP1_REGNUM in a usable state. The same is true for allocations
9829 with an SVE component, since we then need both temporary registers
9830 for each allocation. For stack clash we are in a usable state if
9831 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9832 HOST_WIDE_INT guard_size
9833 = 1 << param_stack_clash_protection_guard_size;
9834 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9835 aarch64_feature_flags force_isa_mode = 0;
9836 if (aarch64_cfun_enables_pstate_sm ())
9837 force_isa_mode = AARCH64_FL_SM_ON;
9839 /* We can re-use the registers when:
9841 (a) the deallocation amount is the same as the corresponding
9842 allocation amount (which is false if we combine the initial
9843 and SVE callee save allocations in the prologue); and
9845 (b) the allocation amount doesn't need a probe (which is false
9846 if the amount is guard_size - guard_used_by_caller or greater).
9848 In such situations the register should remain live with the correct
9849 value. */
9850 bool can_inherit_p = (initial_adjust.is_constant ()
9851 && final_adjust.is_constant ()
9852 && (!flag_stack_clash_protection
9853 || (known_lt (initial_adjust,
9854 guard_size - guard_used_by_caller)
9855 && known_eq (sve_callee_adjust, 0))));
9857 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
9858 bool need_barrier_p
9859 = maybe_ne (get_frame_size ()
9860 + frame.saved_varargs_size, 0);
9862 /* Reset PSTATE.SM, if required. */
9863 if (aarch64_cfun_enables_pstate_sm ())
9865 rtx_insn *guard_label = nullptr;
9866 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9867 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
9868 aarch64_isa_flags);
9869 aarch64_sme_mode_switch_regs return_switch;
9870 if (sibcall)
9871 return_switch.add_call_args (sibcall);
9872 else if (crtl->return_rtx && REG_P (crtl->return_rtx))
9873 return_switch.add_reg (GET_MODE (crtl->return_rtx),
9874 REGNO (crtl->return_rtx));
9875 return_switch.emit_prologue ();
9876 emit_insn (gen_aarch64_smstop_sm ());
9877 return_switch.emit_epilogue ();
9878 if (guard_label)
9879 emit_label (guard_label);
9882 /* Emit a barrier to prevent loads from a deallocated stack. */
9883 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9884 || cfun->calls_alloca
9885 || crtl->calls_eh_return)
9887 aarch64_emit_stack_tie (stack_pointer_rtx);
9888 need_barrier_p = false;
9891 /* Restore the stack pointer from the frame pointer if it may not
9892 be the same as the stack pointer. */
9893 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9894 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9895 if (frame_pointer_needed
9896 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9897 /* If writeback is used when restoring callee-saves, the CFA
9898 is restored on the instruction doing the writeback. */
9899 aarch64_add_offset (Pmode, stack_pointer_rtx,
9900 hard_frame_pointer_rtx,
9901 -bytes_below_hard_fp + final_adjust,
9902 tmp1_rtx, tmp0_rtx, force_isa_mode,
9903 callee_adjust == 0);
9904 else
9905 /* The case where we need to re-use the register here is very rare, so
9906 avoid the complicated condition and just always emit a move if the
9907 immediate doesn't fit. */
9908 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
9910 /* Restore the vector registers before the predicate registers,
9911 so that we can use P4 as a temporary for big-endian SVE frames. */
9912 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
9913 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
9914 if (maybe_ne (sve_callee_adjust, 0))
9915 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
9916 force_isa_mode, true);
9918 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9919 restore x30, so we don't need to restore x30 again in the traditional
9920 way. */
9921 aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
9922 frame.saved_gprs, &cfi_ops);
9924 if (need_barrier_p)
9925 aarch64_emit_stack_tie (stack_pointer_rtx);
9927 if (callee_adjust != 0)
9928 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9930 /* If we have no register restore information, the CFA must have been
9931 defined in terms of the stack pointer since the end of the prologue. */
9932 gcc_assert (cfi_ops || !frame_pointer_needed);
9934 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9936 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9937 insn = get_last_insn ();
9938 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9939 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9940 RTX_FRAME_RELATED_P (insn) = 1;
9941 cfi_ops = NULL;
9944 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
9945 restrict the emit_move optimization to leaf functions. */
9946 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
9947 (!can_inherit_p || !crtl->is_leaf
9948 || df_regs_ever_live_p (EP0_REGNUM)));
9950 if (cfi_ops)
9952 /* Emit delayed restores and reset the CFA to be SP. */
9953 insn = get_last_insn ();
9954 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9955 REG_NOTES (insn) = cfi_ops;
9956 RTX_FRAME_RELATED_P (insn) = 1;
9959 /* Pop return address from shadow call stack. */
9960 if (frame.is_scs_enabled)
9962 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
9963 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
9965 insn = emit_insn (gen_scs_pop ());
9966 add_reg_note (insn, REG_CFA_RESTORE, reg);
9967 RTX_FRAME_RELATED_P (insn) = 1;
9970 /* Stack adjustment for exception handler. */
9971 if (crtl->calls_eh_return && !sibcall)
9973 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
9974 to unwind the stack and jump to the handler, otherwise
9975 skip this eh_return logic and continue with normal
9976 return after the label. We have already reset the CFA
9977 to be SP; letting the CFA move during this adjustment
9978 is just as correct as retaining the CFA from the body
9979 of the function. Therefore, do nothing special. */
9980 rtx_code_label *label = gen_label_rtx ();
9981 rtx x = aarch64_gen_compare_zero_and_branch (EQ, EH_RETURN_TAKEN_RTX,
9982 label);
9983 rtx jump = emit_jump_insn (x);
9984 JUMP_LABEL (jump) = label;
9985 LABEL_NUSES (label)++;
9986 emit_insn (gen_add2_insn (stack_pointer_rtx,
9987 EH_RETURN_STACKADJ_RTX));
9988 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
9989 emit_barrier ();
9990 emit_label (label);
9993 /* We prefer to emit the combined return/authenticate instruction RETAA;
9994 however, there are two cases in which we must instead emit an explicit
9995 authentication instruction.
9997 1) Sibcalls don't return in a normal way, so if we're about to call one
9998 we must authenticate.
10000 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10001 generating code for !TARGET_ARMV8_3 we can't use it and must
10002 explicitly authenticate.
10004 if (aarch64_return_address_signing_enabled ()
10005 && (sibcall || !TARGET_ARMV8_3))
10007 switch (aarch64_ra_sign_key)
10009 case AARCH64_KEY_A:
10010 insn = emit_insn (gen_autiasp ());
10011 break;
10012 case AARCH64_KEY_B:
10013 insn = emit_insn (gen_autibsp ());
10014 break;
10015 default:
10016 gcc_unreachable ();
10018 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10019 RTX_FRAME_RELATED_P (insn) = 1;
10022 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10023 if (!sibcall)
10024 emit_jump_insn (ret_rtx);
10027 /* Output code to add DELTA to the first argument, and then jump
10028 to FUNCTION. Used for C++ multiple inheritance. */
10029 static void
10030 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10031 HOST_WIDE_INT delta,
10032 HOST_WIDE_INT vcall_offset,
10033 tree function)
10035 /* The this pointer is always in x0. Note that this differs from
10036 Arm where the this pointer may be bumped to r1 if r0 is required
10037 to return a pointer to an aggregate. On AArch64 a result value
10038 pointer will be in x8. */
10039 int this_regno = R0_REGNUM;
10040 rtx this_rtx, temp0, temp1, addr, funexp;
10041 rtx_insn *insn;
10042 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10044 if (aarch_bti_enabled ())
10045 emit_insn (gen_bti_c());
10047 reload_completed = 1;
10048 emit_note (NOTE_INSN_PROLOGUE_END);
10050 this_rtx = gen_rtx_REG (Pmode, this_regno);
10051 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10052 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10054 if (vcall_offset == 0)
10055 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
10056 0, false);
10057 else
10059 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10061 addr = this_rtx;
10062 if (delta != 0)
10064 if (delta >= -256 && delta < 256)
10065 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10066 plus_constant (Pmode, this_rtx, delta));
10067 else
10068 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10069 temp1, temp0, 0, false);
10072 if (Pmode == ptr_mode)
10073 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10074 else
10075 aarch64_emit_move (temp0,
10076 gen_rtx_ZERO_EXTEND (Pmode,
10077 gen_rtx_MEM (ptr_mode, addr)));
10079 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10080 addr = plus_constant (Pmode, temp0, vcall_offset);
10081 else
10083 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10084 Pmode);
10085 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10088 if (Pmode == ptr_mode)
10089 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10090 else
10091 aarch64_emit_move (temp1,
10092 gen_rtx_SIGN_EXTEND (Pmode,
10093 gen_rtx_MEM (ptr_mode, addr)));
10095 emit_insn (gen_add2_insn (this_rtx, temp1));
10098 /* Generate a tail call to the target function. */
10099 if (!TREE_USED (function))
10101 assemble_external (function);
10102 TREE_USED (function) = 1;
10104 funexp = XEXP (DECL_RTL (function), 0);
10105 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10106 auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
10107 auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
10108 rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant);
10109 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10110 SIBLING_CALL_P (insn) = 1;
10112 insn = get_insns ();
10113 shorten_branches (insn);
10115 assemble_start_function (thunk, fnname);
10116 final_start_function (insn, file, 1);
10117 final (insn, file, 1);
10118 final_end_function ();
10119 assemble_end_function (thunk, fnname);
10121 /* Stop pretending to be a post-reload pass. */
10122 reload_completed = 0;
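/* Illustrative only, not part of GCC: a standalone sketch of the pointer
   adjustment that the thunk emitted above performs, written as plain C++.
   The function name is an assumption for the example and the vtable access
   uses ordinary pointer casts purely for illustration; the block is kept
   under "#if 0" so it never affects the build.  */
#if 0
static void *
example_thunk_adjust (void *this_ptr, long long delta, long long vcall_offset)
{
  /* First bump the this pointer by the fixed DELTA...  */
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      /* ...then add the adjustment stored in the vtable at VCALL_OFFSET,
	 where the vtable pointer lives at the adjusted this pointer.  */
      char *vtable = *(char **) p;
      p += *(long long *) (vtable + vcall_offset);
    }
  return p;
}
#endif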
10125 static bool
10126 aarch64_tls_referenced_p (rtx x)
10128 if (!TARGET_HAVE_TLS)
10129 return false;
10130 subrtx_iterator::array_type array;
10131 FOR_EACH_SUBRTX (iter, array, x, ALL)
10133 const_rtx x = *iter;
10134 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10135 return true;
10136 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10137 TLS offsets, not real symbol references. */
10138 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10139 iter.skip_subrtxes ();
10141 return false;
10145 static bool
10146 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10148 if (GET_CODE (x) == HIGH)
10149 return true;
10151 /* There's no way to calculate VL-based values using relocations. */
10152 subrtx_iterator::array_type array;
10153 HOST_WIDE_INT factor;
10154 FOR_EACH_SUBRTX (iter, array, x, ALL)
10155 if (GET_CODE (*iter) == CONST_POLY_INT
10156 || aarch64_sme_vq_unspec_p (x, &factor))
10157 return true;
10159 poly_int64 offset;
10160 rtx base = strip_offset_and_salt (x, &offset);
10161 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10163 /* We checked for POLY_INT_CST offsets above. */
10164 if (aarch64_classify_symbol (base, offset.to_constant ())
10165 != SYMBOL_FORCE_TO_MEM)
10166 return true;
10167 else
10168 /* Avoid generating a 64-bit relocation in ILP32; leave it
10169 to aarch64_expand_mov_immediate to handle it properly. */
10170 return mode != ptr_mode;
10173 return aarch64_tls_referenced_p (x);
10176 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10177 The expansion for a table switch is quite expensive due to the number
10178 of instructions, the table lookup and the hard-to-predict indirect jump.
10179 When optimizing for speed with -O3 or higher, use the per-core tuning if
10180 set; otherwise use tables for >= 11 cases as a tradeoff between size and
10181 performance. When optimizing for size, use 8 for the smallest code size. */
10183 static unsigned int
10184 aarch64_case_values_threshold (void)
10186 /* Use the specified limit for the number of cases before using jump
10187 tables at higher optimization levels. */
10188 if (optimize > 2
10189 && aarch64_tune_params.max_case_values != 0)
10190 return aarch64_tune_params.max_case_values;
10191 else
10192 return optimize_size ? 8 : 11;
10195 /* Return true if register REGNO is a valid index register.
10196 STRICT_P is true if REG_OK_STRICT is in effect. */
10198 bool
10199 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10201 if (!HARD_REGISTER_NUM_P (regno))
10203 if (!strict_p)
10204 return true;
10206 if (!reg_renumber)
10207 return false;
10209 regno = reg_renumber[regno];
10211 return GP_REGNUM_P (regno);
10214 /* Return true if register REGNO is a valid base register.
10215 STRICT_P is true if REG_OK_STRICT is in effect. */
10217 bool
10218 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10220 if (!HARD_REGISTER_NUM_P (regno))
10222 if (!strict_p)
10223 return true;
10225 if (!reg_renumber)
10226 return false;
10228 regno = reg_renumber[regno];
10231 /* The fake registers will be eliminated to either the stack or
10232 hard frame pointer, both of which are usually valid base registers.
10233 Reload deals with the cases where the eliminated form isn't valid. */
10234 return (GP_REGNUM_P (regno)
10235 || regno == SP_REGNUM
10236 || regno == FRAME_POINTER_REGNUM
10237 || regno == ARG_POINTER_REGNUM);
10240 /* Return true if X is a valid base register.
10241 STRICT_P is true if REG_OK_STRICT is in effect. */
10243 static bool
10244 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10246 if (!strict_p
10247 && SUBREG_P (x)
10248 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10249 x = SUBREG_REG (x);
10251 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10254 /* Return true if address offset is a valid index. If it is, fill in INFO
10255 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10257 static bool
10258 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10259 machine_mode mode, bool strict_p)
10261 enum aarch64_address_type type;
10262 rtx index;
10263 int shift;
10265 /* (reg:P) */
10266 if ((REG_P (x) || SUBREG_P (x))
10267 && GET_MODE (x) == Pmode)
10269 type = ADDRESS_REG_REG;
10270 index = x;
10271 shift = 0;
10273 /* (sign_extend:DI (reg:SI)) */
10274 else if ((GET_CODE (x) == SIGN_EXTEND
10275 || GET_CODE (x) == ZERO_EXTEND)
10276 && GET_MODE (x) == DImode
10277 && GET_MODE (XEXP (x, 0)) == SImode)
10279 type = (GET_CODE (x) == SIGN_EXTEND)
10280 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10281 index = XEXP (x, 0);
10282 shift = 0;
10284 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10285 else if (GET_CODE (x) == MULT
10286 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10287 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10288 && GET_MODE (XEXP (x, 0)) == DImode
10289 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10290 && CONST_INT_P (XEXP (x, 1)))
10292 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10293 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10294 index = XEXP (XEXP (x, 0), 0);
10295 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10297 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10298 else if (GET_CODE (x) == ASHIFT
10299 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10300 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10301 && GET_MODE (XEXP (x, 0)) == DImode
10302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10303 && CONST_INT_P (XEXP (x, 1)))
10305 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10306 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10307 index = XEXP (XEXP (x, 0), 0);
10308 shift = INTVAL (XEXP (x, 1));
10310 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10311 (const_int 0xffffffff<<shift)) */
10312 else if (GET_CODE (x) == AND
10313 && GET_MODE (x) == DImode
10314 && GET_CODE (XEXP (x, 0)) == MULT
10315 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10316 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10317 && CONST_INT_P (XEXP (x, 1)))
10319 type = ADDRESS_REG_UXTW;
10320 index = XEXP (XEXP (x, 0), 0);
10321 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10322 /* Avoid invoking undefined behaviour when the shift amount is -1. */
10323 if (shift != -1
10324 && INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10325 shift = -1;
10327 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10328 (const_int 0xffffffff<<shift)) */
10329 else if (GET_CODE (x) == AND
10330 && GET_MODE (x) == DImode
10331 && GET_CODE (XEXP (x, 0)) == ASHIFT
10332 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10333 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10334 && CONST_INT_P (XEXP (x, 1)))
10336 type = ADDRESS_REG_UXTW;
10337 index = XEXP (XEXP (x, 0), 0);
10338 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10339 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10340 shift = -1;
10342 /* (mult:P (reg:P) (const_int scale)) */
10343 else if (GET_CODE (x) == MULT
10344 && GET_MODE (x) == Pmode
10345 && GET_MODE (XEXP (x, 0)) == Pmode
10346 && CONST_INT_P (XEXP (x, 1)))
10348 type = ADDRESS_REG_REG;
10349 index = XEXP (x, 0);
10350 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10352 /* (ashift:P (reg:P) (const_int shift)) */
10353 else if (GET_CODE (x) == ASHIFT
10354 && GET_MODE (x) == Pmode
10355 && GET_MODE (XEXP (x, 0)) == Pmode
10356 && CONST_INT_P (XEXP (x, 1)))
10358 type = ADDRESS_REG_REG;
10359 index = XEXP (x, 0);
10360 shift = INTVAL (XEXP (x, 1));
10362 else
10363 return false;
10365 if (!strict_p
10366 && SUBREG_P (index)
10367 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10368 index = SUBREG_REG (index);
10370 if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
10372 if (type != ADDRESS_REG_REG
10373 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10374 return false;
10376 else
10378 if (shift != 0
10379 && !(IN_RANGE (shift, 1, 3)
10380 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10381 return false;
10384 if (REG_P (index)
10385 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10387 info->type = type;
10388 info->offset = index;
10389 info->shift = shift;
10390 return true;
10393 return false;
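/* Illustrative only, not part of GCC: a standalone restatement of the
   shift range check near the end of aarch64_classify_index above for
   non-SVE modes, with a plain integer access size.  The helper name is an
   assumption for the example; the block is kept under "#if 0" so it never
   affects the build.  */
#if 0
static bool
example_index_shift_ok_p (int shift, long long access_size)
{
  if (shift == 0)
    return true;			/* [base, index]  */
  return (shift >= 1 && shift <= 3	/* [base, index, LSL #shift]  */
	  && (1LL << shift) == access_size);
}
#endif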
10396 /* Return true if MODE is one of the modes for which we
10397 support LDP/STP operations. */
10399 static bool
10400 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10402 return mode == SImode || mode == DImode
10403 || mode == SFmode || mode == DFmode
10404 || mode == SDmode || mode == DDmode
10405 || (aarch64_vector_mode_supported_p (mode)
10406 && (known_eq (GET_MODE_SIZE (mode), 8)
10407 || known_eq (GET_MODE_SIZE (mode), 16)));
10410 /* Return true if REGNO is a virtual pointer register, or an eliminable
10411 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10412 include stack_pointer or hard_frame_pointer. */
10413 static bool
10414 virt_or_elim_regno_p (unsigned regno)
10416 return ((regno >= FIRST_VIRTUAL_REGISTER
10417 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10418 || regno == FRAME_POINTER_REGNUM
10419 || regno == ARG_POINTER_REGNUM);
10422 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10423 If it is, fill in INFO appropriately. STRICT_P is true if
10424 REG_OK_STRICT is in effect. */
10426 bool
10427 aarch64_classify_address (struct aarch64_address_info *info,
10428 rtx x, machine_mode mode, bool strict_p,
10429 aarch64_addr_query_type type)
10431 enum rtx_code code = GET_CODE (x);
10432 rtx op0, op1;
10433 poly_int64 offset;
10435 HOST_WIDE_INT const_size;
10437 /* Whether a vector mode is partial doesn't affect address legitimacy.
10438 Partial vectors like VNx8QImode allow the same indexed addressing
10439 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10440 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10441 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10442 vec_flags &= ~VEC_PARTIAL;
10444 /* On BE, we use load/store pair for all large int mode load/stores.
10445 TI/TF/TDmode may also use a load/store pair. */
10446 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10447 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10448 || type == ADDR_QUERY_LDP_STP_N
10449 || mode == TImode
10450 || mode == TFmode
10451 || mode == TDmode
10452 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10453 && advsimd_struct_p));
10454 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10455 corresponds to the actual size of the memory being loaded/stored, and the
10456 mode used to validate the addressing mode is half of that. */
10457 if (type == ADDR_QUERY_LDP_STP_N)
10459 if (known_eq (GET_MODE_SIZE (mode), 32))
10460 mode = V16QImode;
10461 else if (known_eq (GET_MODE_SIZE (mode), 16))
10462 mode = DFmode;
10463 else if (known_eq (GET_MODE_SIZE (mode), 8))
10464 mode = SFmode;
10465 else
10466 return false;
10468 /* This isn't really an Advanced SIMD struct mode, but a mode
10469 used to represent the complete mem in a load/store pair. */
10470 advsimd_struct_p = false;
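/* For example, an LDP of two X registers accesses 16 bytes in total, but each
   register of the pair covers only 8 bytes, so the offset is validated as if
   it were a single DFmode access.  */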
10473 bool allow_reg_index_p = (!load_store_pair_p
10474 && ((vec_flags == 0
10475 && known_lt (GET_MODE_SIZE (mode), 16))
10476 || vec_flags == VEC_ADVSIMD
10477 || vec_flags & VEC_SVE_DATA
10478 || mode == VNx1TImode));
10480 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10481 The latter is not valid for SVE predicates, and that's rejected through
10482 allow_reg_index_p above. */
10483 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10484 && (code != REG && code != PLUS))
10485 return false;
10487 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10488 REG addressing. */
10489 if (advsimd_struct_p
10490 && TARGET_SIMD
10491 && !BYTES_BIG_ENDIAN
10492 && (code != POST_INC && code != REG))
10493 return false;
10495 gcc_checking_assert (GET_MODE (x) == VOIDmode
10496 || SCALAR_INT_MODE_P (GET_MODE (x)));
10498 switch (code)
10500 case REG:
10501 case SUBREG:
10502 info->type = ADDRESS_REG_IMM;
10503 info->base = x;
10504 info->offset = const0_rtx;
10505 info->const_offset = 0;
10506 return aarch64_base_register_rtx_p (x, strict_p);
10508 case PLUS:
10509 op0 = XEXP (x, 0);
10510 op1 = XEXP (x, 1);
10512 if (! strict_p
10513 && REG_P (op0)
10514 && virt_or_elim_regno_p (REGNO (op0))
10515 && poly_int_rtx_p (op1, &offset))
10517 info->type = ADDRESS_REG_IMM;
10518 info->base = op0;
10519 info->offset = op1;
10520 info->const_offset = offset;
10522 return true;
10525 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10526 && aarch64_base_register_rtx_p (op0, strict_p)
10527 && poly_int_rtx_p (op1, &offset))
10529 info->type = ADDRESS_REG_IMM;
10530 info->base = op0;
10531 info->offset = op1;
10532 info->const_offset = offset;
10534 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10535 registers and individual Q registers. The available
10536 address modes are:
10537 X,X: 7-bit signed scaled offset
10538 Q: 9-bit signed offset
10539 We conservatively require an offset representable in either mode.
10540 When performing the check for pairs of X registers i.e. LDP/STP
10541 pass down DImode since that is the natural size of the LDP/STP
10542 instruction memory accesses. */
10543 if (mode == TImode || mode == TFmode || mode == TDmode)
10544 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10545 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10546 || offset_12bit_unsigned_scaled_p (mode, offset)));
10548 if (mode == V8DImode)
10549 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10550 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10552 /* A 7-bit offset check because OImode will emit an ldp/stp
10553 instruction (only !TARGET_SIMD or big endian will get here).
10554 For ldp/stp instructions, the offset is scaled for the size of a
10555 single element of the pair. */
10556 if (aarch64_advsimd_partial_struct_mode_p (mode)
10557 && known_eq (GET_MODE_SIZE (mode), 16))
10558 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10559 if (aarch64_advsimd_full_struct_mode_p (mode)
10560 && known_eq (GET_MODE_SIZE (mode), 32))
10561 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10563 /* Three 9/12-bit offset checks because CImode will emit three
10564 ldr/str instructions (only !TARGET_SIMD or big endian will
10565 get here). */
10566 if (aarch64_advsimd_partial_struct_mode_p (mode)
10567 && known_eq (GET_MODE_SIZE (mode), 24))
10568 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10569 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10570 offset + 16)
10571 || offset_12bit_unsigned_scaled_p (DImode,
10572 offset + 16)));
10573 if (aarch64_advsimd_full_struct_mode_p (mode)
10574 && known_eq (GET_MODE_SIZE (mode), 48))
10575 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10576 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10577 offset + 32)
10578 || offset_12bit_unsigned_scaled_p (TImode,
10579 offset + 32)));
10581 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10582 instructions (only big endian will get here). */
10583 if (aarch64_advsimd_partial_struct_mode_p (mode)
10584 && known_eq (GET_MODE_SIZE (mode), 32))
10585 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10586 && aarch64_offset_7bit_signed_scaled_p (DImode,
10587 offset + 16));
10588 if (aarch64_advsimd_full_struct_mode_p (mode)
10589 && known_eq (GET_MODE_SIZE (mode), 64))
10590 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10591 && aarch64_offset_7bit_signed_scaled_p (TImode,
10592 offset + 32));
10594 /* Make "m" use the LD1 offset range for SVE data modes, so
10595 that pre-RTL optimizers like ivopts will work to that range
10596 instead of the wider LDR/STR range. */
10597 if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
10598 return (type == ADDR_QUERY_M
10599 ? offset_4bit_signed_scaled_p (mode, offset)
10600 : offset_9bit_signed_scaled_p (mode, offset));
10602 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10604 poly_int64 end_offset = (offset
10605 + GET_MODE_SIZE (mode)
10606 - BYTES_PER_SVE_VECTOR);
10607 return (type == ADDR_QUERY_M
10608 ? offset_4bit_signed_scaled_p (mode, offset)
10609 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10610 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10611 end_offset)));
10614 if (vec_flags == VEC_SVE_PRED)
10615 return offset_9bit_signed_scaled_p (mode, offset);
10617 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
10619 poly_int64 end_offset = (offset
10620 + GET_MODE_SIZE (mode)
10621 - BYTES_PER_SVE_PRED);
10622 return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
10623 && offset_9bit_signed_scaled_p (VNx16BImode, offset));
10626 if (load_store_pair_p)
10627 return ((known_eq (GET_MODE_SIZE (mode), 4)
10628 || known_eq (GET_MODE_SIZE (mode), 8)
10629 || known_eq (GET_MODE_SIZE (mode), 16))
10630 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10631 else
10632 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10633 || offset_12bit_unsigned_scaled_p (mode, offset));
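/* For a DImode access, for example, the signed 9-bit unscaled range is
   [-256, 255] (LDUR/STUR) and the unsigned 12-bit scaled range is
   [0, 32760] in multiples of 8 (LDR/STR).  */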
10636 if (allow_reg_index_p)
10638 /* Look for base + (scaled/extended) index register. */
10639 if (aarch64_base_register_rtx_p (op0, strict_p)
10640 && aarch64_classify_index (info, op1, mode, strict_p))
10642 info->base = op0;
10643 return true;
10645 if (aarch64_base_register_rtx_p (op1, strict_p)
10646 && aarch64_classify_index (info, op0, mode, strict_p))
10648 info->base = op1;
10649 return true;
10653 return false;
10655 case POST_INC:
10656 case POST_DEC:
10657 case PRE_INC:
10658 case PRE_DEC:
10659 info->type = ADDRESS_REG_WB;
10660 info->base = XEXP (x, 0);
10661 info->offset = NULL_RTX;
10662 return aarch64_base_register_rtx_p (info->base, strict_p);
10664 case POST_MODIFY:
10665 case PRE_MODIFY:
10666 info->type = ADDRESS_REG_WB;
10667 info->base = XEXP (x, 0);
10668 if (GET_CODE (XEXP (x, 1)) == PLUS
10669 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10670 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10671 && aarch64_base_register_rtx_p (info->base, strict_p))
10673 info->offset = XEXP (XEXP (x, 1), 1);
10674 info->const_offset = offset;
10676 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10677 registers and individual Q registers. The available
10678 address modes are:
10679 X,X: 7-bit signed scaled offset
10680 Q: 9-bit signed offset
10681 We conservatively require an offset representable in either mode. */
10683 if (mode == TImode || mode == TFmode || mode == TDmode)
10684 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10685 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10687 if (load_store_pair_p)
10688 return ((known_eq (GET_MODE_SIZE (mode), 4)
10689 || known_eq (GET_MODE_SIZE (mode), 8)
10690 || known_eq (GET_MODE_SIZE (mode), 16))
10691 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10692 else
10693 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10695 return false;
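/* For example, (post_modify (reg x0) (plus (reg x0) (const_int 16)))
   corresponds to a post-indexed access such as ldr x1, [x0], #16, while the
   pair form ldp x1, x2, [x0], #16 must also satisfy the scaled 7-bit range.  */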
10697 case CONST:
10698 case SYMBOL_REF:
10699 case LABEL_REF:
10700 /* load literal: pc-relative constant pool entry. Only supported
10701 for SI mode or larger. */
10702 info->type = ADDRESS_SYMBOLIC;
10704 if (!load_store_pair_p
10705 && GET_MODE_SIZE (mode).is_constant (&const_size)
10706 && const_size >= 4)
10708 poly_int64 offset;
10709 rtx sym = strip_offset_and_salt (x, &offset);
10710 return ((LABEL_REF_P (sym)
10711 || (SYMBOL_REF_P (sym)
10712 && CONSTANT_POOL_ADDRESS_P (sym)
10713 && aarch64_pcrelative_literal_loads)));
10715 return false;
10717 case LO_SUM:
10718 info->type = ADDRESS_LO_SUM;
10719 info->base = XEXP (x, 0);
10720 info->offset = XEXP (x, 1);
10721 if (allow_reg_index_p
10722 && aarch64_base_register_rtx_p (info->base, strict_p))
10724 poly_int64 offset;
10725 HOST_WIDE_INT const_offset;
10726 rtx sym = strip_offset_and_salt (info->offset, &offset);
10727 if (SYMBOL_REF_P (sym)
10728 && offset.is_constant (&const_offset)
10729 && (aarch64_classify_symbol (sym, const_offset)
10730 == SYMBOL_SMALL_ABSOLUTE))
10732 /* The symbol and offset must be aligned to the access size. */
10733 unsigned int align;
10735 if (CONSTANT_POOL_ADDRESS_P (sym))
10736 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10737 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10739 tree exp = SYMBOL_REF_DECL (sym);
10740 align = TYPE_ALIGN (TREE_TYPE (exp));
10741 align = aarch64_constant_alignment (exp, align);
10743 else if (SYMBOL_REF_DECL (sym))
10744 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10745 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10746 && SYMBOL_REF_BLOCK (sym) != NULL)
10747 align = SYMBOL_REF_BLOCK (sym)->alignment;
10748 else
10749 align = BITS_PER_UNIT;
10751 poly_int64 ref_size = GET_MODE_SIZE (mode);
10752 if (known_eq (ref_size, 0))
10753 ref_size = GET_MODE_SIZE (DImode);
10755 return (multiple_p (const_offset, ref_size)
10756 && multiple_p (align / BITS_PER_UNIT, ref_size));
10759 return false;
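/* LO_SUM addresses implement the small-code-model ADRP sequence, e.g.:
     adrp x0, sym
     ldr  w1, [x0, #:lo12:sym]
   The :lo12: offset is only usable if the symbol plus offset is suitably
   aligned for the access size, hence the alignment checks above.  */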
10761 default:
10762 return false;
10766 /* Return true if the address X is valid for a PRFM instruction.
10767 STRICT_P is true if we should do strict checking with
10768 aarch64_classify_address. */
10770 bool
10771 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10773 struct aarch64_address_info addr;
10775 /* PRFM accepts the same addresses as DImode... */
10776 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10777 if (!res)
10778 return false;
10780 /* ... except writeback forms. */
10781 return addr.type != ADDRESS_REG_WB;
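/* PRFM has no writeback forms: prfm pldl1keep, [x0, #8] is valid, but there
   is no pre- or post-indexed variant.  */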
10784 bool
10785 aarch64_symbolic_address_p (rtx x)
10787 poly_int64 offset;
10788 x = strip_offset_and_salt (x, &offset);
10789 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10792 /* Classify the base of symbolic expression X. */
10794 enum aarch64_symbol_type
10795 aarch64_classify_symbolic_expression (rtx x)
10797 rtx offset;
10799 split_const (x, &x, &offset);
10800 return aarch64_classify_symbol (x, INTVAL (offset));
10804 /* Return TRUE if X is a legitimate address for accessing memory in
10805 mode MODE. */
10806 static bool
10807 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
10808 code_helper = ERROR_MARK)
10810 struct aarch64_address_info addr;
10812 return aarch64_classify_address (&addr, x, mode, strict_p);
10815 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10816 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10817 bool
10818 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10819 aarch64_addr_query_type type)
10821 struct aarch64_address_info addr;
10823 return aarch64_classify_address (&addr, x, mode, strict_p, type);
10826 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10828 static bool
10829 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10830 poly_int64 orig_offset,
10831 machine_mode mode)
10833 HOST_WIDE_INT size;
10834 if (GET_MODE_SIZE (mode).is_constant (&size))
10836 HOST_WIDE_INT const_offset, second_offset;
10838 /* A general SVE offset is A * VQ + B. Remove the A component from
10839 coefficient 0 in order to get the constant B. */
10840 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10842 /* Split an out-of-range address displacement into a base and
10843 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10844 range otherwise to increase opportunities for sharing the same base
10845 address between accesses of different sizes. Unaligned accesses use the signed
10846 9-bit range, TImode/TFmode/TDmode use the intersection of signed
10847 scaled 7-bit and signed 9-bit offset. */
10848 if (mode == TImode || mode == TFmode || mode == TDmode)
10849 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10850 else if ((const_offset & (size - 1)) != 0)
10851 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10852 else
10853 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10855 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10856 return false;
10858 /* Split the offset into second_offset and the rest. */
10859 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10860 *offset2 = gen_int_mode (second_offset, Pmode);
10861 return true;
10863 else
10865 /* Get the mode we should use as the basis of the range. For structure
10866 modes this is the mode of one vector. */
10867 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10868 machine_mode step_mode
10869 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10871 /* Get the "mul vl" multiplier we'd like to use. */
10872 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10873 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10874 if (vec_flags & VEC_SVE_DATA)
10875 /* LDR supports a 9-bit range, but the move patterns for
10876 structure modes require all vectors to be in range of the
10877 same base. The simplest way of accommodating that while still
10878 promoting reuse of anchor points between different modes is
10879 to use an 8-bit range unconditionally. */
10880 vnum = ((vnum + 128) & 255) - 128;
10881 else
10882 /* Predicates are only handled singly, so we might as well use
10883 the full range. */
10884 vnum = ((vnum + 256) & 511) - 256;
10885 if (vnum == 0)
10886 return false;
10888 /* Convert the "mul vl" multiplier into a byte offset. */
10889 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10890 if (known_eq (second_offset, orig_offset))
10891 return false;
10893 /* Split the offset into second_offset and the rest. */
10894 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10895 *offset2 = gen_int_mode (second_offset, Pmode);
10896 return true;
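/* As an example of the constant-size path above, a DImode access at constant
   offset 0x10010 is split into an anchor offset of 0x10000 plus a residual
   offset of 0x10 (0x10010 & 0x3ffc), so that the anchor can be shared with
   other nearby accesses.  */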
10900 /* Return the binary representation of floating point constant VALUE in INTVAL.
10901 If the value cannot be converted, return false without setting INTVAL.
10902 The conversion is done in the given MODE. */
10903 bool
10904 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10907 /* We make a general exception for 0. */
10908 if (aarch64_float_const_zero_rtx_p (value))
10910 *intval = 0;
10911 return true;
10914 scalar_float_mode mode;
10915 if (!CONST_DOUBLE_P (value)
10916 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
10917 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10918 /* Only support up to DF mode. */
10919 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10920 return false;
10922 unsigned HOST_WIDE_INT ival = 0;
10924 long res[2];
10925 real_to_target (res,
10926 CONST_DOUBLE_REAL_VALUE (value),
10927 REAL_MODE_FORMAT (mode));
10929 if (mode == DFmode || mode == DDmode)
10931 int order = BYTES_BIG_ENDIAN ? 1 : 0;
10932 ival = zext_hwi (res[order], 32);
10933 ival |= (zext_hwi (res[1 - order], 32) << 32);
10935 else
10936 ival = zext_hwi (res[0], 32);
10938 *intval = ival;
10939 return true;
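/* For example, the DFmode constant 1.0 produces 0x3ff0000000000000 and the
   SFmode constant 1.0 produces 0x3f800000.  */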
10942 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10943 single MOV(+MOVK) followed by an FMOV. */
10944 bool
10945 aarch64_float_const_rtx_p (rtx x)
10947 machine_mode mode = GET_MODE (x);
10948 if (mode == VOIDmode)
10949 return false;
10951 /* Determine whether it's cheaper to write float constants as
10952 mov/movk pairs over ldr/adrp pairs. */
10953 unsigned HOST_WIDE_INT ival;
10955 if (CONST_DOUBLE_P (x)
10956 && SCALAR_FLOAT_MODE_P (mode)
10957 && aarch64_reinterpret_float_as_int (x, &ival))
10959 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
10960 int num_instr = aarch64_internal_mov_immediate
10961 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10962 return num_instr < 3;
10965 return false;
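/* For example, double 1.0 (0x3ff0000000000000) needs only a single MOVZ of
   the top 16 bits followed by an FMOV, which is cheaper than an ADRP/LDR
   from the constant pool.  */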
10968 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
10969 Floating Point). */
10970 bool
10971 aarch64_float_const_zero_rtx_p (rtx x)
10973 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
10974 zr as our callers expect, so no need to check the actual
10975 value if X is of Decimal Floating Point type. */
10976 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
10977 return false;
10979 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
10980 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
10981 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
10984 /* Return true if X is any kind of constant zero rtx. */
10986 bool
10987 aarch64_const_zero_rtx_p (rtx x)
10989 return (x == CONST0_RTX (GET_MODE (x))
10990 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
10993 /* Return TRUE if rtx X is an immediate constant that fits in a single
10994 MOVI immediate operation. */
10995 bool
10996 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
10998 if (!TARGET_SIMD)
10999 return false;
11001 machine_mode vmode;
11002 scalar_int_mode imode;
11003 unsigned HOST_WIDE_INT ival;
11005 if (CONST_DOUBLE_P (x)
11006 && SCALAR_FLOAT_MODE_P (mode))
11008 if (!aarch64_reinterpret_float_as_int (x, &ival))
11009 return false;
11011 /* We make a general exception for 0. */
11012 if (aarch64_float_const_zero_rtx_p (x))
11013 return true;
11015 imode = int_mode_for_mode (mode).require ();
11017 else if (CONST_INT_P (x)
11018 && is_a <scalar_int_mode> (mode, &imode))
11019 ival = INTVAL (x);
11020 else
11021 return false;
11023 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
11024 use a 128-bit vector mode. */
11025 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11027 vmode = aarch64_simd_container_mode (imode, width);
11028 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11030 return aarch64_simd_valid_immediate (v_op, NULL);
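/* For example, the DImode value 0x00ff00ff00ff00ff is accepted because MOVI
   has a 64-bit "bytemask" immediate form in which every byte is either 0x00
   or 0xff; an arbitrary 64-bit constant generally is not.  */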
11034 /* Return the fixed registers used for condition codes. */
11036 static bool
11037 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11039 *p1 = CC_REGNUM;
11040 *p2 = INVALID_REGNUM;
11041 return true;
11044 /* Return a fresh memory reference to the current function's TPIDR2 block,
11045 creating a block if necessary. */
11047 static rtx
11048 aarch64_get_tpidr2_block ()
11050 if (!cfun->machine->tpidr2_block)
11051 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
11052 boundary. */
11053 cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
11054 return copy_rtx (cfun->machine->tpidr2_block);
11057 /* Return a fresh register that points to the current function's
11058 TPIDR2 block, creating a block if necessary. */
11060 static rtx
11061 aarch64_get_tpidr2_ptr ()
11063 rtx block = aarch64_get_tpidr2_block ();
11064 return force_reg (Pmode, XEXP (block, 0));
11067 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
11068 current function's TPIDR2 block. */
11070 static void
11071 aarch64_init_tpidr2_block ()
11073 rtx block = aarch64_get_tpidr2_block ();
11075 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
11076 rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
11077 rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
11078 rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
11079 svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
11080 rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
11081 BITS_PER_UNIT, -1, true);
11082 za_save_buffer = force_reg (Pmode, za_save_buffer);
11083 cfun->machine->za_save_buffer = za_save_buffer;
11085 /* The first word of the block points to the save buffer and the second
11086 word is the number of ZA slices to save. */
11087 rtx block_0 = adjust_address (block, DImode, 0);
11088 emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
11090 if (!memory_operand (block, V16QImode))
11091 block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
11092 emit_insn (gen_aarch64_setup_local_tpidr2 (block));
11095 /* Restore the contents of ZA from the lazy save buffer, given that
11096 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
11097 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11099 void
11100 aarch64_restore_za (rtx tpidr2_block)
11102 emit_insn (gen_aarch64_smstart_za ());
11103 if (REGNO (tpidr2_block) != R0_REGNUM)
11104 emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
11105 emit_insn (gen_aarch64_tpidr2_restore ());
11108 /* Return the ZT0 save buffer, creating one if necessary. */
11110 static rtx
11111 aarch64_get_zt0_save_buffer ()
11113 if (!cfun->machine->zt0_save_buffer)
11114 cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
11115 return cfun->machine->zt0_save_buffer;
11118 /* Save ZT0 to the current function's save buffer. */
11120 static void
11121 aarch64_save_zt0 ()
11123 rtx mem = aarch64_get_zt0_save_buffer ();
11124 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11125 emit_insn (gen_aarch64_sme_str_zt0 (mem));
11128 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11129 is true if the load is happening after a call to a private-ZA function,
11130 false if it can be treated as a normal load. */
11132 static void
11133 aarch64_restore_zt0 (bool from_lazy_save_p)
11135 rtx mem = aarch64_get_zt0_save_buffer ();
11136 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11137 emit_insn (from_lazy_save_p
11138 ? gen_aarch64_restore_zt0 (mem)
11139 : gen_aarch64_sme_ldr_zt0 (mem));
11142 /* Implement TARGET_START_CALL_ARGS. */
11144 static void
11145 aarch64_start_call_args (cumulative_args_t ca_v)
11147 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11149 if (!TARGET_SME && (ca->isa_mode & AARCH64_FL_SM_ON))
11151 error ("calling a streaming function requires the ISA extension %qs",
11152 "sme");
11153 inform (input_location, "you can enable %qs using the command-line"
11154 " option %<-march%>, or by using the %<target%>"
11155 " attribute or pragma", "sme");
11158 if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11159 && !aarch64_cfun_has_state ("za"))
11160 error ("call to a function that shares %qs state from a function"
11161 " that has no %qs state", "za", "za");
11162 else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11163 && !aarch64_cfun_has_state ("zt0"))
11164 error ("call to a function that shares %qs state from a function"
11165 " that has no %qs state", "zt0", "zt0");
11166 else if (!TARGET_ZA && (ca->isa_mode & AARCH64_FL_ZA_ON))
11167 error ("call to a function that shares SME state from a function"
11168 " that has no SME state");
11170 /* If this is a call to a private ZA function, emit a marker to
11171 indicate where any necessary set-up code could be inserted.
11172 The code itself is inserted by the mode-switching pass. */
11173 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11174 emit_insn (gen_aarch64_start_private_za_call ());
11176 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11177 save and restore ZT0 around the call. */
11178 if (aarch64_cfun_has_state ("zt0")
11179 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11180 && ca->shared_zt0_flags == 0)
11181 aarch64_save_zt0 ();
11184 /* This function is used by the call expanders of the machine description.
11185 RESULT is the register in which the result is returned. It's NULL for
11186 "call" and "sibcall".
11187 MEM is the location of the function call.
11188 COOKIE is either:
11189 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11190 - a PARALLEL that contains such a const_int as its first element.
11191 The second element is a PARALLEL that lists all the argument
11192 registers that need to be saved and restored around a change
11193 in PSTATE.SM, or const0_rtx if no such switch is needed.
11194 The third and fourth elements are const_ints that contain the
11195 sharing flags for ZA and ZT0 respectively.
11196 SIBCALL indicates whether this function call is a normal call or a sibling
11197 call; a different pattern is generated accordingly. */
11199 void
11200 aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
11202 rtx call, callee, tmp;
11203 rtvec vec;
11204 machine_mode mode;
11206 rtx callee_abi = cookie;
11207 rtx sme_mode_switch_args = const0_rtx;
11208 unsigned int shared_za_flags = 0;
11209 unsigned int shared_zt0_flags = 0;
11210 if (GET_CODE (cookie) == PARALLEL)
11212 callee_abi = XVECEXP (cookie, 0, 0);
11213 sme_mode_switch_args = XVECEXP (cookie, 0, 1);
11214 shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
11215 shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
11218 gcc_assert (CONST_INT_P (callee_abi));
11219 auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
11221 if (aarch64_cfun_has_state ("za")
11222 && (callee_isa_mode & AARCH64_FL_ZA_ON)
11223 && !shared_za_flags)
11225 sorry ("call to a function that shares state other than %qs"
11226 " from a function that has %qs state", "za", "za");
11227 inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
11228 " callee preserves ZA");
11231 gcc_assert (MEM_P (mem));
11232 callee = XEXP (mem, 0);
11233 mode = GET_MODE (callee);
11234 gcc_assert (mode == Pmode);
11236 /* Decide if we should generate indirect calls by loading the
11237 address of the callee into a register before performing
11238 the branch-and-link. */
11239 if (SYMBOL_REF_P (callee)
11240 ? (aarch64_is_long_call_p (callee)
11241 || aarch64_is_noplt_call_p (callee))
11242 : !REG_P (callee))
11243 XEXP (mem, 0) = force_reg (mode, callee);
11245 /* Accumulate the return values, including state that is shared via
11246 attributes. */
11247 auto_vec<rtx, 8> return_values;
11248 if (result)
11250 if (GET_CODE (result) == PARALLEL)
11251 for (int i = 0; i < XVECLEN (result, 0); ++i)
11252 return_values.safe_push (XVECEXP (result, 0, i));
11253 else
11254 return_values.safe_push (result);
11256 unsigned int orig_num_return_values = return_values.length ();
11257 if (shared_za_flags & AARCH64_STATE_OUT)
11258 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11259 /* When calling private-ZA functions from functions with ZA state,
11260 we want to know whether the call committed a lazy save. */
11261 if (TARGET_ZA && !shared_za_flags)
11262 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11263 if (shared_zt0_flags & AARCH64_STATE_OUT)
11264 return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
11266 /* Create the new return value, if necessary. */
11267 if (orig_num_return_values != return_values.length ())
11269 if (return_values.length () == 1)
11270 result = return_values[0];
11271 else
11273 for (rtx &x : return_values)
11274 if (GET_CODE (x) != EXPR_LIST)
11275 x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
11276 rtvec v = gen_rtvec_v (return_values.length (),
11277 return_values.address ());
11278 result = gen_rtx_PARALLEL (VOIDmode, v);
11282 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11284 if (result != NULL_RTX)
11285 call = gen_rtx_SET (result, call);
11287 if (sibcall)
11288 tmp = ret_rtx;
11289 else
11290 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11292 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11293 UNSPEC_CALLEE_ABI);
11295 vec = gen_rtvec (3, call, callee_abi, tmp);
11296 call = gen_rtx_PARALLEL (VOIDmode, vec);
11298 auto call_insn = aarch64_emit_call_insn (call);
11300 /* Check whether the call requires a change to PSTATE.SM. We can't
11301 emit the instructions to change PSTATE.SM yet, since they involve
11302 a change in vector length and a change in instruction set, which
11303 cannot be represented in RTL.
11305 For now, just record which registers will be clobbered and used
11306 by the changes to PSTATE.SM. */
11307 if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
11309 aarch64_sme_mode_switch_regs args_switch;
11310 if (sme_mode_switch_args != const0_rtx)
11312 unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
11313 for (unsigned int i = 0; i < num_args; ++i)
11315 rtx x = XVECEXP (sme_mode_switch_args, 0, i);
11316 args_switch.add_reg (GET_MODE (x), REGNO (x));
11320 aarch64_sme_mode_switch_regs result_switch;
11321 if (result)
11322 result_switch.add_call_result (call_insn);
11324 unsigned int num_gprs = MAX (args_switch.num_gprs (),
11325 result_switch.num_gprs ());
11326 for (unsigned int i = 0; i < num_gprs; ++i)
11327 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11328 gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
11330 for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
11331 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11332 gen_rtx_REG (V4x16QImode, regno));
11334 for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
11335 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11336 gen_rtx_REG (VNx16BImode, regno));
11338 /* Ensure that the VG save slot has been initialized. Also emit
11339 an instruction to model the effect of the temporary clobber
11340 of VG, so that the prologue/epilogue pass sees the need to
11341 save the old value. */
11342 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11343 gen_rtx_REG (DImode, VG_REGNUM));
11344 emit_insn_before (gen_aarch64_update_vg (), call_insn);
11346 cfun->machine->call_switches_pstate_sm = true;
11349 /* Add any ZA-related information.
11351 ZA_REGNUM represents the current function's ZA state, rather than
11352 the contents of the ZA register itself. We ensure that the function's
11353 ZA state is preserved by private-ZA call sequences, so the call itself
11354 does not use or clobber ZA_REGNUM. The same thing applies to
11355 ZT0_REGNUM. */
11356 if (TARGET_ZA)
11358 /* The callee requires ZA to be active if the callee is shared-ZA,
11359 otherwise it requires ZA to be dormant or off. The state of ZA is
11360 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11361 and ZA_SAVED_REGNUM. */
11362 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11363 gen_rtx_REG (DImode, SME_STATE_REGNUM));
11364 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11365 gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
11366 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11367 gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11369 /* Keep the aarch64_start/end_private_za_call markers live. */
11370 if (!(callee_isa_mode & AARCH64_FL_ZA_ON))
11371 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11372 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
11374 /* If the callee is a shared-ZA function, record whether it uses the
11375 current value of ZA and ZT0. */
11376 if (shared_za_flags & AARCH64_STATE_IN)
11377 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11378 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11380 if (shared_zt0_flags & AARCH64_STATE_IN)
11381 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11382 gen_rtx_REG (V8DImode, ZT0_REGNUM));
11386 /* Implement TARGET_END_CALL_ARGS. */
11388 static void
11389 aarch64_end_call_args (cumulative_args_t ca_v)
11391 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11393 /* If this is a call to a private ZA function, emit a marker to
11394 indicate where any necessary restoration code could be inserted.
11395 The code itself is inserted by the mode-switching pass. */
11396 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11397 emit_insn (gen_aarch64_end_private_za_call ());
11399 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11400 save and restore ZT0 around the call. */
11401 if (aarch64_cfun_has_state ("zt0")
11402 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11403 && ca->shared_zt0_flags == 0)
11404 aarch64_restore_zt0 (false);
11407 /* Emit call insn with PAT and do aarch64-specific handling. */
11409 rtx_call_insn *
11410 aarch64_emit_call_insn (rtx pat)
11412 auto insn = emit_call_insn (pat);
11414 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11415 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11416 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11417 return as_a<rtx_call_insn *> (insn);
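/* IP0 and IP1 (x16 and x17) are the intra-procedure-call temporaries of the
   AAPCS64; linker-inserted veneers and PLT stubs are allowed to clobber them,
   so every call is modelled as clobbering both.  */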
11420 machine_mode
11421 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11423 machine_mode mode_x = GET_MODE (x);
11424 rtx_code code_x = GET_CODE (x);
11426 /* All floating point compares return CCFP if it is an equality
11427 comparison, and CCFPE otherwise. */
11428 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11430 switch (code)
11432 case EQ:
11433 case NE:
11434 case UNORDERED:
11435 case ORDERED:
11436 case UNLT:
11437 case UNLE:
11438 case UNGT:
11439 case UNGE:
11440 case UNEQ:
11441 return CCFPmode;
11443 case LT:
11444 case LE:
11445 case GT:
11446 case GE:
11447 case LTGT:
11448 return CCFPEmode;
11450 default:
11451 gcc_unreachable ();
11455 /* Equality comparisons of short modes against zero can be performed
11456 using the TST instruction with the appropriate bitmask. */
11457 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11458 && (code == EQ || code == NE)
11459 && (mode_x == HImode || mode_x == QImode))
11460 return CC_Zmode;
11462 /* Similarly, comparisons of zero_extends from shorter modes can
11463 be performed using an ANDS with an immediate mask. */
11464 if (y == const0_rtx && code_x == ZERO_EXTEND
11465 && (mode_x == SImode || mode_x == DImode)
11466 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11467 && (code == EQ || code == NE))
11468 return CC_Zmode;
11470 /* Zero extracts support equality comparisons. */
11471 if ((mode_x == SImode || mode_x == DImode)
11472 && y == const0_rtx
11473 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11474 && CONST_INT_P (XEXP (x, 2)))
11475 && (code == EQ || code == NE))
11476 return CC_Zmode;
11478 /* ANDS/BICS/TST support equality and all signed comparisons. */
11479 if ((mode_x == SImode || mode_x == DImode)
11480 && y == const0_rtx
11481 && (code_x == AND)
11482 && (code == EQ || code == NE || code == LT || code == GE
11483 || code == GT || code == LE))
11484 return CC_NZVmode;
11486 /* ADDS/SUBS correctly set N and Z flags. */
11487 if ((mode_x == SImode || mode_x == DImode)
11488 && y == const0_rtx
11489 && (code == EQ || code == NE || code == LT || code == GE)
11490 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11491 return CC_NZmode;
11493 /* A compare with a shifted operand. Because of canonicalization,
11494 the comparison will have to be swapped when we emit the assembly
11495 code. */
11496 if ((mode_x == SImode || mode_x == DImode)
11497 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11498 && (code_x == ASHIFT || code_x == ASHIFTRT
11499 || code_x == LSHIFTRT
11500 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11501 return CC_SWPmode;
11503 /* Similarly for a negated operand, but we can only do this for
11504 equalities. */
11505 if ((mode_x == SImode || mode_x == DImode)
11506 && (REG_P (y) || SUBREG_P (y))
11507 && (code == EQ || code == NE)
11508 && code_x == NEG)
11509 return CC_Zmode;
11511 /* A test for unsigned overflow from an addition. */
11512 if ((mode_x == DImode || mode_x == TImode)
11513 && (code == LTU || code == GEU)
11514 && code_x == PLUS
11515 && rtx_equal_p (XEXP (x, 0), y))
11516 return CC_Cmode;
11518 /* A test for unsigned overflow from an add with carry. */
11519 if ((mode_x == DImode || mode_x == TImode)
11520 && (code == LTU || code == GEU)
11521 && code_x == PLUS
11522 && CONST_SCALAR_INT_P (y)
11523 && (rtx_mode_t (y, mode_x)
11524 == (wi::shwi (1, mode_x)
11525 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11526 return CC_ADCmode;
11528 /* A test for signed overflow. */
11529 if ((mode_x == DImode || mode_x == TImode)
11530 && code == NE
11531 && code_x == PLUS
11532 && GET_CODE (y) == SIGN_EXTEND)
11533 return CC_Vmode;
11535 /* For everything else, return CCmode. */
11536 return CCmode;
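/* As an example of CC_SWPmode: comparing (ashift x 2) with y is emitted as
   cmp y, x, lsl #2, because only the second source operand of CMP can be
   shifted, so the operands (and later the condition) must be swapped.  */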
11539 static int
11540 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11543 aarch64_get_condition_code (rtx x)
11545 machine_mode mode = GET_MODE (XEXP (x, 0));
11546 enum rtx_code comp_code = GET_CODE (x);
11548 if (GET_MODE_CLASS (mode) != MODE_CC)
11549 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11550 return aarch64_get_condition_code_1 (mode, comp_code);
11553 static int
11554 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11556 switch (mode)
11558 case E_CCFPmode:
11559 case E_CCFPEmode:
11560 switch (comp_code)
11562 case GE: return AARCH64_GE;
11563 case GT: return AARCH64_GT;
11564 case LE: return AARCH64_LS;
11565 case LT: return AARCH64_MI;
11566 case NE: return AARCH64_NE;
11567 case EQ: return AARCH64_EQ;
11568 case ORDERED: return AARCH64_VC;
11569 case UNORDERED: return AARCH64_VS;
11570 case UNLT: return AARCH64_LT;
11571 case UNLE: return AARCH64_LE;
11572 case UNGT: return AARCH64_HI;
11573 case UNGE: return AARCH64_PL;
11574 default: return -1;
11576 break;
11578 case E_CCmode:
11579 switch (comp_code)
11581 case NE: return AARCH64_NE;
11582 case EQ: return AARCH64_EQ;
11583 case GE: return AARCH64_GE;
11584 case GT: return AARCH64_GT;
11585 case LE: return AARCH64_LE;
11586 case LT: return AARCH64_LT;
11587 case GEU: return AARCH64_CS;
11588 case GTU: return AARCH64_HI;
11589 case LEU: return AARCH64_LS;
11590 case LTU: return AARCH64_CC;
11591 default: return -1;
11593 break;
11595 case E_CC_SWPmode:
11596 switch (comp_code)
11598 case NE: return AARCH64_NE;
11599 case EQ: return AARCH64_EQ;
11600 case GE: return AARCH64_LE;
11601 case GT: return AARCH64_LT;
11602 case LE: return AARCH64_GE;
11603 case LT: return AARCH64_GT;
11604 case GEU: return AARCH64_LS;
11605 case GTU: return AARCH64_CC;
11606 case LEU: return AARCH64_CS;
11607 case LTU: return AARCH64_HI;
11608 default: return -1;
11610 break;
11612 case E_CC_NZCmode:
11613 switch (comp_code)
11615 case NE: return AARCH64_NE; /* = any */
11616 case EQ: return AARCH64_EQ; /* = none */
11617 case GE: return AARCH64_PL; /* = nfrst */
11618 case LT: return AARCH64_MI; /* = first */
11619 case GEU: return AARCH64_CS; /* = nlast */
11620 case GTU: return AARCH64_HI; /* = pmore */
11621 case LEU: return AARCH64_LS; /* = plast */
11622 case LTU: return AARCH64_CC; /* = last */
11623 default: return -1;
11625 break;
11627 case E_CC_NZVmode:
11628 switch (comp_code)
11630 case NE: return AARCH64_NE;
11631 case EQ: return AARCH64_EQ;
11632 case GE: return AARCH64_PL;
11633 case LT: return AARCH64_MI;
11634 case GT: return AARCH64_GT;
11635 case LE: return AARCH64_LE;
11636 default: return -1;
11638 break;
11640 case E_CC_NZmode:
11641 switch (comp_code)
11643 case NE: return AARCH64_NE;
11644 case EQ: return AARCH64_EQ;
11645 case GE: return AARCH64_PL;
11646 case LT: return AARCH64_MI;
11647 default: return -1;
11649 break;
11651 case E_CC_Zmode:
11652 switch (comp_code)
11654 case NE: return AARCH64_NE;
11655 case EQ: return AARCH64_EQ;
11656 default: return -1;
11658 break;
11660 case E_CC_Cmode:
11661 switch (comp_code)
11663 case LTU: return AARCH64_CS;
11664 case GEU: return AARCH64_CC;
11665 default: return -1;
11667 break;
11669 case E_CC_ADCmode:
11670 switch (comp_code)
11672 case GEU: return AARCH64_CS;
11673 case LTU: return AARCH64_CC;
11674 default: return -1;
11676 break;
11678 case E_CC_Vmode:
11679 switch (comp_code)
11681 case NE: return AARCH64_VS;
11682 case EQ: return AARCH64_VC;
11683 default: return -1;
11685 break;
11687 default:
11688 return -1;
11691 return -1;
11694 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
11695 duplicate of such constants. If so, store in RET_WI the wide_int
11696 representation of the constant paired with the inner mode of the vector mode
11697 or MODE for scalar X constants. If MODE is not provided then TImode is
11698 used. */
11700 static bool
11701 aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
11702 scalar_mode mode = TImode)
11704 rtx elt = unwrap_const_vec_duplicate (x);
11705 if (!CONST_SCALAR_INT_P (elt))
11706 return false;
11707 scalar_mode smode
11708 = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
11709 *ret_wi = rtx_mode_t (elt, smode);
11710 return true;
11713 /* Return true if X is a scalar or a constant vector of integer
11714 immediates that represent the rounding constant used in the fixed-point
11715 arithmetic instructions.
11716 The accepted form of the constant is (1 << (C - 1)) where C is in the range
11717 [1, MODE_WIDTH/2]. */
11719 bool
11720 aarch64_rnd_imm_p (rtx x)
11722 wide_int rnd_cst;
11723 if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
11724 return false;
11725 int log2 = wi::exact_log2 (rnd_cst);
11726 if (log2 < 0)
11727 return false;
11728 return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
11731 /* Return true if RND is a constant vector of integer rounding constants
11732 corresponding to a constant vector of shifts, SHIFT.
11733 The relationship should be RND == (1 << (SHIFT - 1)). */
11735 bool
11736 aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
11738 wide_int rnd_cst, shft_cst;
11739 if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
11740 || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
11741 return false;
11743 return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
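/* For example, a rounding shift right by 4 uses the rounding constant
   1 << 3 == 8, i.e. half of the shift quantum, so that the result is rounded
   to nearest rather than truncated.  */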
11746 bool
11747 aarch64_const_vec_all_same_in_range_p (rtx x,
11748 HOST_WIDE_INT minval,
11749 HOST_WIDE_INT maxval)
11751 rtx elt;
11752 return (const_vec_duplicate_p (x, &elt)
11753 && CONST_INT_P (elt)
11754 && IN_RANGE (INTVAL (elt), minval, maxval));
11757 /* Some constants can't be made using normal mov instructions in Advanced SIMD
11758 but we can still create them in various ways. If the constant VAL can be
11759 created using one of these alternate methods, return true and, if TARGET
11760 is not NULL, additionally set TARGET to the rtx for the sequence.
11761 Otherwise return false. */
11763 bool
11764 aarch64_maybe_generate_simd_constant (rtx target, rtx val, machine_mode mode)
11766 wide_int wval;
11767 auto smode = GET_MODE_INNER (mode);
11768 if (!aarch64_extract_vec_duplicate_wide_int (val, &wval, smode))
11769 return false;
11771 /* For Advanced SIMD we can create an integer with only the top bit set
11772 using fneg (0.0f). */
11773 if (TARGET_SIMD
11774 && !TARGET_SVE
11775 && smode == DImode
11776 && wi::only_sign_bit_p (wval))
11778 if (!target)
11779 return true;
11781 /* Use the same base type as aarch64_gen_shareable_zero. */
11782 rtx zero = CONST0_RTX (V4SImode);
11783 emit_move_insn (lowpart_subreg (V4SImode, target, mode), zero);
11784 rtx neg = lowpart_subreg (V2DFmode, target, mode);
11785 emit_insn (gen_negv2df2 (neg, copy_rtx (neg)));
11786 return true;
11789 return false;
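/* The sequence emitted above is effectively:
     movi v0.4s, #0
     fneg v0.2d, v0.2d
   Negating +0.0 flips only the sign bit, leaving 0x8000000000000000 in each
   64-bit lane.  */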
11792 /* Check if the value in VAL with mode MODE can be created using special
11793 instruction sequences. */
11795 bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
11797 return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
11800 bool
11801 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11803 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11806 /* Return true if VEC is a constant in which every element is in the range
11807 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11809 static bool
11810 aarch64_const_vec_all_in_range_p (rtx vec,
11811 HOST_WIDE_INT minval,
11812 HOST_WIDE_INT maxval)
11814 if (!CONST_VECTOR_P (vec)
11815 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11816 return false;
11818 int nunits;
11819 if (!CONST_VECTOR_STEPPED_P (vec))
11820 nunits = const_vector_encoded_nelts (vec);
11821 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11822 return false;
11824 for (int i = 0; i < nunits; i++)
11826 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11827 if (!CONST_INT_P (vec_elem)
11828 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11829 return false;
11831 return true;
11834 /* N Z C V. */
11835 #define AARCH64_CC_V 1
11836 #define AARCH64_CC_C (1 << 1)
11837 #define AARCH64_CC_Z (1 << 2)
11838 #define AARCH64_CC_N (1 << 3)
11840 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11841 static const int aarch64_nzcv_codes[] =
11843 0, /* EQ, Z == 1. */
11844 AARCH64_CC_Z, /* NE, Z == 0. */
11845 0, /* CS, C == 1. */
11846 AARCH64_CC_C, /* CC, C == 0. */
11847 0, /* MI, N == 1. */
11848 AARCH64_CC_N, /* PL, N == 0. */
11849 0, /* VS, V == 1. */
11850 AARCH64_CC_V, /* VC, V == 0. */
11851 0, /* HI, C == 1 && Z == 0. */
11852 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11853 AARCH64_CC_V, /* GE, N == V. */
11854 0, /* LT, N != V. */
11855 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11856 0, /* LE, !(Z == 0 && N == V). */
11857 0, /* AL, Any. */
11858 0 /* NV, Any. */
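/* Each entry is the NZCV immediate that makes the corresponding condition
   false; in an && chain, a conditional compare (CCMP) whose governing
   condition fails sets the flags to this value, so the chained condition
   also fails.  */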
11861 /* Print floating-point vector immediate operand X to F, negating it
11862 first if NEGATE is true. Return true on success, false if it isn't
11863 a constant we can handle. */
11865 static bool
11866 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11868 rtx elt;
11870 if (!const_vec_duplicate_p (x, &elt))
11871 return false;
11873 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11874 if (negate)
11875 r = real_value_negate (&r);
11877 /* Handle the SVE single-bit immediates specially, since they have a
11878 fixed form in the assembly syntax. */
11879 if (real_equal (&r, &dconst0))
11880 asm_fprintf (f, "0.0");
11881 else if (real_equal (&r, &dconst2))
11882 asm_fprintf (f, "2.0");
11883 else if (real_equal (&r, &dconst1))
11884 asm_fprintf (f, "1.0");
11885 else if (real_equal (&r, &dconsthalf))
11886 asm_fprintf (f, "0.5");
11887 else
11889 const int buf_size = 20;
11890 char float_buf[buf_size] = {'\0'};
11891 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11892 1, GET_MODE (elt));
11893 asm_fprintf (f, "%s", float_buf);
11896 return true;
11899 /* Return the equivalent letter for size. */
11900 static char
11901 sizetochar (int size)
11903 switch (size)
11905 case 64: return 'd';
11906 case 32: return 's';
11907 case 16: return 'h';
11908 case 8 : return 'b';
11909 default: gcc_unreachable ();
11913 /* Print operand X to file F in a target specific manner according to CODE.
11914 The acceptable formatting commands given by CODE are:
11915 'c': An integer or symbol address without a preceding #
11916 sign.
11917 'C': Take the duplicated element in a vector constant
11918 and print it in hex.
11919 'D': Take the duplicated element in a vector constant
11920 and print it as an unsigned integer, in decimal.
11921 'e': Print the sign/zero-extend size as a character 8->b,
11922 16->h, 32->w. Can also be used for masks:
11923 0xff->b, 0xffff->h, 0xffffffff->w.
11924 'I': If the operand is a duplicated vector constant,
11925 replace it with the duplicated scalar. If the
11926 operand is then a floating-point constant, replace
11927 it with the integer bit representation. Print the
11928 transformed constant as a signed decimal number.
11929 'p': Prints N such that 2^N == X (X must be a power of 2 and
11930 a const_int).
11931 'P': Print the number of non-zero bits in X (a const_int).
11932 'H': Print the higher numbered register of a pair (TImode)
11933 of regs.
11934 'm': Print a condition (eq, ne, etc).
11935 'M': Same as 'm', but invert condition.
11936 'N': Take the duplicated element in a vector constant
11937 and print the negative of it in decimal.
11938 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11939 'Z': Same for SVE registers. ('z' was already taken.)
11940 Note that it is not necessary to use %Z for operands
11941 that have SVE modes. The convention is to use %Z
11942 only for non-SVE (or potentially non-SVE) modes.
11943 'S/T/U/V': Print a FP/SIMD register name for a register list.
11944 The register printed is the FP/SIMD register name
11945 of X + 0/1/2/3 for S/T/U/V.
11946 'R': Print a scalar Integer/FP/SIMD register name + 1.
11947 'X': Print bottom 16 bits of integer constant in hex.
11948 'w/x': Print a general register name or the zero register
11949 (32-bit or 64-bit).
11950 '0': Print a normal operand, if it's a general register,
11951 then we assume DImode.
11952 'k': Print NZCV for conditional compare instructions.
11953 'K': Print a predicate register as pn<N> rather than p<N>.
11954 'A': Output address constant representing the first
11955 argument of X, specifying a relocation offset
11956 if appropriate.
11957 'L': Output constant address specified by X
11958 with a relocation offset if appropriate.
11959 'G': Prints address of X, specifying a PC relative
11960 relocation mode if appropriate.
11961 'y': Output address of LDP or STP - this is used for
11962 some LDP/STPs which don't use a PARALLEL in their
11963 pattern (so the mode needs to be adjusted).
11964 'z': Output address of a typical LDP or STP. */
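/* For example, an output template such as "add %w0, %w1, %w2" prints the
   32-bit register names for operands 0-2, whereas "%x0" would print the
   64-bit name of the same register.  */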
11966 static void
11967 aarch64_print_operand (FILE *f, rtx x, int code)
11969 rtx elt;
11970 switch (code)
11972 case 'c':
11973 if (CONST_INT_P (x))
11974 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11975 else
11977 poly_int64 offset;
11978 rtx base = strip_offset_and_salt (x, &offset);
11979 if (SYMBOL_REF_P (base))
11980 output_addr_const (f, x);
11981 else
11982 output_operand_lossage ("unsupported operand for code '%c'", code);
11984 break;
11986 case 'e':
11988 x = unwrap_const_vec_duplicate (x);
11989 if (!CONST_INT_P (x))
11991 output_operand_lossage ("invalid operand for '%%%c'", code);
11992 return;
11995 HOST_WIDE_INT val = INTVAL (x);
11996 if ((val & ~7) == 8 || val == 0xff)
11997 fputc ('b', f);
11998 else if ((val & ~7) == 16 || val == 0xffff)
11999 fputc ('h', f);
12000 else if ((val & ~7) == 32 || val == 0xffffffff)
12001 fputc ('w', f);
12002 else
12004 output_operand_lossage ("invalid operand for '%%%c'", code);
12005 return;
12008 break;
12010 case 'p':
12012 int n;
12014 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
12016 output_operand_lossage ("invalid operand for '%%%c'", code);
12017 return;
12020 asm_fprintf (f, "%d", n);
12022 break;
12024 case 'P':
12025 if (!CONST_INT_P (x))
12027 output_operand_lossage ("invalid operand for '%%%c'", code);
12028 return;
12031 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
12032 break;
12034 case 'H':
12035 if (x == const0_rtx)
12037 asm_fprintf (f, "xzr");
12038 break;
12041 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
12043 output_operand_lossage ("invalid operand for '%%%c'", code);
12044 return;
12047 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
12048 break;
12050 case 'I':
12052 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
12053 if (CONST_INT_P (x))
12054 asm_fprintf (f, "%wd", INTVAL (x));
12055 else
12057 output_operand_lossage ("invalid operand for '%%%c'", code);
12058 return;
12060 break;
12063 case 'M':
12064 case 'm':
12066 int cond_code;
12067 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12068 if (x == const_true_rtx)
12070 if (code == 'M')
12071 fputs ("nv", f);
12072 return;
12075 if (!COMPARISON_P (x))
12077 output_operand_lossage ("invalid operand for '%%%c'", code);
12078 return;
12081 cond_code = aarch64_get_condition_code (x);
12082 gcc_assert (cond_code >= 0);
12083 if (code == 'M')
12084 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
12085 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
12086 fputs (aarch64_sve_condition_codes[cond_code], f);
12087 else
12088 fputs (aarch64_condition_codes[cond_code], f);
12090 break;
12092 case 'N':
12093 if (!const_vec_duplicate_p (x, &elt))
12095 output_operand_lossage ("invalid vector constant");
12096 return;
12099 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12100 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12101 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12102 && aarch64_print_vector_float_operand (f, x, true))
12104 else
12106 output_operand_lossage ("invalid vector constant");
12107 return;
12109 break;
12111 case 'b':
12112 case 'h':
12113 case 's':
12114 case 'd':
12115 case 'q':
12116 case 'Z':
12117 code = TOLOWER (code);
12118 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12120 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12121 return;
12123 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12124 break;
12126 case 'S':
12127 case 'T':
12128 case 'U':
12129 case 'V':
12130 if (!REG_P (x) || (!FP_REGNUM_P (REGNO (x)) && !PR_REGNUM_P (REGNO (x))))
12132 output_operand_lossage ("incompatible operand for '%%%c'", code);
12133 return;
12135 if (PR_REGNUM_P (REGNO (x)))
12136 asm_fprintf (f, "p%d", REGNO (x) - P0_REGNUM + (code - 'S'));
12137 else
12138 asm_fprintf (f, "%c%d",
12139 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12140 REGNO (x) - V0_REGNUM + (code - 'S'));
12141 break;
12143 case 'R':
12144 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12145 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12146 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12147 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12148 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12149 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12150 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12151 else
12152 output_operand_lossage ("incompatible register operand for '%%%c'",
12153 code);
12154 break;
12156 case 'X':
12157 if (!CONST_INT_P (x))
12159 output_operand_lossage ("invalid operand for '%%%c'", code);
12160 return;
12162 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12163 break;
12165 case 'C':
12167 /* Print a replicated constant in hex. */
12168 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12170 output_operand_lossage ("invalid operand for '%%%c'", code);
12171 return;
12173 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12174 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12176 break;
12178 case 'D':
12180 /* Print a replicated constant in decimal, treating it as
12181 unsigned. */
12182 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12184 output_operand_lossage ("invalid operand for '%%%c'", code);
12185 return;
12187 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12188 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12190 break;
12192 case 'w':
12193 case 'x':
12194 if (aarch64_const_zero_rtx_p (x))
12196 asm_fprintf (f, "%czr", code);
12197 break;
12200 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12202 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12203 break;
12206 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12208 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12209 break;
12212 /* Fall through */
12214 case 0:
12215 if (x == NULL)
12217 output_operand_lossage ("missing operand");
12218 return;
12221 switch (GET_CODE (x))
12223 case CONST_STRING:
12225 asm_fprintf (f, "%s", XSTR (x, 0));
12226 break;
12228 case REG:
12229 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12231 if (REG_NREGS (x) == 1)
12232 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12233 else
12235 char suffix
12236 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12237 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12238 REGNO (x) - V0_REGNUM, suffix,
12239 END_REGNO (x) - V0_REGNUM - 1, suffix);
12242 else
12243 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12244 break;
12246 case MEM:
12247 output_address (GET_MODE (x), XEXP (x, 0));
12248 break;
12250 case LABEL_REF:
12251 case SYMBOL_REF:
12252 output_addr_const (asm_out_file, x);
12253 break;
12255 case CONST_INT:
12256 asm_fprintf (f, "%wd", INTVAL (x));
12257 break;
12259 case CONST:
12260 if (!VECTOR_MODE_P (GET_MODE (x)))
12262 output_addr_const (asm_out_file, x);
12263 break;
12265 /* fall through */
12267 case CONST_VECTOR:
12268 if (!const_vec_duplicate_p (x, &elt))
12270 output_operand_lossage ("invalid vector constant");
12271 return;
12274 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12275 asm_fprintf (f, "%wd", INTVAL (elt));
12276 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12277 && aarch64_print_vector_float_operand (f, x, false))
12279 else
12281 output_operand_lossage ("invalid vector constant");
12282 return;
12284 break;
12286 case CONST_DOUBLE:
12287 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12288 be getting CONST_DOUBLEs holding integers. */
12289 gcc_assert (GET_MODE (x) != VOIDmode);
12290 if (aarch64_float_const_zero_rtx_p (x))
12292 fputc ('0', f);
12293 break;
12295 else if (aarch64_float_const_representable_p (x))
12297 #define buf_size 20
12298 char float_buf[buf_size] = {'\0'};
12299 real_to_decimal_for_mode (float_buf,
12300 CONST_DOUBLE_REAL_VALUE (x),
12301 buf_size, buf_size,
12302 1, GET_MODE (x));
12303 asm_fprintf (asm_out_file, "%s", float_buf);
12304 break;
12305 #undef buf_size
12307 output_operand_lossage ("invalid constant");
12308 return;
12309 default:
12310 output_operand_lossage ("invalid operand");
12311 return;
12313 break;
12315 case 'A':
12316 if (GET_CODE (x) == HIGH)
12317 x = XEXP (x, 0);
12319 switch (aarch64_classify_symbolic_expression (x))
12321 case SYMBOL_SMALL_GOT_4G:
12322 asm_fprintf (asm_out_file, ":got:");
12323 break;
12325 case SYMBOL_SMALL_TLSGD:
12326 asm_fprintf (asm_out_file, ":tlsgd:");
12327 break;
12329 case SYMBOL_SMALL_TLSDESC:
12330 asm_fprintf (asm_out_file, ":tlsdesc:");
12331 break;
12333 case SYMBOL_SMALL_TLSIE:
12334 asm_fprintf (asm_out_file, ":gottprel:");
12335 break;
12337 case SYMBOL_TLSLE24:
12338 asm_fprintf (asm_out_file, ":tprel:");
12339 break;
12341 case SYMBOL_TINY_GOT:
12342 gcc_unreachable ();
12343 break;
12345 default:
12346 break;
12348 output_addr_const (asm_out_file, x);
12349 break;
12351 case 'L':
12352 switch (aarch64_classify_symbolic_expression (x))
12354 case SYMBOL_SMALL_GOT_4G:
12355 asm_fprintf (asm_out_file, ":got_lo12:");
12356 break;
12358 case SYMBOL_SMALL_TLSGD:
12359 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12360 break;
12362 case SYMBOL_SMALL_TLSDESC:
12363 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12364 break;
12366 case SYMBOL_SMALL_TLSIE:
12367 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12368 break;
12370 case SYMBOL_TLSLE12:
12371 asm_fprintf (asm_out_file, ":tprel_lo12:");
12372 break;
12374 case SYMBOL_TLSLE24:
12375 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12376 break;
12378 case SYMBOL_TINY_GOT:
12379 asm_fprintf (asm_out_file, ":got:");
12380 break;
12382 case SYMBOL_TINY_TLSIE:
12383 asm_fprintf (asm_out_file, ":gottprel:");
12384 break;
12386 default:
12387 break;
12389 output_addr_const (asm_out_file, x);
12390 break;
12392 case 'G':
12393 switch (aarch64_classify_symbolic_expression (x))
12395 case SYMBOL_TLSLE24:
12396 asm_fprintf (asm_out_file, ":tprel_hi12:");
12397 break;
12398 default:
12399 break;
12401 output_addr_const (asm_out_file, x);
12402 break;
12404 case 'k':
12406 HOST_WIDE_INT cond_code;
12408 if (!CONST_INT_P (x))
12410 output_operand_lossage ("invalid operand for '%%%c'", code);
12411 return;
12414 cond_code = INTVAL (x);
12415 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12416 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12418 break;
12420 case 'K':
12421 if (!REG_P (x) || !PR_REGNUM_P (REGNO (x)))
12423 output_operand_lossage ("invalid operand for '%%%c'", code);
12424 return;
12426 asm_fprintf (f, "pn%d", REGNO (x) - P0_REGNUM);
12427 break;
12429 case 'y':
12430 case 'z':
12432 machine_mode mode = GET_MODE (x);
12434 if (!MEM_P (x)
12435 || (code == 'y'
12436 && maybe_ne (GET_MODE_SIZE (mode), 8)
12437 && maybe_ne (GET_MODE_SIZE (mode), 16)
12438 && maybe_ne (GET_MODE_SIZE (mode), 32)))
12440 output_operand_lossage ("invalid operand for '%%%c'", code);
12441 return;
12444 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12445 code == 'y'
12446 ? ADDR_QUERY_LDP_STP_N
12447 : ADDR_QUERY_LDP_STP))
12448 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12450 break;
12452 default:
12453 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12454 return;
12458 /* Print address 'x' of a memory access with mode 'mode'.
12459 'type' is the aarch64_addr_query_type context required by aarch64_classify_address,
12460 e.g. ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
12461 static bool
12462 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12463 aarch64_addr_query_type type)
12465 struct aarch64_address_info addr;
12466 unsigned int size, vec_flags;
12468 /* Check all addresses are Pmode - including ILP32. */
12469 if (GET_MODE (x) != Pmode
12470 && (!CONST_INT_P (x)
12471 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12473 output_operand_lossage ("invalid address mode");
12474 return false;
12477 const bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
12478 || type == ADDR_QUERY_LDP_STP_N);
12480 if (aarch64_classify_address (&addr, x, mode, true, type))
12481 switch (addr.type)
12483 case ADDRESS_REG_IMM:
12484 if (known_eq (addr.const_offset, 0))
12486 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12487 return true;
12490 vec_flags = aarch64_classify_vector_mode (mode);
12491 if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
12493 HOST_WIDE_INT vnum
12494 = exact_div (addr.const_offset,
12495 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12496 asm_fprintf (f, "[%s, #%wd, mul vl]",
12497 reg_names[REGNO (addr.base)], vnum);
12498 return true;
12501 if (!CONST_INT_P (addr.offset))
12502 return false;
12504 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12505 INTVAL (addr.offset));
12506 return true;
12508 case ADDRESS_REG_REG:
12509 if (addr.shift == 0)
12510 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12511 reg_names [REGNO (addr.offset)]);
12512 else
12513 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12514 reg_names [REGNO (addr.offset)], addr.shift);
12515 return true;
12517 case ADDRESS_REG_UXTW:
12518 if (addr.shift == 0)
12519 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12520 REGNO (addr.offset) - R0_REGNUM);
12521 else
12522 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12523 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12524 return true;
12526 case ADDRESS_REG_SXTW:
12527 if (addr.shift == 0)
12528 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12529 REGNO (addr.offset) - R0_REGNUM);
12530 else
12531 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12532 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12533 return true;
12535 case ADDRESS_REG_WB:
12536 /* Writeback is only supported for fixed-width modes. */
12537 size = GET_MODE_SIZE (mode).to_constant ();
12538 switch (GET_CODE (x))
12540 case PRE_INC:
12541 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12542 return true;
12543 case POST_INC:
12544 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12545 return true;
12546 case PRE_DEC:
12547 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12548 return true;
12549 case POST_DEC:
12550 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12551 return true;
12552 case PRE_MODIFY:
12553 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12554 INTVAL (addr.offset));
12555 return true;
12556 case POST_MODIFY:
12557 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12558 INTVAL (addr.offset));
12559 return true;
12560 default:
12561 break;
12563 break;
12565 case ADDRESS_LO_SUM:
12566 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12567 output_addr_const (f, addr.offset);
12568 asm_fprintf (f, "]");
12569 return true;
12571 case ADDRESS_SYMBOLIC:
12572 output_addr_const (f, x);
12573 return true;
12576 return false;
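/* Editorial note (illustrative, not in the original source): the cases above
   cover syntax such as "[x0]", "[x0, 16]", "[x0, #3, mul vl]" for SVE,
   "[x0, x1, lsl 3]", "[x0, w1, sxtw 2]", the writeback forms "[x0, 16]!" and
   "[x0], 16", and "[x0, #:lo12:sym]" for a LO_SUM; the register numbers and
   symbol name here are hypothetical.  */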
12579 /* Print address 'x' of a memory access with mode 'mode'. */
12580 static void
12581 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12583 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12584 output_addr_const (f, x);
12587 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12589 static bool
12590 aarch64_output_addr_const_extra (FILE *file, rtx x)
12592 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12594 output_addr_const (file, XVECEXP (x, 0, 0));
12595 return true;
12597 return false;
12600 bool
12601 aarch64_label_mentioned_p (rtx x)
12603 const char *fmt;
12604 int i;
12606 if (LABEL_REF_P (x))
12607 return true;
12609 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12610 referencing instruction, but they are constant offsets, not
12611 symbols. */
12612 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12613 return false;
12615 fmt = GET_RTX_FORMAT (GET_CODE (x));
12616 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12618 if (fmt[i] == 'E')
12620 int j;
12622 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12623 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12624 return 1;
12626 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12627 return 1;
12630 return 0;
12633 /* Implement REGNO_REG_CLASS. */
12635 enum reg_class
12636 aarch64_regno_regclass (unsigned regno)
12638 if (W8_W11_REGNUM_P (regno))
12639 return W8_W11_REGS;
12641 if (W12_W15_REGNUM_P (regno))
12642 return W12_W15_REGS;
12644 if (STUB_REGNUM_P (regno))
12645 return STUB_REGS;
12647 if (GP_REGNUM_P (regno))
12648 return GENERAL_REGS;
12650 if (regno == SP_REGNUM)
12651 return STACK_REG;
12653 if (regno == FRAME_POINTER_REGNUM
12654 || regno == ARG_POINTER_REGNUM)
12655 return POINTER_REGS;
12657 if (FP_REGNUM_P (regno))
12658 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12659 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12661 if (PR_REGNUM_P (regno))
12662 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12664 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12665 return FFR_REGS;
12667 if (FAKE_REGNUM_P (regno))
12668 return FAKE_REGS;
12670 return NO_REGS;
12673 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12674 If OFFSET is out of range, return an offset of an anchor point
12675 that is in range. Return 0 otherwise. */
12677 static HOST_WIDE_INT
12678 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12679 machine_mode mode)
12681 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12682 if (size > 16)
12683 return (offset + 0x400) & ~0x7f0;
12685 /* For offsets that aren't a multiple of the access size, the limit is
12686 -256...255. */
12687 if (offset & (size - 1))
12689 /* BLKmode typically uses LDP of X-registers. */
12690 if (mode == BLKmode)
12691 return (offset + 512) & ~0x3ff;
12692 return (offset + 0x100) & ~0x1ff;
12695 /* Small negative offsets are supported. */
12696 if (IN_RANGE (offset, -256, 0))
12697 return 0;
12699 if (mode == TImode || mode == TFmode || mode == TDmode)
12700 return (offset + 0x100) & ~0x1ff;
12702 /* Otherwise use a 12-bit unsigned offset, scaled by the access size. */
12703 return offset & (~0xfff * size);
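/* Editorial note (illustrative, not in the original source): for example,
   with a 4-byte access and OFFSET == 0x12344, none of the earlier cases
   apply and the code above returns 0x12344 & ~0x3fff == 0x10000; the
   remaining displacement of 0x2344 then fits the scaled unsigned 12-bit
   immediate range of a word LDR/STR.  */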
12706 static rtx
12707 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12709 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12710 where mask is selected by alignment and size of the offset.
12711 We try to pick as large a range for the offset as possible to
12712 maximize the chance of a CSE. However, for aligned addresses
12713 we limit the range to 4k so that structures with different sized
12714 elements are likely to use the same base. We need to be careful
12715 not to split a CONST for some forms of address expression, otherwise
12716 it will generate sub-optimal code. */
12718 /* First split X + CONST (base, offset) into (base + X) + offset. */
12719 if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
12721 poly_int64 offset;
12722 rtx base = strip_offset (XEXP (x, 1), &offset);
12724 base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
12725 NULL_RTX, true, OPTAB_DIRECT);
12726 x = plus_constant (Pmode, base, offset);
12729 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12731 rtx base = XEXP (x, 0);
12732 rtx offset_rtx = XEXP (x, 1);
12733 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12735 if (GET_CODE (base) == PLUS)
12737 rtx op0 = XEXP (base, 0);
12738 rtx op1 = XEXP (base, 1);
12740 /* Force any scaling into a temp for CSE. */
12741 op0 = force_reg (Pmode, op0);
12742 op1 = force_reg (Pmode, op1);
12744 /* Let the pointer register be in op0. */
12745 if (REG_POINTER (op1))
12746 std::swap (op0, op1);
12748 /* If the pointer is virtual or frame related, then we know that
12749 virtual register instantiation or register elimination is going
12750 to apply a second constant. We want the two constants folded
12751 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12752 if (virt_or_elim_regno_p (REGNO (op0)))
12754 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12755 NULL_RTX, true, OPTAB_DIRECT);
12756 return gen_rtx_PLUS (Pmode, base, op1);
12759 /* Otherwise, in order to encourage CSE (and thence loop strength
12760 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
12761 base = expand_binop (Pmode, add_optab, op0, op1,
12762 NULL_RTX, true, OPTAB_DIRECT);
12763 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12766 HOST_WIDE_INT size;
12767 if (GET_MODE_SIZE (mode).is_constant (&size))
12769 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12770 mode);
12771 if (base_offset != 0)
12773 base = plus_constant (Pmode, base, base_offset);
12774 base = force_operand (base, NULL_RTX);
12775 return plus_constant (Pmode, base, offset - base_offset);
12780 return x;
12783 static reg_class_t
12784 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12785 reg_class_t rclass,
12786 machine_mode mode,
12787 secondary_reload_info *sri)
12789 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12790 LDR and STR. See the comment at the head of aarch64-sve.md for
12791 more details about the big-endian handling. */
12792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12793 if (reg_class_subset_p (rclass, FP_REGS)
12794 && !((REG_P (x) && HARD_REGISTER_P (x))
12795 || aarch64_simd_valid_immediate (x, NULL))
12796 && mode != VNx16QImode
12797 && (vec_flags & VEC_SVE_DATA)
12798 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12800 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12801 return NO_REGS;
12804 /* If we have to disable direct literal pool loads and stores because the
12805 function is too big, then we need a scratch register. */
12806 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12807 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12808 || targetm.vector_mode_supported_p (GET_MODE (x)))
12809 && !aarch64_pcrelative_literal_loads)
12811 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12812 return NO_REGS;
12815 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12816 Q register to a Q register directly. We need a scratch. */
12817 if (REG_P (x)
12818 && (mode == TFmode
12819 || mode == TImode
12820 || mode == TDmode
12821 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12822 && mode == GET_MODE (x)
12823 && !TARGET_SIMD
12824 && FP_REGNUM_P (REGNO (x))
12825 && reg_class_subset_p (rclass, FP_REGS))
12827 sri->icode = code_for_aarch64_reload_mov (mode);
12828 return NO_REGS;
12831 /* A TFmode, TImode or TDmode memory access should be handled via an FP register
12832 because AArch64 has richer addressing modes for LDR/STR instructions
12833 than LDP/STP instructions. */
12834 if (TARGET_FLOAT && rclass == GENERAL_REGS
12835 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12836 return FP_REGS;
12838 if (rclass == FP_REGS
12839 && (mode == TImode || mode == TFmode || mode == TDmode)
12840 && CONSTANT_P (x))
12841 return GENERAL_REGS;
12843 return NO_REGS;
12846 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12848 static bool
12849 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12850 reg_class_t class2)
12852 if (!TARGET_SIMD
12853 && reg_classes_intersect_p (class1, FP_REGS)
12854 && reg_classes_intersect_p (class2, FP_REGS))
12856 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12857 so we can't easily split a move involving tuples of 128-bit
12858 vectors. Force the copy through memory instead.
12860 (Tuples of 64-bit vectors are fine.) */
12861 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12862 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12863 return true;
12865 return false;
12868 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
12870 static bool
12871 aarch64_frame_pointer_required ()
12873 /* If the function needs to record the incoming value of PSTATE.SM,
12874 make sure that the slot is accessible from the frame pointer. */
12875 return aarch64_need_old_pstate_sm ();
12878 static bool
12879 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12881 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12883 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12884 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12885 if (frame_pointer_needed)
12886 return to == HARD_FRAME_POINTER_REGNUM;
12887 return true;
12890 poly_int64
12891 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12893 aarch64_frame &frame = cfun->machine->frame;
12895 if (to == HARD_FRAME_POINTER_REGNUM)
12897 if (from == ARG_POINTER_REGNUM)
12898 return frame.bytes_above_hard_fp;
12900 if (from == FRAME_POINTER_REGNUM)
12901 return frame.bytes_above_hard_fp - frame.bytes_above_locals;
12904 if (to == STACK_POINTER_REGNUM)
12906 if (from == FRAME_POINTER_REGNUM)
12907 return frame.frame_size - frame.bytes_above_locals;
12910 return frame.frame_size;
12914 /* Get return address without mangling. */
12917 aarch64_return_addr_rtx (void)
12919 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12920 /* Note: aarch64_return_address_signing_enabled only
12921 works after cfun->machine->frame.laid_out is set,
12922 so here we don't know if the return address will
12923 be signed or not. */
12924 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12925 emit_move_insn (lr, val);
12926 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12927 return lr;
12931 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12932 previous frame. */
12935 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12937 if (count != 0)
12938 return const0_rtx;
12939 return aarch64_return_addr_rtx ();
12942 static void
12943 aarch64_asm_trampoline_template (FILE *f)
12945 /* Even if the current function doesn't have branch protection, some
12946 later function might, so since this template is only generated once
12947 we have to add a BTI just in case. */
12948 asm_fprintf (f, "\thint\t34 // bti c\n");
12950 if (TARGET_ILP32)
12952 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12953 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12955 else
12957 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12958 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12960 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12962 /* We always emit a speculation barrier.
12963 This is because the same trampoline template is used for every nested
12964 function. Since nested functions are not particularly common or
12965 performant we don't worry too much about the extra instructions to copy
12966 around.
12967 This is not yet a problem, since we have not yet implemented function
12968 specific attributes to choose between hardening against straight line
12969 speculation or not, but such function specific attributes are likely to
12970 happen in the future. */
12971 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12973 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12974 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12977 static void
12978 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12980 rtx fnaddr, mem, a_tramp;
12981 const int tramp_code_sz = 24;
12983 /* Don't need to copy the trailing D-words, we fill those in below. */
12984 /* We create our own memory address in Pmode so that `emit_block_move` can
12985 use parts of the backend which expect Pmode addresses. */
12986 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12987 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12988 assemble_trampoline_template (),
12989 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12990 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12991 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12992 if (GET_MODE (fnaddr) != ptr_mode)
12993 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12994 emit_move_insn (mem, fnaddr);
12996 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12997 emit_move_insn (mem, chain_value);
12999 /* XXX We should really define a "clear_cache" pattern and use
13000 gen_clear_cache(). */
13001 a_tramp = XEXP (m_tramp, 0);
13002 maybe_emit_call_builtin___clear_cache (a_tramp,
13003 plus_constant (ptr_mode,
13004 a_tramp,
13005 TRAMPOLINE_SIZE));
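/* Editorial note (illustrative summary, not in the original source): on LP64
   the resulting trampoline is the 24 bytes of code from the template above,
   followed by the target function address at offset 24 and the static chain
   value at offset 24 + POINTER_BYTES; those are the two slots that the
   template's "ldr" instructions read via their ".+20"/".+24" PC-relative
   offsets.  */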
13008 static unsigned char
13009 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
13011 /* ??? Logically we should only need to provide a value when
13012 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13013 can hold MODE, but at the moment we need to handle all modes.
13014 Just ignore any runtime parts for registers that can't store them. */
13015 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
13016 unsigned int nregs, vec_flags;
13017 switch (regclass)
13019 case W8_W11_REGS:
13020 case W12_W15_REGS:
13021 case STUB_REGS:
13022 case TAILCALL_ADDR_REGS:
13023 case POINTER_REGS:
13024 case GENERAL_REGS:
13025 case ALL_REGS:
13026 case POINTER_AND_FP_REGS:
13027 case FP_REGS:
13028 case FP_LO_REGS:
13029 case FP_LO8_REGS:
13030 vec_flags = aarch64_classify_vector_mode (mode);
13031 if ((vec_flags & VEC_SVE_DATA)
13032 && constant_multiple_p (GET_MODE_SIZE (mode),
13033 aarch64_vl_bytes (mode, vec_flags), &nregs))
13034 return nregs;
13035 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
13036 return GET_MODE_SIZE (mode).to_constant () / 8;
13037 return (vec_flags & VEC_ADVSIMD
13038 ? CEIL (lowest_size, UNITS_PER_VREG)
13039 : CEIL (lowest_size, UNITS_PER_WORD));
13041 case PR_REGS:
13042 case PR_LO_REGS:
13043 case PR_HI_REGS:
13044 return mode == VNx32BImode ? 2 : 1;
13046 case STACK_REG:
13047 case FFR_REGS:
13048 case PR_AND_FFR_REGS:
13049 case FAKE_REGS:
13050 return 1;
13052 case NO_REGS:
13053 return 0;
13055 default:
13056 break;
13058 gcc_unreachable ();
13061 static reg_class_t
13062 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
13064 if (regclass == POINTER_REGS)
13065 return GENERAL_REGS;
13067 if (regclass == STACK_REG)
13069 if (REG_P (x)
13070 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
13071 return regclass;
13073 return NO_REGS;
13076 /* Register elimination can result in a request for
13077 SP+constant->FP_REGS. We cannot support such operations, which
13078 use SP as source and an FP_REG as destination, so reject them
13079 outright. */
13080 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
13082 rtx lhs = XEXP (x, 0);
13084 /* Look through a possible SUBREG introduced by ILP32. */
13085 if (SUBREG_P (lhs))
13086 lhs = SUBREG_REG (lhs);
13088 gcc_assert (REG_P (lhs));
13089 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
13090 POINTER_REGS));
13091 return NO_REGS;
13094 return regclass;
13097 void
13098 aarch64_asm_output_labelref (FILE* f, const char *name)
13100 asm_fprintf (f, "%U%s", name);
13103 static void
13104 aarch64_elf_asm_constructor (rtx symbol, int priority)
13106 if (priority == DEFAULT_INIT_PRIORITY)
13107 default_ctor_section_asm_out_constructor (symbol, priority);
13108 else
13110 section *s;
13111 /* Although priority is known to be in the range [0, 65535], so 18 bytes
13112 would be enough, the compiler might not know that. To avoid a
13113 -Wformat-truncation false positive, use a larger size. */
13114 char buf[23];
13115 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
13116 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13117 switch_to_section (s);
13118 assemble_align (POINTER_SIZE);
13119 assemble_aligned_integer (POINTER_BYTES, symbol);
13123 static void
13124 aarch64_elf_asm_destructor (rtx symbol, int priority)
13126 if (priority == DEFAULT_INIT_PRIORITY)
13127 default_dtor_section_asm_out_destructor (symbol, priority);
13128 else
13130 section *s;
13131 /* Although priority is known to be in the range [0, 65535], so 18 bytes
13132 would be enough, the compiler might not know that. To avoid a
13133 -Wformat-truncation false positive, use a larger size. */
13134 char buf[23];
13135 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
13136 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13137 switch_to_section (s);
13138 assemble_align (POINTER_SIZE);
13139 assemble_aligned_integer (POINTER_BYTES, symbol);
13143 const char*
13144 aarch64_output_casesi (rtx *operands)
13146 char buf[100];
13147 char label[100];
13148 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
13149 int index;
13150 static const char *const patterns[4][2] =
13153 "ldrb\t%w3, [%0,%w1,uxtw]",
13154 "add\t%3, %4, %w3, sxtb #2"
13157 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13158 "add\t%3, %4, %w3, sxth #2"
13161 "ldr\t%w3, [%0,%w1,uxtw #2]",
13162 "add\t%3, %4, %w3, sxtw #2"
13164 /* We assume that DImode is only generated when not optimizing and
13165 that we don't really need 64-bit address offsets. That would
13166 imply an object file with 8GB of code in a single function! */
13168 "ldr\t%w3, [%0,%w1,uxtw #2]",
13169 "add\t%3, %4, %w3, sxtw #2"
13173 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13175 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13176 index = exact_log2 (GET_MODE_SIZE (mode));
13178 gcc_assert (index >= 0 && index <= 3);
13180 /* Need to implement table size reduction, by changing the code below. */
13181 output_asm_insn (patterns[index][0], operands);
13182 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13183 snprintf (buf, sizeof (buf),
13184 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13185 output_asm_insn (buf, operands);
13186 output_asm_insn (patterns[index][1], operands);
13187 output_asm_insn ("br\t%3", operands);
13188 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13189 operands);
13190 assemble_label (asm_out_file, label);
13191 return "";
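/* Editorial note (illustrative, not in the original source): for a byte-sized
   dispatch table the sequence emitted above looks roughly like

	ldrb	w3, [x0, w1, uxtw]
	adr	x4, .LrtxN
	add	x3, x4, w3, sxtb #2
	br	x3
   .LrtxN:

   where the register numbers and the .LrtxN label are hypothetical and the
   SLS barrier, if enabled, is emitted just before the label.  */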
13194 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13195 operand is MASK. */
13196 const char *
13197 aarch64_output_sme_zero_za (rtx mask)
13199 auto mask_val = UINTVAL (mask);
13200 if (mask_val == 0)
13201 return "zero\t{}";
13203 if (mask_val == 0xff)
13204 return "zero\t{ za }";
13206 static constexpr struct { unsigned char mask; char letter; } tiles[] = {
13207 { 0xff, 'b' },
13208 { 0x55, 'h' },
13209 { 0x11, 's' },
13210 { 0x01, 'd' }
13212 /* The last entry in the list has the form "za7.d }", but that's the
13213 same length as "za7.d, ". */
13214 static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13215 for (auto &tile : tiles)
13217 unsigned int tile_mask = tile.mask;
13218 unsigned int tile_index = 0;
13219 unsigned int i = snprintf (buffer, sizeof (buffer), "zero\t");
13220 const char *prefix = "{ ";
13221 auto remaining_mask = mask_val;
13222 while (tile_mask < 0x100)
13224 if ((remaining_mask & tile_mask) == tile_mask)
13226 i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
13227 prefix, tile_index, tile.letter);
13228 prefix = ", ";
13229 remaining_mask &= ~tile_mask;
13231 tile_mask <<= 1;
13232 tile_index += 1;
13234 if (remaining_mask == 0)
13236 gcc_assert (i + 3 <= sizeof (buffer));
13237 snprintf (buffer + i, sizeof (buffer) - i, " }");
13238 return buffer;
13241 gcc_unreachable ();
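/* Editorial note (illustrative, not in the original source): for example the
   loop above prints mask 0x55 as "zero\t{ za0.h }", mask 0x11 as
   "zero\t{ za0.s }" and mask 0x81 as "zero\t{ za0.d, za7.d }", while 0 and
   0xff are handled by the early returns.  */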
13244 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13245 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13246 operator. */
13249 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13251 if (shift >= 0 && shift <= 4)
13253 int size;
13254 for (size = 8; size <= 32; size *= 2)
13256 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13257 if (mask == bits << shift)
13258 return size;
13261 return 0;
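/* Editorial note (illustrative, not in the original source): for example
   aarch64_uxt_size (0, 0xff) == 8 and aarch64_uxt_size (2, 0x3fc) == 8,
   matching the UXTB forms of an extended-register operand, while
   aarch64_uxt_size (0, 0xffff) == 16 (UXTH); a mask that is not an 8-, 16-
   or 32-bit block shifted by 0..4 yields 0.  */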
13264 /* Constant pools are per-function only when PC-relative
13265 literal loads are enabled or we are using the large memory
13266 model. */
13268 static inline bool
13269 aarch64_can_use_per_function_literal_pools_p (void)
13271 return (aarch64_pcrelative_literal_loads
13272 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13275 static bool
13276 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13278 /* We can't use blocks for constants when we're using a per-function
13279 constant pool. */
13280 return !aarch64_can_use_per_function_literal_pools_p ();
13283 /* Select appropriate section for constants depending
13284 on where we place literal pools. */
13286 static section *
13287 aarch64_select_rtx_section (machine_mode mode,
13288 rtx x,
13289 unsigned HOST_WIDE_INT align)
13291 if (aarch64_can_use_per_function_literal_pools_p ())
13292 return function_section (current_function_decl);
13294 return default_elf_select_rtx_section (mode, x, align);
13297 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13298 void
13299 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13300 HOST_WIDE_INT offset)
13302 /* When using per-function literal pools, we must ensure that any code
13303 section is aligned to the minimal instruction length, lest we get
13304 errors from the assembler re "unaligned instructions". */
13305 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13306 ASM_OUTPUT_ALIGN (f, 2);
13309 /* Costs. */
13311 /* Helper function for rtx cost calculation. Strip a shift expression
13312 from X. Returns the inner operand if successful, or the original
13313 expression on failure. */
13314 static rtx
13315 aarch64_strip_shift (rtx x)
13317 rtx op = x;
13319 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13320 we can convert both to ROR during final output. */
13321 if ((GET_CODE (op) == ASHIFT
13322 || GET_CODE (op) == ASHIFTRT
13323 || GET_CODE (op) == LSHIFTRT
13324 || GET_CODE (op) == ROTATERT
13325 || GET_CODE (op) == ROTATE)
13326 && CONST_INT_P (XEXP (op, 1)))
13327 return XEXP (op, 0);
13329 if (GET_CODE (op) == MULT
13330 && CONST_INT_P (XEXP (op, 1))
13331 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13332 return XEXP (op, 0);
13334 return x;
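/* Editorial note (illustrative, not in the original source): e.g. both
   (ashift (reg) (const_int 3)) and (mult (reg) (const_int 8)) are stripped
   to (reg) by the helper above, since such a multiply will be emitted as a
   shift; a multiply by a non-power-of-two is returned unchanged.  */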
13337 /* Helper function for rtx cost calculation. Strip an extend
13338 expression from X. Returns the inner operand if successful, or the
13339 original expression on failure. We deal with a number of possible
13340 canonicalization variations here. If STRIP_SHIFT is true, then
13341 we can strip off a shift also. */
13342 static rtx
13343 aarch64_strip_extend (rtx x, bool strip_shift)
13345 scalar_int_mode mode;
13346 rtx op = x;
13348 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13349 return op;
13351 if (GET_CODE (op) == AND
13352 && GET_CODE (XEXP (op, 0)) == MULT
13353 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13354 && CONST_INT_P (XEXP (op, 1))
13355 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13356 INTVAL (XEXP (op, 1))) != 0)
13357 return XEXP (XEXP (op, 0), 0);
13359 /* Now handle extended register, as this may also have an optional
13360 left shift by 1..4. */
13361 if (strip_shift
13362 && GET_CODE (op) == ASHIFT
13363 && CONST_INT_P (XEXP (op, 1))
13364 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13365 op = XEXP (op, 0);
13367 if (GET_CODE (op) == ZERO_EXTEND
13368 || GET_CODE (op) == SIGN_EXTEND)
13369 op = XEXP (op, 0);
13371 if (op != x)
13372 return op;
13374 return x;
13377 /* Helper function for rtx cost calculation. Strip extension as well as any
13378 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13379 successful, or the original expression on failure. */
13380 static rtx
13381 aarch64_strip_extend_vec_half (rtx x)
13383 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13385 x = XEXP (x, 0);
13386 if (GET_CODE (x) == VEC_SELECT
13387 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13388 XEXP (x, 1)))
13389 x = XEXP (x, 0);
13391 return x;
13394 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13395 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13396 operand if successful, or the original expression on failure. */
13397 static rtx
13398 aarch64_strip_duplicate_vec_elt (rtx x)
13400 if (GET_CODE (x) == VEC_DUPLICATE
13401 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13403 x = XEXP (x, 0);
13404 if (GET_CODE (x) == VEC_SELECT)
13405 x = XEXP (x, 0);
13406 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13407 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13408 x = XEXP (XEXP (x, 0), 0);
13410 return x;
13413 /* Return true iff CODE is a shift supported in combination
13414 with arithmetic instructions. */
13416 static bool
13417 aarch64_shift_p (enum rtx_code code)
13419 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13423 /* Return true iff X is a cheap shift without a sign extend. */
13425 static bool
13426 aarch64_cheap_mult_shift_p (rtx x)
13428 rtx op0, op1;
13430 op0 = XEXP (x, 0);
13431 op1 = XEXP (x, 1);
13433 if (!(aarch64_tune_params.extra_tuning_flags
13434 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13435 return false;
13437 if (GET_CODE (op0) == SIGN_EXTEND)
13438 return false;
13440 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13441 && UINTVAL (op1) <= 4)
13442 return true;
13444 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13445 return false;
13447 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13449 if (l2 > 0 && l2 <= 4)
13450 return true;
13452 return false;
13455 /* Helper function for rtx cost calculation. Calculate the cost of
13456 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13457 Return the calculated cost of the expression, recursing manually in to
13458 operands where needed. */
13460 static int
13461 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13463 rtx op0, op1;
13464 const struct cpu_cost_table *extra_cost
13465 = aarch64_tune_params.insn_extra_cost;
13466 int cost = 0;
13467 bool compound_p = (outer == PLUS || outer == MINUS);
13468 machine_mode mode = GET_MODE (x);
13470 gcc_checking_assert (code == MULT);
13472 op0 = XEXP (x, 0);
13473 op1 = XEXP (x, 1);
13475 if (VECTOR_MODE_P (mode))
13477 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13478 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13480 /* The select-operand-high-half versions of the instruction have the
13481 same cost as the three vector version - don't add the costs of the
13482 extension or selection into the costs of the multiply. */
13483 op0 = aarch64_strip_extend_vec_half (op0);
13484 op1 = aarch64_strip_extend_vec_half (op1);
13485 /* The by-element versions of the instruction have the same costs as
13486 the normal 3-vector version. We make an assumption that the input
13487 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13488 costing of a MUL by element pre RA is a bit optimistic. */
13489 op0 = aarch64_strip_duplicate_vec_elt (op0);
13490 op1 = aarch64_strip_duplicate_vec_elt (op1);
13492 cost += rtx_cost (op0, mode, MULT, 0, speed);
13493 cost += rtx_cost (op1, mode, MULT, 1, speed);
13494 if (speed)
13496 if (GET_CODE (x) == MULT)
13497 cost += extra_cost->vect.mult;
13498 /* This is to catch the SSRA costing currently flowing here. */
13499 else
13500 cost += extra_cost->vect.alu;
13502 return cost;
13505 /* Integer multiply/fma. */
13506 if (GET_MODE_CLASS (mode) == MODE_INT)
13508 /* The multiply will be canonicalized as a shift, cost it as such. */
13509 if (aarch64_shift_p (GET_CODE (x))
13510 || (CONST_INT_P (op1)
13511 && exact_log2 (INTVAL (op1)) > 0))
13513 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13514 || GET_CODE (op0) == SIGN_EXTEND;
13515 if (speed)
13517 if (compound_p)
13519 /* If the shift is considered cheap,
13520 then don't add any cost. */
13521 if (aarch64_cheap_mult_shift_p (x))
13523 else if (REG_P (op1))
13524 /* ARITH + shift-by-register. */
13525 cost += extra_cost->alu.arith_shift_reg;
13526 else if (is_extend)
13527 /* ARITH + extended register. We don't have a cost field
13528 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13529 cost += extra_cost->alu.extend_arith;
13530 else
13531 /* ARITH + shift-by-immediate. */
13532 cost += extra_cost->alu.arith_shift;
13534 else
13535 /* LSL (immediate). */
13536 cost += extra_cost->alu.shift;
13539 /* Strip extends as we will have costed them in the case above. */
13540 if (is_extend)
13541 op0 = aarch64_strip_extend (op0, true);
13543 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13545 return cost;
13548 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13549 compound and let the below cases handle it. After all, MNEG is a
13550 special-case alias of MSUB. */
13551 if (GET_CODE (op0) == NEG)
13553 op0 = XEXP (op0, 0);
13554 compound_p = true;
13557 /* Integer multiplies or FMAs have zero/sign extending variants. */
13558 if ((GET_CODE (op0) == ZERO_EXTEND
13559 && GET_CODE (op1) == ZERO_EXTEND)
13560 || (GET_CODE (op0) == SIGN_EXTEND
13561 && GET_CODE (op1) == SIGN_EXTEND))
13563 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13564 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13566 if (speed)
13568 if (compound_p)
13569 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13570 cost += extra_cost->mult[0].extend_add;
13571 else
13572 /* MUL/SMULL/UMULL. */
13573 cost += extra_cost->mult[0].extend;
13576 return cost;
13579 /* This is either an integer multiply or a MADD. In both cases
13580 we want to recurse and cost the operands. */
13581 cost += rtx_cost (op0, mode, MULT, 0, speed);
13582 cost += rtx_cost (op1, mode, MULT, 1, speed);
13584 if (speed)
13586 if (compound_p)
13587 /* MADD/MSUB. */
13588 cost += extra_cost->mult[mode == DImode].add;
13589 else
13590 /* MUL. */
13591 cost += extra_cost->mult[mode == DImode].simple;
13594 return cost;
13596 else
13598 if (speed)
13600 /* Floating-point FMA/FMUL can also support negations of the
13601 operands, unless the rounding mode is upward or downward in
13602 which case FNMUL is different from FMUL with operand negation. */
13603 bool neg0 = GET_CODE (op0) == NEG;
13604 bool neg1 = GET_CODE (op1) == NEG;
13605 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13607 if (neg0)
13608 op0 = XEXP (op0, 0);
13609 if (neg1)
13610 op1 = XEXP (op1, 0);
13613 if (compound_p)
13614 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13615 cost += extra_cost->fp[mode == DFmode].fma;
13616 else
13617 /* FMUL/FNMUL. */
13618 cost += extra_cost->fp[mode == DFmode].mult;
13621 cost += rtx_cost (op0, mode, MULT, 0, speed);
13622 cost += rtx_cost (op1, mode, MULT, 1, speed);
13623 return cost;
13627 static int
13628 aarch64_address_cost (rtx x,
13629 machine_mode mode,
13630 addr_space_t as ATTRIBUTE_UNUSED,
13631 bool speed)
13633 enum rtx_code c = GET_CODE (x);
13634 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13635 struct aarch64_address_info info;
13636 int cost = 0;
13637 info.shift = 0;
13639 if (!aarch64_classify_address (&info, x, mode, false))
13641 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13643 /* This is a CONST or SYMBOL ref which will be split
13644 in a different way depending on the code model in use.
13645 Cost it through the generic infrastructure. */
13646 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13647 /* Divide through by the cost of one instruction to
13648 bring it to the same units as the address costs. */
13649 cost_symbol_ref /= COSTS_N_INSNS (1);
13650 /* The cost is then the cost of preparing the address,
13651 followed by an immediate (possibly 0) offset. */
13652 return cost_symbol_ref + addr_cost->imm_offset;
13654 else
13656 /* This is most likely a jump table from a case
13657 statement. */
13658 return addr_cost->register_offset;
13662 switch (info.type)
13664 case ADDRESS_LO_SUM:
13665 case ADDRESS_SYMBOLIC:
13666 case ADDRESS_REG_IMM:
13667 cost += addr_cost->imm_offset;
13668 break;
13670 case ADDRESS_REG_WB:
13671 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13672 cost += addr_cost->pre_modify;
13673 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13675 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13676 if (nvectors == 3)
13677 cost += addr_cost->post_modify_ld3_st3;
13678 else if (nvectors == 4)
13679 cost += addr_cost->post_modify_ld4_st4;
13680 else
13681 cost += addr_cost->post_modify;
13683 else
13684 gcc_unreachable ();
13686 break;
13688 case ADDRESS_REG_REG:
13689 cost += addr_cost->register_offset;
13690 break;
13692 case ADDRESS_REG_SXTW:
13693 cost += addr_cost->register_sextend;
13694 break;
13696 case ADDRESS_REG_UXTW:
13697 cost += addr_cost->register_zextend;
13698 break;
13700 default:
13701 gcc_unreachable ();
13705 if (info.shift > 0)
13707 /* For the sake of calculating the cost of the shifted register
13708 component, we can treat same sized modes in the same way. */
13709 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13710 cost += addr_cost->addr_scale_costs.hi;
13711 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13712 cost += addr_cost->addr_scale_costs.si;
13713 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13714 cost += addr_cost->addr_scale_costs.di;
13715 else
13716 /* We can't tell, or this is a 128-bit vector. */
13717 cost += addr_cost->addr_scale_costs.ti;
13720 return cost;
13723 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13724 optimizing for speed. If PREDICTABLE_P is true then the branch is
13725 expected to be well predicted. */
13728 aarch64_branch_cost (bool speed_p, bool predictable_p)
13730 /* When optimizing for speed, use the cost of unpredictable branches. */
13731 const struct cpu_branch_cost *branch_costs =
13732 aarch64_tune_params.branch_costs;
13734 if (!speed_p || predictable_p)
13735 return branch_costs->predictable;
13736 else
13737 return branch_costs->unpredictable;
13740 /* Return true if X is a zero or sign extract
13741 usable in an ADD or SUB (extended register) instruction. */
13742 static bool
13743 aarch64_rtx_arith_op_extract_p (rtx x)
13745 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13746 No shift. */
13747 if (GET_CODE (x) == SIGN_EXTEND
13748 || GET_CODE (x) == ZERO_EXTEND)
13749 return REG_P (XEXP (x, 0));
13751 return false;
13754 static bool
13755 aarch64_frint_unspec_p (unsigned int u)
13757 switch (u)
13759 case UNSPEC_FRINTZ:
13760 case UNSPEC_FRINTP:
13761 case UNSPEC_FRINTM:
13762 case UNSPEC_FRINTA:
13763 case UNSPEC_FRINTN:
13764 case UNSPEC_FRINTX:
13765 case UNSPEC_FRINTI:
13766 return true;
13768 default:
13769 return false;
13773 /* Return true iff X is an rtx that will match an extr instruction,
13774 i.e. one described by the *extr<mode>5_insn family of patterns.
13775 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts
13776 involved on success and to NULL_RTX otherwise. */
13778 static bool
13779 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13781 rtx op0, op1;
13782 scalar_int_mode mode;
13783 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13784 return false;
13786 *res_op0 = NULL_RTX;
13787 *res_op1 = NULL_RTX;
13789 if (GET_CODE (x) != IOR)
13790 return false;
13792 op0 = XEXP (x, 0);
13793 op1 = XEXP (x, 1);
13795 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13796 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13798 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13799 if (GET_CODE (op1) == ASHIFT)
13800 std::swap (op0, op1);
13802 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13803 return false;
13805 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13806 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13808 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13809 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13811 *res_op0 = XEXP (op0, 0);
13812 *res_op1 = XEXP (op1, 0);
13813 return true;
13817 return false;
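/* Editorial note (illustrative, not in the original source): e.g. in DImode
   (ior (ashift x (const_int 16)) (lshiftrt y (const_int 48))) passes the
   check above because 16 + 48 == 64; it can be emitted as something like
   "extr xd, xx, xy, #48", with *RES_OP0 set to x and *RES_OP1 to y.  */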
13820 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13821 storing it in *COST. Result is true if the total cost of the operation
13822 has now been calculated. */
13823 static bool
13824 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13826 rtx inner;
13827 rtx comparator;
13828 enum rtx_code cmpcode;
13829 const struct cpu_cost_table *extra_cost
13830 = aarch64_tune_params.insn_extra_cost;
13832 if (COMPARISON_P (op0))
13834 inner = XEXP (op0, 0);
13835 comparator = XEXP (op0, 1);
13836 cmpcode = GET_CODE (op0);
13838 else
13840 inner = op0;
13841 comparator = const0_rtx;
13842 cmpcode = NE;
13845 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13847 /* Conditional branch. */
13848 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13849 return true;
13850 else
13852 if (cmpcode == NE || cmpcode == EQ)
13854 if (comparator == const0_rtx)
13856 /* TBZ/TBNZ/CBZ/CBNZ. */
13857 if (GET_CODE (inner) == ZERO_EXTRACT)
13858 /* TBZ/TBNZ. */
13859 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13860 ZERO_EXTRACT, 0, speed);
13861 else
13862 /* CBZ/CBNZ. */
13863 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13865 return true;
13867 if (register_operand (inner, VOIDmode)
13868 && aarch64_imm24 (comparator, VOIDmode))
13870 /* SUB and SUBS. */
13871 *cost += COSTS_N_INSNS (2);
13872 if (speed)
13873 *cost += extra_cost->alu.arith * 2;
13874 return true;
13877 else if (cmpcode == LT || cmpcode == GE)
13879 /* TBZ/TBNZ. */
13880 if (comparator == const0_rtx)
13881 return true;
13885 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13887 /* CCMP. */
13888 if (GET_CODE (op1) == COMPARE)
13890 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13891 if (XEXP (op1, 1) == const0_rtx)
13892 *cost += 1;
13893 if (speed)
13895 machine_mode mode = GET_MODE (XEXP (op1, 0));
13897 if (GET_MODE_CLASS (mode) == MODE_INT)
13898 *cost += extra_cost->alu.arith;
13899 else
13900 *cost += extra_cost->fp[mode == DFmode].compare;
13902 return true;
13905 /* It's a conditional operation based on the status flags,
13906 so it must be some flavor of CSEL. */
13908 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13909 if (GET_CODE (op1) == NEG
13910 || GET_CODE (op1) == NOT
13911 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13912 op1 = XEXP (op1, 0);
13913 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13915 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13916 op1 = XEXP (op1, 0);
13917 op2 = XEXP (op2, 0);
13919 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13921 inner = XEXP (op1, 0);
13922 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13923 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13924 op1 = XEXP (inner, 0);
13926 else if (op1 == constm1_rtx || op1 == const1_rtx)
13928 /* Use CSINV or CSINC. */
13929 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13930 return true;
13932 else if (op2 == constm1_rtx || op2 == const1_rtx)
13934 /* Use CSINV or CSINC. */
13935 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13936 return true;
13939 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13940 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13941 return true;
13944 /* We don't know what this is, cost all operands. */
13945 return false;
13948 /* Check whether X is a bitfield operation of the form shift + extend that
13949 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13950 operand to which the bitfield operation is applied. Otherwise return
13951 NULL_RTX. */
13953 static rtx
13954 aarch64_extend_bitfield_pattern_p (rtx x)
13956 rtx_code outer_code = GET_CODE (x);
13957 machine_mode outer_mode = GET_MODE (x);
13959 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13960 && outer_mode != SImode && outer_mode != DImode)
13961 return NULL_RTX;
13963 rtx inner = XEXP (x, 0);
13964 rtx_code inner_code = GET_CODE (inner);
13965 machine_mode inner_mode = GET_MODE (inner);
13966 rtx op = NULL_RTX;
13968 switch (inner_code)
13970 case ASHIFT:
13971 if (CONST_INT_P (XEXP (inner, 1))
13972 && (inner_mode == QImode || inner_mode == HImode))
13973 op = XEXP (inner, 0);
13974 break;
13975 case LSHIFTRT:
13976 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13977 && (inner_mode == QImode || inner_mode == HImode))
13978 op = XEXP (inner, 0);
13979 break;
13980 case ASHIFTRT:
13981 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13982 && (inner_mode == QImode || inner_mode == HImode))
13983 op = XEXP (inner, 0);
13984 break;
13985 default:
13986 break;
13989 return op;
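/* Editorial note (illustrative, not in the original source): e.g.
   (zero_extend:SI (ashift:HI (reg) (const_int 3))) and
   (sign_extend:SI (ashiftrt:QI (reg) (const_int 2))) both return the inner
   (reg), corresponding to UBFIZ and SBFX respectively.  */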
13992 /* Return true if the mask and a shift amount from an RTX of the form
13993 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13994 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13996 bool
13997 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13998 rtx shft_amnt)
14000 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
14001 && INTVAL (mask) > 0
14002 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
14003 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
14004 && (UINTVAL (mask)
14005 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
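/* Editorial note (illustrative, not in the original source): e.g. with
   MODE == SImode, MASK == 0xff00 and SHFT_AMNT == 8 the predicate above
   holds: the mask shifted right by 8 is the contiguous value 0xff and no
   masked bit lies below the shift, so (x << 8) & 0xff00 can become
   something like "ubfiz w0, w1, #8, #8".  */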
14008 /* Return true if the masks and a shift amount from an RTX of the form
14009 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
14010 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
14012 bool
14013 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
14014 unsigned HOST_WIDE_INT mask1,
14015 unsigned HOST_WIDE_INT shft_amnt,
14016 unsigned HOST_WIDE_INT mask2)
14018 unsigned HOST_WIDE_INT t;
14020 /* Verify that there is no overlap in what bits are set in the two masks. */
14021 if (mask1 != ~mask2)
14022 return false;
14024 /* Verify that mask2 is not all zeros or ones. */
14025 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
14026 return false;
14028 /* The shift amount should always be less than the mode size. */
14029 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
14031 /* Verify that the mask being shifted is contiguous and would be in the
14032 least significant bits after shifting by shft_amnt. */
14033 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
14034 return (t == (t & -t));
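/* Editorial note (illustrative, not in the original source): e.g. with
   MASK2 == 0xff00, SHFT_AMNT == 8 and MASK1 == ~0xff00 the function above
   returns true, since 0xff00 + (1 << 8) is a power of two; the combination
   corresponds to a BFI inserting an 8-bit field at bit position 8.  */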
14037 /* Return true if X is an RTX representing an operation in the ABD family
14038 of instructions. */
14040 static bool
14041 aarch64_abd_rtx_p (rtx x)
14043 if (GET_CODE (x) != MINUS)
14044 return false;
14045 rtx max_arm = XEXP (x, 0);
14046 rtx min_arm = XEXP (x, 1);
14047 if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
14048 return false;
14049 bool signed_p = GET_CODE (max_arm) == SMAX;
14050 if (signed_p && GET_CODE (min_arm) != SMIN)
14051 return false;
14052 else if (!signed_p && GET_CODE (min_arm) != UMIN)
14053 return false;
14055 rtx maxop0 = XEXP (max_arm, 0);
14056 rtx maxop1 = XEXP (max_arm, 1);
14057 rtx minop0 = XEXP (min_arm, 0);
14058 rtx minop1 = XEXP (min_arm, 1);
14059 return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
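/* Editorial note (illustrative, not in the original source): e.g.
   (minus (smax a b) (smin a b)) satisfies the checks above and corresponds
   to an SABD-style absolute-difference operation; the umax/umin form
   corresponds to UABD.  */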
14062 /* Calculate the cost of calculating X, storing it in *COST. Result
14063 is true if the total cost of the operation has now been calculated. */
14064 static bool
14065 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
14066 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
14068 rtx op0, op1, op2;
14069 const struct cpu_cost_table *extra_cost
14070 = aarch64_tune_params.insn_extra_cost;
14071 rtx_code code = GET_CODE (x);
14072 scalar_int_mode int_mode;
14074 /* By default, assume that everything has equivalent cost to the
14075 cheapest instruction. Any additional costs are applied as a delta
14076 above this default. */
14077 *cost = COSTS_N_INSNS (1);
14079 switch (code)
14081 case SET:
14082 /* The cost depends entirely on the operands to SET. */
14083 *cost = 0;
14084 op0 = SET_DEST (x);
14085 op1 = SET_SRC (x);
14087 switch (GET_CODE (op0))
14089 case MEM:
14090 if (speed)
14092 rtx address = XEXP (op0, 0);
14093 if (VECTOR_MODE_P (mode))
14094 *cost += extra_cost->ldst.storev;
14095 else if (GET_MODE_CLASS (mode) == MODE_INT)
14096 *cost += extra_cost->ldst.store;
14097 else if (mode == SFmode || mode == SDmode)
14098 *cost += extra_cost->ldst.storef;
14099 else if (mode == DFmode || mode == DDmode)
14100 *cost += extra_cost->ldst.stored;
14102 *cost +=
14103 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14104 0, speed));
14107 *cost += rtx_cost (op1, mode, SET, 1, speed);
14108 return true;
14110 case SUBREG:
14111 if (! REG_P (SUBREG_REG (op0)))
14112 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
14114 /* Fall through. */
14115 case REG:
14116 /* The cost is one per vector-register copied. */
14117 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
14119 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
14120 *cost = COSTS_N_INSNS (nregs);
14122 /* const0_rtx is in general free, but we will use an
14123 instruction to set a register to 0. */
14124 else if (REG_P (op1) || op1 == const0_rtx)
14126 /* The cost is 1 per register copied. */
14127 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
14128 *cost = COSTS_N_INSNS (nregs);
14130 else
14131 /* Cost is just the cost of the RHS of the set. */
14132 *cost += rtx_cost (op1, mode, SET, 1, speed);
14133 return true;
14135 case ZERO_EXTRACT:
14136 case SIGN_EXTRACT:
14137 /* Bit-field insertion. Strip any redundant widening of
14138 the RHS to meet the width of the target. */
14139 if (SUBREG_P (op1))
14140 op1 = SUBREG_REG (op1);
14141 if ((GET_CODE (op1) == ZERO_EXTEND
14142 || GET_CODE (op1) == SIGN_EXTEND)
14143 && CONST_INT_P (XEXP (op0, 1))
14144 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
14145 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
14146 op1 = XEXP (op1, 0);
14148 if (CONST_INT_P (op1))
14150 /* MOV immediate is assumed to always be cheap. */
14151 *cost = COSTS_N_INSNS (1);
14153 else
14155 /* BFM. */
14156 if (speed)
14157 *cost += extra_cost->alu.bfi;
14158 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
14161 return true;
14163 default:
14164 /* We can't make sense of this, assume default cost. */
14165 *cost = COSTS_N_INSNS (1);
14166 return false;
14168 return false;
14170 case CONST_INT:
14171 /* If an instruction can incorporate a constant within the
14172 instruction, the instruction's expression avoids calling
14173 rtx_cost() on the constant. If rtx_cost() is called on a
14174 constant, then it is usually because the constant must be
14175 moved into a register by one or more instructions.
14177 The exception is constant 0, which can be expressed
14178 as XZR/WZR and is therefore free. The one case in which 0 is not
14179 free is (set (reg) (const0_rtx)), where we must cost
14180 the move. However, we can catch that when we cost the SET, so
14181 we don't need to consider that here. */
14182 if (x == const0_rtx)
14183 *cost = 0;
14184 else
14186 /* To a first approximation, the cost of building any other
14187 constant is proportional to the number of instructions
14188 required to build that constant. This is true whether we
14189 are compiling for SPEED or otherwise. */
14190 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
14191 ? SImode : DImode;
14192 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
14193 (NULL_RTX, x, false, imode));
14195 return true;
14197 case CONST_DOUBLE:
14199 /* First determine number of instructions to do the move
14200 as an integer constant. */
14201 if (!aarch64_float_const_representable_p (x)
14202 && !aarch64_can_const_movi_rtx_p (x, mode)
14203 && aarch64_float_const_rtx_p (x))
14205 unsigned HOST_WIDE_INT ival;
14206 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
14207 gcc_assert (succeed);
14209 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
14210 ? DImode : SImode;
14211 int ncost = aarch64_internal_mov_immediate
14212 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
14213 *cost += COSTS_N_INSNS (ncost);
14214 return true;
14217 if (speed)
14219 /* mov[df,sf]_aarch64. */
14220 if (aarch64_float_const_representable_p (x))
14221 /* FMOV (scalar immediate). */
14222 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
14223 else if (!aarch64_float_const_zero_rtx_p (x))
14225 /* This will be a load from memory. */
14226 if (mode == DFmode || mode == DDmode)
14227 *cost += extra_cost->ldst.loadd;
14228 else
14229 *cost += extra_cost->ldst.loadf;
14231 else
14232 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14233 or MOV v0.s[0], wzr - neither of which is modeled by the
14234 cost tables. Just use the default cost. */
14239 return true;
14241 case MEM:
14242 if (speed)
14244 /* For loads we want the base cost of a load, plus an
14245 approximation for the additional cost of the addressing
14246 mode. */
14247 rtx address = XEXP (x, 0);
14248 if (VECTOR_MODE_P (mode))
14249 *cost += extra_cost->ldst.loadv;
14250 else if (GET_MODE_CLASS (mode) == MODE_INT)
14251 *cost += extra_cost->ldst.load;
14252 else if (mode == SFmode || mode == SDmode)
14253 *cost += extra_cost->ldst.loadf;
14254 else if (mode == DFmode || mode == DDmode)
14255 *cost += extra_cost->ldst.loadd;
14257 *cost +=
14258 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14259 0, speed));
14262 return true;
14264 case NEG:
14265 op0 = XEXP (x, 0);
14267 if (VECTOR_MODE_P (mode))
14269 /* Many vector comparison operations are represented as NEG
14270 of a comparison. */
14271 if (COMPARISON_P (op0))
14273 rtx op00 = XEXP (op0, 0);
14274 rtx op01 = XEXP (op0, 1);
14275 machine_mode inner_mode = GET_MODE (op00);
14276 /* FACGE/FACGT. */
14277 if (GET_MODE_CLASS (inner_mode) == MODE_VECTOR_FLOAT
14278 && GET_CODE (op00) == ABS
14279 && GET_CODE (op01) == ABS)
14281 op00 = XEXP (op00, 0);
14282 op01 = XEXP (op01, 0);
14284 *cost += rtx_cost (op00, inner_mode, GET_CODE (op0), 0, speed);
14285 *cost += rtx_cost (op01, inner_mode, GET_CODE (op0), 1, speed);
14286 if (speed)
14287 *cost += extra_cost->vect.alu;
14288 return true;
14290 if (speed)
14292 /* FNEG. */
14293 *cost += extra_cost->vect.alu;
14295 return false;
14298 if (GET_MODE_CLASS (mode) == MODE_INT)
14300 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14301 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14303 /* CSETM. */
14304 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14305 return true;
14308 /* Cost this as SUB wzr, X. */
14309 op0 = CONST0_RTX (mode);
14310 op1 = XEXP (x, 0);
14311 goto cost_minus;
14314 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14316 /* Support (neg(fma...)) as a single instruction only if
14317 sign of zeros is unimportant. This matches the decision
14318 making in aarch64.md. */
14319 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14321 /* FNMADD. */
14322 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14323 return true;
14325 if (GET_CODE (op0) == MULT)
14327 /* FNMUL. */
14328 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14329 return true;
14331 if (speed)
14332 /* FNEG. */
14333 *cost += extra_cost->fp[mode == DFmode].neg;
14334 return false;
14337 return false;
14339 case CLRSB:
14340 case CLZ:
14341 if (speed)
14343 if (VECTOR_MODE_P (mode))
14344 *cost += extra_cost->vect.alu;
14345 else
14346 *cost += extra_cost->alu.clz;
14349 return false;
14351 case CTZ:
14352 if (VECTOR_MODE_P (mode))
14354 *cost = COSTS_N_INSNS (3);
14355 if (speed)
14356 *cost += extra_cost->vect.alu * 3;
14358 else if (TARGET_CSSC)
14360 *cost = COSTS_N_INSNS (1);
14361 if (speed)
14362 *cost += extra_cost->alu.clz;
14364 else
14366 *cost = COSTS_N_INSNS (2);
14367 if (speed)
14368 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14370 return false;
14372 case COMPARE:
14373 op0 = XEXP (x, 0);
14374 op1 = XEXP (x, 1);
14376 if (op1 == const0_rtx
14377 && GET_CODE (op0) == AND)
14379 x = op0;
14380 mode = GET_MODE (op0);
14381 goto cost_logic;
14384 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14386 /* TODO: A write to the CC flags possibly costs extra; this
14387 needs encoding in the cost tables. */
14389 mode = GET_MODE (op0);
14390 /* ANDS. */
14391 if (GET_CODE (op0) == AND)
14393 x = op0;
14394 goto cost_logic;
14397 if (GET_CODE (op0) == PLUS)
14399 /* ADDS (and CMN alias). */
14400 x = op0;
14401 goto cost_plus;
14404 if (GET_CODE (op0) == MINUS)
14406 /* SUBS. */
14407 x = op0;
14408 goto cost_minus;
14411 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14412 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14413 && CONST_INT_P (XEXP (op0, 2)))
14415 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14416 Handle it here directly rather than going to cost_logic
14417 since we know the immediate generated for the TST is valid
14418 so we can avoid creating an intermediate rtx for it only
14419 for costing purposes. */
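 /* For example (illustrative only): a ZERO_EXTRACT of 8 bits starting at
    bit 0, compared against zero, corresponds to TST x0, #0xff - the
    extracted field simply becomes the TST immediate mask. */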
14420 if (speed)
14421 *cost += extra_cost->alu.logical;
14423 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14424 ZERO_EXTRACT, 0, speed);
14425 return true;
14428 if (GET_CODE (op1) == NEG)
14430 /* CMN. */
14431 if (speed)
14432 *cost += extra_cost->alu.arith;
14434 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14435 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14436 return true;
14439 /* CMP.
14441 Compare can freely swap the order of operands, and
14442 canonicalization puts the more complex operation first.
14443 But the integer MINUS logic expects the shift/extend
14444 operation in op1. */
14445 if (! (REG_P (op0)
14446 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14448 op0 = XEXP (x, 1);
14449 op1 = XEXP (x, 0);
14451 goto cost_minus;
14454 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14456 /* FCMP. */
14457 if (speed)
14458 *cost += extra_cost->fp[mode == DFmode].compare;
14460 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14462 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14463 /* FCMP supports constant 0.0 for no extra cost. */
14464 return true;
14466 return false;
14469 if (VECTOR_MODE_P (mode))
14471 /* Vector compare. */
14472 if (speed)
14473 *cost += extra_cost->vect.alu;
14475 if (aarch64_float_const_zero_rtx_p (op1))
14477 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14478 cost. */
14479 return true;
14481 return false;
14483 return false;
14485 case MINUS:
14487 op0 = XEXP (x, 0);
14488 op1 = XEXP (x, 1);
14490 cost_minus:
14491 if (VECTOR_MODE_P (mode))
14493 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14494 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14496 /* Recognise the SABD and UABD operation here.
14497 Recursion from the PLUS case will catch the accumulating
14498 forms. */
14499 if (aarch64_abd_rtx_p (x))
14501 if (speed)
14502 *cost += extra_cost->vect.alu;
14503 return true;
14505 /* SUBL2 and SUBW2.
14506 The select-operand-high-half versions of the sub instruction
14507 have the same cost as the regular three-operand vector version,
14508 so don't add the cost of the select to the cost of the sub. */
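 /* As an illustrative example: subtracting the sign-extended high halves
    of two V8HI inputs is a single SSUBL2, so the vec_selects that pick
    the high halves should not be costed on top of the subtract itself. */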
14510 op0 = aarch64_strip_extend_vec_half (op0);
14511 op1 = aarch64_strip_extend_vec_half (op1);
14515 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14517 /* Detect valid immediates. */
14518 if ((GET_MODE_CLASS (mode) == MODE_INT
14519 || (GET_MODE_CLASS (mode) == MODE_CC
14520 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14521 && CONST_INT_P (op1)
14522 && aarch64_uimm12_shift (INTVAL (op1)))
14524 if (speed)
14525 /* SUB(S) (immediate). */
14526 *cost += extra_cost->alu.arith;
14527 return true;
14530 /* Look for SUB (extended register). */
14531 if (is_a <scalar_int_mode> (mode)
14532 && aarch64_rtx_arith_op_extract_p (op1))
14534 if (speed)
14535 *cost += extra_cost->alu.extend_arith;
14537 op1 = aarch64_strip_extend (op1, true);
14538 *cost += rtx_cost (op1, VOIDmode,
14539 (enum rtx_code) GET_CODE (op1), 0, speed);
14540 return true;
14543 rtx new_op1 = aarch64_strip_extend (op1, false);
14545 /* Cost this as an FMA-alike operation. */
14546 if ((GET_CODE (new_op1) == MULT
14547 || aarch64_shift_p (GET_CODE (new_op1)))
14548 && code != COMPARE)
14550 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14551 (enum rtx_code) code,
14552 speed);
14553 return true;
14556 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14558 if (speed)
14560 if (VECTOR_MODE_P (mode))
14562 /* Vector SUB. */
14563 *cost += extra_cost->vect.alu;
14565 else if (GET_MODE_CLASS (mode) == MODE_INT)
14567 /* SUB(S). */
14568 *cost += extra_cost->alu.arith;
14570 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14572 /* FSUB. */
14573 *cost += extra_cost->fp[mode == DFmode].addsub;
14576 return true;
14579 case PLUS:
14581 rtx new_op0;
14583 op0 = XEXP (x, 0);
14584 op1 = XEXP (x, 1);
14586 cost_plus:
14587 if (VECTOR_MODE_P (mode))
14589 /* ADDL2 and ADDW2. */
14590 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14591 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14593 /* The select-operand-high-half versions of the add instruction
14594 have the same cost as the regular three-operand vector version,
14595 so don't add the cost of the select to the cost of the add. */
14597 op0 = aarch64_strip_extend_vec_half (op0);
14598 op1 = aarch64_strip_extend_vec_half (op1);
14602 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14603 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14605 /* CSINC. */
14606 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14607 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14608 return true;
14611 if (GET_MODE_CLASS (mode) == MODE_INT
14612 && (aarch64_plus_immediate (op1, mode)
14613 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14615 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14617 if (speed)
14619 /* ADD (immediate). */
14620 *cost += extra_cost->alu.arith;
14622 /* Some tunings prefer not to use the VL-based scalar ops.
14623 Increase the cost of the poly immediate to prevent their
14624 formation. */
14625 if (GET_CODE (op1) == CONST_POLY_INT
14626 && (aarch64_tune_params.extra_tuning_flags
14627 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14628 *cost += COSTS_N_INSNS (1);
14630 return true;
14633 if (aarch64_pluslong_immediate (op1, mode))
14635 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14636 if ((INTVAL (op1) & 0xfff) != 0)
14637 *cost += COSTS_N_INSNS (1);
14639 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14640 return true;
14643 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14645 /* Look for ADD (extended register). */
14646 if (is_a <scalar_int_mode> (mode)
14647 && aarch64_rtx_arith_op_extract_p (op0))
14649 if (speed)
14650 *cost += extra_cost->alu.extend_arith;
14652 op0 = aarch64_strip_extend (op0, true);
14653 *cost += rtx_cost (op0, VOIDmode,
14654 (enum rtx_code) GET_CODE (op0), 0, speed);
14655 return true;
14658 /* Strip any extend; leave shifts behind, as we will
14659 cost them through mult_cost. */
14660 new_op0 = aarch64_strip_extend (op0, false);
14662 if (GET_CODE (new_op0) == MULT
14663 || aarch64_shift_p (GET_CODE (new_op0)))
14665 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14666 speed);
14667 return true;
14670 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14672 if (speed)
14674 if (VECTOR_MODE_P (mode))
14676 /* Vector ADD. */
14677 *cost += extra_cost->vect.alu;
14679 else if (GET_MODE_CLASS (mode) == MODE_INT)
14681 /* ADD. */
14682 *cost += extra_cost->alu.arith;
14684 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14686 /* FADD. */
14687 *cost += extra_cost->fp[mode == DFmode].addsub;
14690 return true;
14693 case BITREVERSE:
14694 case BSWAP:
14695 *cost = COSTS_N_INSNS (1);
14697 if (speed)
14699 if (VECTOR_MODE_P (mode))
14700 *cost += extra_cost->vect.alu;
14701 else
14702 *cost += extra_cost->alu.rev;
14704 return false;
14706 case IOR:
14707 if (aarch_rev16_p (x))
14709 *cost = COSTS_N_INSNS (1);
14711 if (speed)
14713 if (VECTOR_MODE_P (mode))
14714 *cost += extra_cost->vect.alu;
14715 else
14716 *cost += extra_cost->alu.rev;
14718 return true;
14721 if (aarch64_extr_rtx_p (x, &op0, &op1))
14723 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14724 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14725 if (speed)
14726 *cost += extra_cost->alu.shift;
14728 return true;
14730 /* Fall through. */
14731 case XOR:
14732 case AND:
14733 cost_logic:
14734 op0 = XEXP (x, 0);
14735 op1 = XEXP (x, 1);
14737 if (VECTOR_MODE_P (mode))
14739 if (speed)
14740 *cost += extra_cost->vect.alu;
14741 return true;
14744 if (code == AND
14745 && GET_CODE (op0) == MULT
14746 && CONST_INT_P (XEXP (op0, 1))
14747 && CONST_INT_P (op1)
14748 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14749 INTVAL (op1)) != 0)
14751 /* This is a UBFM/SBFM. */
14752 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14753 if (speed)
14754 *cost += extra_cost->alu.bfx;
14755 return true;
14758 if (is_int_mode (mode, &int_mode))
14760 if (CONST_INT_P (op1))
14762 /* We have a mask + shift version of a UBFIZ
14763 i.e. the *andim_ashift<mode>_bfiz pattern. */
14764 if (GET_CODE (op0) == ASHIFT
14765 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14766 XEXP (op0, 1)))
14768 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14769 (enum rtx_code) code, 0, speed);
14770 if (speed)
14771 *cost += extra_cost->alu.bfx;
14773 return true;
14775 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14777 /* We possibly get the immediate for free; this is not
14778 modelled. */
14779 *cost += rtx_cost (op0, int_mode,
14780 (enum rtx_code) code, 0, speed);
14781 if (speed)
14782 *cost += extra_cost->alu.logical;
14784 return true;
14787 else
14789 rtx new_op0 = op0;
14791 /* Handle ORN, EON, or BIC. */
14792 if (GET_CODE (op0) == NOT)
14793 op0 = XEXP (op0, 0);
14795 new_op0 = aarch64_strip_shift (op0);
14797 /* If we had a shift on op0 then this is a logical-shift-
14798 by-register/immediate operation. Otherwise, this is just
14799 a logical operation. */
14800 if (speed)
14802 if (new_op0 != op0)
14804 /* Shift by immediate. */
14805 if (CONST_INT_P (XEXP (op0, 1)))
14806 *cost += extra_cost->alu.log_shift;
14807 else
14808 *cost += extra_cost->alu.log_shift_reg;
14810 else
14811 *cost += extra_cost->alu.logical;
14814 /* In both cases we want to cost both operands. */
14815 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14816 0, speed);
14817 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14818 1, speed);
14820 return true;
14823 return false;
14825 case NOT:
14826 x = XEXP (x, 0);
14827 op0 = aarch64_strip_shift (x);
14829 if (VECTOR_MODE_P (mode))
14831 /* Vector NOT. */
14832 *cost += extra_cost->vect.alu;
14833 return false;
14836 /* MVN-shifted-reg. */
14837 if (op0 != x)
14839 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14841 if (speed)
14842 *cost += extra_cost->alu.log_shift;
14844 return true;
14846 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
14847 Handle the second form here, taking care that 'a' in the above can
14848 be a shift. */
14849 else if (GET_CODE (op0) == XOR)
14851 rtx newop0 = XEXP (op0, 0);
14852 rtx newop1 = XEXP (op0, 1);
14853 rtx op0_stripped = aarch64_strip_shift (newop0);
14855 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14856 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14858 if (speed)
14860 if (op0_stripped != newop0)
14861 *cost += extra_cost->alu.log_shift;
14862 else
14863 *cost += extra_cost->alu.logical;
14866 return true;
14868 /* MVN. */
14869 if (speed)
14870 *cost += extra_cost->alu.logical;
14872 return false;
14874 case ZERO_EXTEND:
14876 op0 = XEXP (x, 0);
14877 /* If a value is written in SI mode, then zero extended to DI
14878 mode, the operation will in general be free, as a write to
14879 a 'w' register implicitly zeroes the upper bits of an 'x'
14880 register. However, if this is
14882 (set (reg) (zero_extend (reg)))
14884 we must cost the explicit register move. */
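 /* Illustratively: ADD W0, W1, W2 already clears bits 63:32 of X0, so a
    following (zero_extend:DI ...) of that result is free, whereas a bare
    (set (reg:DI) (zero_extend:DI (reg:SI))) still needs an explicit MOV
    of the W register. */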
14885 if (mode == DImode
14886 && GET_MODE (op0) == SImode)
14888 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14890 /* If OP_COST is non-zero, then the cost of the zero extend
14891 is effectively the cost of the inner operation. Otherwise
14892 we have a MOV instruction and we take the cost from the MOV
14893 itself. This is true independently of whether we are
14894 optimizing for space or time. */
14895 if (op_cost)
14896 *cost = op_cost;
14898 return true;
14900 else if (MEM_P (op0))
14902 /* All loads can zero extend to any size for free. */
14903 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14904 return true;
14907 op0 = aarch64_extend_bitfield_pattern_p (x);
14908 if (op0)
14910 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14911 if (speed)
14912 *cost += extra_cost->alu.bfx;
14913 return true;
14916 if (speed)
14918 if (VECTOR_MODE_P (mode))
14920 /* UMOV. */
14921 *cost += extra_cost->vect.alu;
14923 else
14925 /* We generate an AND instead of UXTB/UXTH. */
14926 *cost += extra_cost->alu.logical;
14929 return false;
14931 case SIGN_EXTEND:
14932 if (MEM_P (XEXP (x, 0)))
14934 /* LDRSH. */
14935 if (speed)
14937 rtx address = XEXP (XEXP (x, 0), 0);
14938 *cost += extra_cost->ldst.load_sign_extend;
14940 *cost +=
14941 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14942 0, speed));
14944 return true;
14947 op0 = aarch64_extend_bitfield_pattern_p (x);
14948 if (op0)
14950 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14951 if (speed)
14952 *cost += extra_cost->alu.bfx;
14953 return true;
14956 if (speed)
14958 if (VECTOR_MODE_P (mode))
14959 *cost += extra_cost->vect.alu;
14960 else
14961 *cost += extra_cost->alu.extend;
14963 return false;
14965 case ROTATE:
14966 case ROTATERT:
14967 case LSHIFTRT:
14968 case ASHIFTRT:
14969 case ASHIFT:
14970 op0 = XEXP (x, 0);
14971 op1 = XEXP (x, 1);
14973 if (CONST_INT_P (op1))
14975 if (speed)
14977 if (VECTOR_MODE_P (mode))
14979 /* Vector shift (immediate). */
14980 *cost += extra_cost->vect.alu;
14982 else
14984 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
14985 These are all aliases. */
14986 *cost += extra_cost->alu.shift;
14990 /* We can incorporate zero/sign extend for free. */
14991 if (GET_CODE (op0) == ZERO_EXTEND
14992 || GET_CODE (op0) == SIGN_EXTEND)
14993 op0 = XEXP (op0, 0);
14995 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14996 return true;
14998 else
15000 if (VECTOR_MODE_P (mode))
15002 if (speed)
15003 /* Vector shift (register). */
15004 *cost += extra_cost->vect.alu;
15006 else
15008 if (speed)
15009 /* LSLV, ASRV. */
15010 *cost += extra_cost->alu.shift_reg;
15012 /* The register shift amount may be in a shorter mode expressed
15013 as a lowpart SUBREG. For costing purposes just look inside. */
15014 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
15015 op1 = SUBREG_REG (op1);
15016 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
15017 && CONST_INT_P (XEXP (op1, 1))
15018 && known_eq (INTVAL (XEXP (op1, 1)),
15019 GET_MODE_BITSIZE (mode) - 1))
15021 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
15022 /* We already demanded XEXP (op1, 0) to be REG_P, so
15023 don't recurse into it. */
15024 return true;
15027 return false; /* All arguments need to be in registers. */
15030 case SYMBOL_REF:
15032 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
15033 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
15035 /* LDR. */
15036 if (speed)
15037 *cost += extra_cost->ldst.load;
15039 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
15040 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
15042 /* ADRP, followed by ADD. */
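 /* Illustratively, a symbol address in the small code model is formed as
    ADRP x0, sym followed by ADD x0, x0, :lo12:sym, i.e. two ALU
    instructions. */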
15043 *cost += COSTS_N_INSNS (1);
15044 if (speed)
15045 *cost += 2 * extra_cost->alu.arith;
15047 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
15048 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
15050 /* ADR. */
15051 if (speed)
15052 *cost += extra_cost->alu.arith;
15055 if (flag_pic)
15057 /* One extra load instruction, after accessing the GOT. */
15058 *cost += COSTS_N_INSNS (1);
15059 if (speed)
15060 *cost += extra_cost->ldst.load;
15062 return true;
15064 case HIGH:
15065 case LO_SUM:
15066 /* ADRP/ADD (immediate). */
15067 if (speed)
15068 *cost += extra_cost->alu.arith;
15069 return true;
15071 case ZERO_EXTRACT:
15072 case SIGN_EXTRACT:
15073 /* UBFX/SBFX. */
15074 if (speed)
15076 if (VECTOR_MODE_P (mode))
15077 *cost += extra_cost->vect.alu;
15078 else
15079 *cost += extra_cost->alu.bfx;
15082 /* We can trust that the immediates used will be correct (there
15083 are no by-register forms), so we need only cost op0. */
15084 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
15085 return true;
15087 case MULT:
15088 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
15089 /* aarch64_rtx_mult_cost always handles recursion to its
15090 operands. */
15091 return true;
15093 case MOD:
15094 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15095 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
15096 that of an unconditional negate. This case should only ever be reached through
15097 the set_smod_pow2_cheap check in expmed.cc. */
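 /* A sketch of the expansion for x % 4 in SImode (register choices are
    illustrative only):
      negs  w1, w0
      and   w0, w0, 3
      and   w1, w1, 3
      csneg w0, w0, w1, mi
    i.e. four instructions, matching the COSTS_N_INSNS (4) baseline set
    below. */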
15098 if (CONST_INT_P (XEXP (x, 1))
15099 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
15100 && (mode == SImode || mode == DImode))
15102 /* We expand to 4 instructions. Reset the baseline. */
15103 *cost = COSTS_N_INSNS (4);
15105 if (speed)
15106 *cost += 2 * extra_cost->alu.logical
15107 + 2 * extra_cost->alu.arith;
15109 return true;
15112 /* Fall-through. */
15113 case UMOD:
15114 if (speed)
15116 /* Slightly prefer UMOD over SMOD. */
15117 if (VECTOR_MODE_P (mode))
15118 *cost += extra_cost->vect.alu;
15119 else if (GET_MODE_CLASS (mode) == MODE_INT)
15120 *cost += (extra_cost->mult[mode == DImode].add
15121 + extra_cost->mult[mode == DImode].idiv
15122 + (code == MOD ? 1 : 0));
15124 return false; /* All arguments need to be in registers. */
15126 case DIV:
15127 case UDIV:
15128 case SQRT:
15129 if (speed)
15131 if (VECTOR_MODE_P (mode))
15132 *cost += extra_cost->vect.alu;
15133 else if (GET_MODE_CLASS (mode) == MODE_INT)
15134 /* There is no integer SQRT, so only DIV and UDIV can get
15135 here. */
15136 *cost += (extra_cost->mult[mode == DImode].idiv
15137 /* Slightly prefer UDIV over SDIV. */
15138 + (code == DIV ? 1 : 0));
15139 else
15140 *cost += extra_cost->fp[mode == DFmode].div;
15142 return false; /* All arguments need to be in registers. */
15144 case IF_THEN_ELSE:
15145 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
15146 XEXP (x, 2), cost, speed);
15148 case EQ:
15149 case NE:
15150 case GT:
15151 case GTU:
15152 case LT:
15153 case LTU:
15154 case GE:
15155 case GEU:
15156 case LE:
15157 case LEU:
15159 return false; /* All arguments must be in registers. */
15161 case FMA:
15162 op0 = XEXP (x, 0);
15163 op1 = XEXP (x, 1);
15164 op2 = XEXP (x, 2);
15166 if (speed)
15168 if (VECTOR_MODE_P (mode))
15169 *cost += extra_cost->vect.alu;
15170 else
15171 *cost += extra_cost->fp[mode == DFmode].fma;
15174 /* FMSUB, FNMADD, and FNMSUB are free. */
15175 if (GET_CODE (op0) == NEG)
15176 op0 = XEXP (op0, 0);
15178 if (GET_CODE (op2) == NEG)
15179 op2 = XEXP (op2, 0);
15181 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15182 and the by-element operand as operand 0. */
15183 if (GET_CODE (op1) == NEG)
15184 op1 = XEXP (op1, 0);
15186 /* Catch vector-by-element operations. The by-element operand can
15187 either be (vec_duplicate (vec_select (x))) or just
15188 (vec_select (x)), depending on whether we are multiplying by
15189 a vector or a scalar.
15191 Canonicalization is not very good in these cases: FMA4 will put the
15192 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
15193 if (GET_CODE (op0) == VEC_DUPLICATE)
15194 op0 = XEXP (op0, 0);
15195 else if (GET_CODE (op1) == VEC_DUPLICATE)
15196 op1 = XEXP (op1, 0);
15198 if (GET_CODE (op0) == VEC_SELECT)
15199 op0 = XEXP (op0, 0);
15200 else if (GET_CODE (op1) == VEC_SELECT)
15201 op1 = XEXP (op1, 0);
15203 /* If the remaining parameters are not registers,
15204 get the cost to put them into registers. */
15205 *cost += rtx_cost (op0, mode, FMA, 0, speed);
15206 *cost += rtx_cost (op1, mode, FMA, 1, speed);
15207 *cost += rtx_cost (op2, mode, FMA, 2, speed);
15208 return true;
15210 case FLOAT:
15211 case UNSIGNED_FLOAT:
15212 if (speed)
15213 *cost += extra_cost->fp[mode == DFmode].fromint;
15214 return false;
15216 case FLOAT_EXTEND:
15217 if (speed)
15219 if (VECTOR_MODE_P (mode))
15221 /* Vector widening conversion. */
15222 *cost += extra_cost->vect.alu;
15224 else
15225 *cost += extra_cost->fp[mode == DFmode].widen;
15227 return false;
15229 case FLOAT_TRUNCATE:
15230 if (speed)
15232 if (VECTOR_MODE_P (mode))
15234 /* Vector conversion. */
15235 *cost += extra_cost->vect.alu;
15237 else
15238 *cost += extra_cost->fp[mode == DFmode].narrow;
15240 return false;
15242 case FIX:
15243 case UNSIGNED_FIX:
15244 x = XEXP (x, 0);
15245 /* Strip the rounding part. They will all be implemented
15246 by the fcvt* family of instructions anyway. */
15247 if (GET_CODE (x) == UNSPEC)
15249 unsigned int uns_code = XINT (x, 1);
15251 if (uns_code == UNSPEC_FRINTA
15252 || uns_code == UNSPEC_FRINTM
15253 || uns_code == UNSPEC_FRINTN
15254 || uns_code == UNSPEC_FRINTP
15255 || uns_code == UNSPEC_FRINTZ)
15256 x = XVECEXP (x, 0, 0);
15259 if (speed)
15261 if (VECTOR_MODE_P (mode))
15262 *cost += extra_cost->vect.alu;
15263 else
15264 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15267 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15268 fixed-point fcvt. */
15269 if (GET_CODE (x) == MULT
15270 && ((VECTOR_MODE_P (mode)
15271 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15272 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15274 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15275 0, speed);
15276 return true;
15279 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
15280 return true;
15282 case ABS:
15283 if (VECTOR_MODE_P (mode))
15285 /* ABS (vector). */
15286 if (speed)
15287 *cost += extra_cost->vect.alu;
15289 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15291 op0 = XEXP (x, 0);
15293 /* FABD, which is analogous to FADD. */
15294 if (GET_CODE (op0) == MINUS)
15296 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15297 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15298 if (speed)
15299 *cost += extra_cost->fp[mode == DFmode].addsub;
15301 return true;
15303 /* Simple FABS is analogous to FNEG. */
15304 if (speed)
15305 *cost += extra_cost->fp[mode == DFmode].neg;
15307 else
15309 /* Integer ABS will either be split into
15310 two arithmetic instructions, or will be an ABS
15311 (scalar), which we don't model. */
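 /* For example (an informal sketch), a scalar integer abs is typically
    expanded along the lines of
      cmp   w0, 0
      csneg w0, w0, w0, ge
    when the single-instruction form is not available. */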
15312 *cost = COSTS_N_INSNS (2);
15313 if (speed)
15314 *cost += 2 * extra_cost->alu.arith;
15316 return false;
15318 case SMAX:
15319 case SMIN:
15320 if (speed)
15322 if (VECTOR_MODE_P (mode))
15323 *cost += extra_cost->vect.alu;
15324 else
15326 /* FMAXNM/FMINNM/FMAX/FMIN.
15327 TODO: This may not be accurate for all implementations, but
15328 we do not model this in the cost tables. */
15329 *cost += extra_cost->fp[mode == DFmode].addsub;
15332 return false;
15334 case UNSPEC:
15335 /* The floating point round to integer frint* instructions. */
15336 if (aarch64_frint_unspec_p (XINT (x, 1)))
15338 if (speed)
15339 *cost += extra_cost->fp[mode == DFmode].roundint;
15341 return false;
15343 break;
15345 case TRUNCATE:
15347 /* Decompose <su>muldi3_highpart. */
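 /* For example (source-level sketch): the high 64 bits of a 64x64->128-bit
    multiply, i.e. (uint64_t) (((unsigned __int128) a * b) >> 64), match
    this pattern and map to a single UMULH (SMULH for the signed case). */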
15348 if (/* (truncate:DI */
15349 mode == DImode
15350 /* (lshiftrt:TI */
15351 && GET_MODE (XEXP (x, 0)) == TImode
15352 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15353 /* (mult:TI */
15354 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15355 /* (ANY_EXTEND:TI (reg:DI))
15356 (ANY_EXTEND:TI (reg:DI))) */
15357 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15358 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15359 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15360 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15361 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15362 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15363 /* (const_int 64) */
15364 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15365 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15367 /* UMULH/SMULH. */
15368 if (speed)
15369 *cost += extra_cost->mult[mode == DImode].extend;
15370 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15371 mode, MULT, 0, speed);
15372 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15373 mode, MULT, 1, speed);
15374 return true;
15376 break;
15377 case CONST_VECTOR:
15379 /* Load using MOVI/MVNI. */
15380 if (aarch64_simd_valid_immediate (x, NULL))
15381 *cost = extra_cost->vect.movi;
15382 else /* Load using constant pool. */
15383 *cost = extra_cost->ldst.load;
15384 break;
15386 case VEC_CONCAT:
15387 /* Depending on the operation, either DUP or INS.
15388 For now, keep default costing. */
15389 break;
15390 case VEC_DUPLICATE:
15391 /* Load using a DUP. */
15392 *cost = extra_cost->vect.dup;
15393 return false;
15394 case VEC_SELECT:
15396 rtx op0 = XEXP (x, 0);
15397 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15399 /* Cost subreg of 0 as free, otherwise as DUP. */
15400 rtx op1 = XEXP (x, 1);
15401 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15403 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15404 *cost = extra_cost->vect.dup;
15405 else
15406 *cost = extra_cost->vect.extract;
15407 return true;
15409 default:
15410 break;
15413 if (dump_file
15414 && flag_aarch64_verbose_cost)
15415 fprintf (dump_file,
15416 "\nFailed to cost RTX. Assuming default cost.\n");
15418 return true;
15421 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15422 calculated for X. This cost is stored in *COST. Returns true
15423 if the total cost of X was calculated. */
15424 static bool
15425 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15426 int param, int *cost, bool speed)
15428 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15430 if (dump_file
15431 && flag_aarch64_verbose_cost)
15433 print_rtl_single (dump_file, x);
15434 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15435 speed ? "Hot" : "Cold",
15436 *cost, result ? "final" : "partial");
15439 return result;
15442 static int
15443 aarch64_register_move_cost (machine_mode mode,
15444 reg_class_t from_i, reg_class_t to_i)
15446 enum reg_class from = (enum reg_class) from_i;
15447 enum reg_class to = (enum reg_class) to_i;
15448 const struct cpu_regmove_cost *regmove_cost
15449 = aarch64_tune_params.regmove_cost;
15451 /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS. */
15452 if (reg_class_subset_p (to, POINTER_REGS))
15453 to = GENERAL_REGS;
15455 if (reg_class_subset_p (from, POINTER_REGS))
15456 from = GENERAL_REGS;
15458 /* Make RDFFR very expensive. In particular, if we know that the FFR
15459 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15460 as a way of obtaining a PTRUE. */
15461 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15462 && hard_reg_set_subset_p (reg_class_contents[from_i],
15463 reg_class_contents[FFR_REGS]))
15464 return 80;
15466 /* Moving between a GPR and the stack register costs the same as GP2GP. */
15467 if ((from == GENERAL_REGS && to == STACK_REG)
15468 || (to == GENERAL_REGS && from == STACK_REG))
15469 return regmove_cost->GP2GP;
15471 /* To/from the stack register, we move via the GPRs. */
15472 if (to == STACK_REG || from == STACK_REG)
15473 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15474 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15476 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15477 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15478 && known_eq (GET_MODE_SIZE (mode), 16))
15480 /* 128-bit operations on general registers require 2 instructions. */
15481 if (from == GENERAL_REGS && to == GENERAL_REGS)
15482 return regmove_cost->GP2GP * 2;
15483 else if (from == GENERAL_REGS)
15484 return regmove_cost->GP2FP * 2;
15485 else if (to == GENERAL_REGS)
15486 return regmove_cost->FP2GP * 2;
15488 /* When AdvSIMD instructions are disabled it is not possible to move
15489 a 128-bit value directly between Q registers. This is handled in
15490 secondary reload. A general register is used as a scratch to move
15491 the upper DI value and the lower DI value is moved directly,
15492 hence the cost is the sum of three moves. */
15493 if (!TARGET_SIMD && !TARGET_SVE)
15494 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15496 return regmove_cost->FP2FP;
15499 if (from == GENERAL_REGS && to == GENERAL_REGS)
15500 return regmove_cost->GP2GP;
15501 else if (from == GENERAL_REGS)
15502 return regmove_cost->GP2FP;
15503 else if (to == GENERAL_REGS)
15504 return regmove_cost->FP2GP;
15506 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15508 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15509 The cost must be greater than 2 units to indicate that direct
15510 moves aren't possible. */
15511 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15512 + aarch64_tune_params.memmov_cost.store_fp);
15513 return MIN (CEIL (per_vector, 2), 4);
15516 return regmove_cost->FP2FP;
15519 /* Implements TARGET_MEMORY_MOVE_COST. */
15520 static int
15521 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15523 enum reg_class rclass = (enum reg_class) rclass_i;
15524 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15525 ? reg_classes_intersect_p (rclass, PR_REGS)
15526 : reg_class_subset_p (rclass, PR_REGS))
15527 return (in
15528 ? aarch64_tune_params.memmov_cost.load_pred
15529 : aarch64_tune_params.memmov_cost.store_pred);
15531 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15532 ? reg_classes_intersect_p (rclass, FP_REGS)
15533 : reg_class_subset_p (rclass, FP_REGS))
15534 return (in
15535 ? aarch64_tune_params.memmov_cost.load_fp
15536 : aarch64_tune_params.memmov_cost.store_fp);
15538 return (in
15539 ? aarch64_tune_params.memmov_cost.load_int
15540 : aarch64_tune_params.memmov_cost.store_int);
15543 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15544 much more productive here, such as using insn attributes to cost things.
15545 But we don't, not yet.
15547 The main point of this current definition is to make calling insn_cost
15548 on one instruction equivalent to calling seq_cost on a sequence that
15549 contains only that instruction. The default definition would instead
15550 only look at SET_SRCs, ignoring SET_DESTs.
15552 This ensures that, for example, storing a 128-bit zero vector is more
15553 expensive than storing a 128-bit vector register. A move of zero
15554 into a 128-bit vector register followed by multiple stores of that
15555 register is then cheaper than multiple stores of zero (which would
15556 use STP of XZR). This in turn allows STP Qs to be formed. */
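 /* An illustrative consequence: clearing 32 bytes as
      movi v0.4s, #0
      stp  q0, q0, [x0]
    is preferred over repeated STP XZR, XZR pairs once the MOVI has been
    paid for, because stores of a vector register are costed as cheaper
    than stores of the zero constant itself. */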
15557 static int
15558 aarch64_insn_cost (rtx_insn *insn, bool speed)
15560 if (rtx set = single_set (insn))
15561 return set_rtx_cost (set, speed);
15562 return pattern_cost (PATTERN (insn), speed);
15565 /* Implement TARGET_INIT_BUILTINS. */
15566 static void
15567 aarch64_init_builtins ()
15569 aarch64_general_init_builtins ();
15570 aarch64_sve::init_builtins ();
15571 #ifdef SUBTARGET_INIT_BUILTINS
15572 SUBTARGET_INIT_BUILTINS;
15573 #endif
15576 /* Implement TARGET_FOLD_BUILTIN. */
15577 static tree
15578 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15580 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15581 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15582 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15583 switch (code & AARCH64_BUILTIN_CLASS)
15585 case AARCH64_BUILTIN_GENERAL:
15586 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15588 case AARCH64_BUILTIN_SVE:
15589 return NULL_TREE;
15591 gcc_unreachable ();
15594 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15595 static bool
15596 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15598 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15599 tree fndecl = gimple_call_fndecl (stmt);
15600 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15601 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15602 gimple *new_stmt = NULL;
15603 switch (code & AARCH64_BUILTIN_CLASS)
15605 case AARCH64_BUILTIN_GENERAL:
15606 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15607 break;
15609 case AARCH64_BUILTIN_SVE:
15610 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15611 break;
15614 if (!new_stmt)
15615 return false;
15617 gsi_replace (gsi, new_stmt, false);
15618 return true;
15621 /* Implement TARGET_EXPAND_BUILTIN. */
15622 static rtx
15623 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15625 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15626 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15627 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15628 switch (code & AARCH64_BUILTIN_CLASS)
15630 case AARCH64_BUILTIN_GENERAL:
15631 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15633 case AARCH64_BUILTIN_SVE:
15634 return aarch64_sve::expand_builtin (subcode, exp, target);
15636 gcc_unreachable ();
15639 /* Implement TARGET_BUILTIN_DECL. */
15640 static tree
15641 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15643 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15644 switch (code & AARCH64_BUILTIN_CLASS)
15646 case AARCH64_BUILTIN_GENERAL:
15647 return aarch64_general_builtin_decl (subcode, initialize_p);
15649 case AARCH64_BUILTIN_SVE:
15650 return aarch64_sve::builtin_decl (subcode, initialize_p);
15652 gcc_unreachable ();
15655 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15656 to optimize 1.0/sqrt. */
15658 static bool
15659 use_rsqrt_p (machine_mode mode)
15661 return (!flag_trapping_math
15662 && flag_unsafe_math_optimizations
15663 && ((aarch64_tune_params.approx_modes->recip_sqrt
15664 & AARCH64_APPROX_MODE (mode))
15665 || flag_mrecip_low_precision_sqrt));
15668 /* Function to decide when to use the approximate reciprocal square root
15669 builtin. */
15671 static tree
15672 aarch64_builtin_reciprocal (tree fndecl)
15674 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15676 if (!use_rsqrt_p (mode))
15677 return NULL_TREE;
15678 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15679 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15680 switch (code & AARCH64_BUILTIN_CLASS)
15682 case AARCH64_BUILTIN_GENERAL:
15683 return aarch64_general_builtin_rsqrt (subcode);
15685 case AARCH64_BUILTIN_SVE:
15686 return NULL_TREE;
15688 gcc_unreachable ();
15691 /* Emit code to perform the floating-point operation:
15693 DST = SRC1 * SRC2
15695 where all three operands are already known to be registers.
15696 If the operation is an SVE one, PTRUE is a suitable all-true
15697 predicate. */
15699 static void
15700 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15702 if (ptrue)
15703 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15704 dst, ptrue, src1, src2,
15705 gen_int_mode (SVE_RELAXED_GP, SImode)));
15706 else
15707 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15710 /* Emit instruction sequence to compute either the approximate square root
15711 or its approximate reciprocal, depending on the flag RECP, and return
15712 whether the sequence was emitted or not. */
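 /* An informal sketch of the math: the code below performs Newton-Raphson
    iteration for 1/sqrt(d). Starting from the FRSQRTE estimate x0, each
    step computes x_{n+1} = x_n * (3 - d * x_n^2) / 2, with FRSQRTS
    supplying the (3 - a * b) / 2 term. */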
15714 bool
15715 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15717 machine_mode mode = GET_MODE (dst);
15719 if (GET_MODE_INNER (mode) == HFmode)
15721 gcc_assert (!recp);
15722 return false;
15725 if (!recp)
15727 if (!(flag_mlow_precision_sqrt
15728 || (aarch64_tune_params.approx_modes->sqrt
15729 & AARCH64_APPROX_MODE (mode))))
15730 return false;
15732 if (!flag_finite_math_only
15733 || flag_trapping_math
15734 || !flag_unsafe_math_optimizations
15735 || optimize_function_for_size_p (cfun))
15736 return false;
15738 else
15739 /* Caller assumes we cannot fail. */
15740 gcc_assert (use_rsqrt_p (mode));
15742 rtx pg = NULL_RTX;
15743 if (aarch64_sve_mode_p (mode))
15744 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15745 machine_mode mmsk = (VECTOR_MODE_P (mode)
15746 ? related_int_vector_mode (mode).require ()
15747 : int_mode_for_mode (mode).require ());
15748 rtx xmsk = NULL_RTX;
15749 if (!recp)
15751 /* When calculating the approximate square root, compare the
15752 argument with 0.0 and create a mask. */
15753 rtx zero = CONST0_RTX (mode);
15754 if (pg)
15756 xmsk = gen_reg_rtx (GET_MODE (pg));
15757 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15758 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15759 xmsk, pg, hint, src, zero));
15761 else
15763 xmsk = gen_reg_rtx (mmsk);
15764 emit_insn (gen_rtx_SET (xmsk,
15765 gen_rtx_NEG (mmsk,
15766 gen_rtx_EQ (mmsk, src, zero))));
15770 /* Estimate the approximate reciprocal square root. */
15771 rtx xdst = gen_reg_rtx (mode);
15772 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15774 /* Iterate over the series twice for SF and thrice for DF. */
15775 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15777 /* Optionally iterate over the series once less, trading some accuracy
15778 for faster performance. */
15779 if ((recp && flag_mrecip_low_precision_sqrt)
15780 || (!recp && flag_mlow_precision_sqrt))
15781 iterations--;
15783 /* Iterate over the series to calculate the approximate reciprocal square
15784 root. */
15785 rtx x1 = gen_reg_rtx (mode);
15786 while (iterations--)
15788 rtx x2 = gen_reg_rtx (mode);
15789 aarch64_emit_mult (x2, pg, xdst, xdst);
15791 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15793 if (iterations > 0)
15794 aarch64_emit_mult (xdst, pg, xdst, x1);
15797 if (!recp)
15799 if (pg)
15800 /* Multiply nonzero source values by the corresponding intermediate
15801 result elements, so that the final calculation is the approximate
15802 square root rather than its reciprocal. Select a zero result for
15803 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15804 otherwise. */
15805 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15806 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15807 else
15809 /* Qualify the approximate reciprocal square root when the
15810 argument is 0.0 by squashing the intermediate result to 0.0. */
15811 rtx xtmp = gen_reg_rtx (mmsk);
15812 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15813 gen_rtx_SUBREG (mmsk, xdst, 0)));
15814 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15816 /* Calculate the approximate square root. */
15817 aarch64_emit_mult (xdst, pg, xdst, src);
15821 /* Finalize the approximation. */
15822 aarch64_emit_mult (dst, pg, xdst, x1);
15824 return true;
15827 /* Emit the instruction sequence to compute the approximation for the division
15828 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
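 /* An informal sketch of the math: the code below performs Newton-Raphson
    iteration for 1/DEN. Starting from the FRECPE estimate x0, each step
    computes x_{n+1} = x_n * (2 - DEN * x_n), with FRECPS supplying the
    (2 - a * b) term. */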
15830 bool
15831 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15833 machine_mode mode = GET_MODE (quo);
15835 if (GET_MODE_INNER (mode) == HFmode)
15836 return false;
15838 bool use_approx_division_p = (flag_mlow_precision_div
15839 || (aarch64_tune_params.approx_modes->division
15840 & AARCH64_APPROX_MODE (mode)));
15842 if (!flag_finite_math_only
15843 || flag_trapping_math
15844 || !flag_unsafe_math_optimizations
15845 || optimize_function_for_size_p (cfun)
15846 || !use_approx_division_p)
15847 return false;
15849 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15850 return false;
15852 rtx pg = NULL_RTX;
15853 if (aarch64_sve_mode_p (mode))
15854 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15856 /* Estimate the approximate reciprocal. */
15857 rtx xrcp = gen_reg_rtx (mode);
15858 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15860 /* Iterate over the series twice for SF and thrice for DF. */
15861 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15863 /* Optionally iterate over the series fewer times, trading some accuracy
15864 for faster performance. The default is 2 for DF and 1 for SF. */
15865 if (flag_mlow_precision_div)
15866 iterations = (GET_MODE_INNER (mode) == DFmode
15867 ? aarch64_double_recp_precision
15868 : aarch64_float_recp_precision);
15870 /* Iterate over the series to calculate the approximate reciprocal. */
15871 rtx xtmp = gen_reg_rtx (mode);
15872 while (iterations--)
15874 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15876 if (iterations > 0)
15877 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15880 if (num != CONST1_RTX (mode))
15882 /* As the approximate reciprocal of DEN is already calculated, only
15883 calculate the approximate division when NUM is not 1.0. */
15884 rtx xnum = force_reg (mode, num);
15885 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15888 /* Finalize the approximation. */
15889 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15890 return true;
15893 /* Return the number of instructions that can be issued per cycle. */
15894 static int
15895 aarch64_sched_issue_rate (void)
15897 return aarch64_tune_params.issue_rate;
15900 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15901 static int
15902 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15904 if (DEBUG_INSN_P (insn))
15905 return more;
15907 rtx_code code = GET_CODE (PATTERN (insn));
15908 if (code == USE || code == CLOBBER)
15909 return more;
15911 if (get_attr_type (insn) == TYPE_NO_INSN)
15912 return more;
15914 return more - 1;
15917 static int
15918 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15920 int issue_rate = aarch64_sched_issue_rate ();
15922 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15926 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15927 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15928 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15930 static int
15931 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15932 int ready_index)
15934 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15938 /* Vectorizer cost model target hooks. */
15940 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15941 return the decl that should be recorded. Return null otherwise. */
15942 tree
15943 aarch64_vector_load_decl (tree addr)
15945 if (TREE_CODE (addr) != ADDR_EXPR)
15946 return NULL_TREE;
15947 tree base = get_base_address (TREE_OPERAND (addr, 0));
15948 if (TREE_CODE (base) != VAR_DECL)
15949 return NULL_TREE;
15950 return base;
15953 /* Return true if STMT_INFO accesses a decl that is known to be the
15954 argument to a vld1 in the same function. */
15955 static bool
15956 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15958 if (!cfun->machine->vector_load_decls)
15959 return false;
15960 auto dr = STMT_VINFO_DATA_REF (stmt_info);
15961 if (!dr)
15962 return false;
15963 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15964 return decl && cfun->machine->vector_load_decls->contains (decl);
15967 /* Information about how the CPU would issue the scalar, Advanced SIMD
15968 or SVE version of a vector loop, using the scheme defined by the
15969 aarch64_base_vec_issue_info hierarchy of structures. */
15970 class aarch64_vec_op_count
15972 public:
15973 aarch64_vec_op_count () = default;
15974 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15975 unsigned int = 1);
15977 unsigned int vec_flags () const { return m_vec_flags; }
15978 unsigned int vf_factor () const { return m_vf_factor; }
15980 const aarch64_base_vec_issue_info *base_issue_info () const;
15981 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15982 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15984 fractional_cost rename_cycles_per_iter () const;
15985 fractional_cost min_nonpred_cycles_per_iter () const;
15986 fractional_cost min_pred_cycles_per_iter () const;
15987 fractional_cost min_cycles_per_iter () const;
15989 void dump () const;
15991 /* The number of individual "general" operations. See the comments
15992 in aarch64_base_vec_issue_info for details. */
15993 unsigned int general_ops = 0;
15995 /* The number of load and store operations, under the same scheme
15996 as above. */
15997 unsigned int loads = 0;
15998 unsigned int stores = 0;
16000 /* The minimum number of cycles needed to execute all loop-carried
16001 operations, which in the vector code become associated with
16002 reductions. */
16003 unsigned int reduction_latency = 0;
16005 /* The number of individual predicate operations. See the comments
16006 in aarch64_sve_vec_issue_info for details. */
16007 unsigned int pred_ops = 0;
16009 private:
16010 /* The issue information for the core. */
16011 const aarch64_vec_issue_info *m_issue_info = nullptr;
16013 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
16014 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
16015 Advanced SIMD code.
16016 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
16017 SVE code. */
16018 unsigned int m_vec_flags = 0;
16020 /* Assume that, when the code is executing on the core described
16021 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
16022 times more data than the vectorizer anticipates.
16024 This is only ever different from 1 for SVE. It allows us to consider
16025 what would happen on a 256-bit SVE target even when the -mtune
16026 parameters say that the “likely” SVE length is 128 bits. */
16027 unsigned int m_vf_factor = 1;
16030 aarch64_vec_op_count::
16031 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
16032 unsigned int vec_flags, unsigned int vf_factor)
16033 : m_issue_info (issue_info),
16034 m_vec_flags (vec_flags),
16035 m_vf_factor (vf_factor)
16039 /* Return the base issue information (i.e. the parts that make sense
16040 for both scalar and vector code). Return null if we have no issue
16041 information. */
16042 const aarch64_base_vec_issue_info *
16043 aarch64_vec_op_count::base_issue_info () const
16045 if (auto *ret = simd_issue_info ())
16046 return ret;
16047 return m_issue_info->scalar;
16050 /* If the structure describes vector code and we have associated issue
16051 information, return that issue information, otherwise return null. */
16052 const aarch64_simd_vec_issue_info *
16053 aarch64_vec_op_count::simd_issue_info () const
16055 if (auto *ret = sve_issue_info ())
16056 return ret;
16057 if (m_vec_flags)
16058 return m_issue_info->advsimd;
16059 return nullptr;
16062 /* If the structure describes SVE code and we have associated issue
16063 information, return that issue information, otherwise return null. */
16064 const aarch64_sve_vec_issue_info *
16065 aarch64_vec_op_count::sve_issue_info () const
16067 if (m_vec_flags & VEC_ANY_SVE)
16068 return m_issue_info->sve;
16069 return nullptr;
16072 /* Estimate the minimum number of cycles per iteration needed to rename
16073 the instructions.
16075 ??? For now this is done inline rather than via cost tables, since it
16076 isn't clear how it should be parameterized for the general case. */
16077 fractional_cost
16078 aarch64_vec_op_count::rename_cycles_per_iter () const
16080 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
16081 || sve_issue_info () == &neoversen2_sve_issue_info
16082 || sve_issue_info () == &neoversev2_sve_issue_info)
16083 /* + 1 for an addition. We've already counted a general op for each
16084 store, so we don't need to account for stores separately. The branch
16085 reads no registers and so does not need to be counted either.
16087 ??? This value is very much on the pessimistic side, but seems to work
16088 pretty well in practice. */
16089 return { general_ops + loads + pred_ops + 1, 5 };
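 /* As a worked example (the operation counts are illustrative only): with
    6 general ops, 2 loads and 2 predicate ops per iteration, the estimate
    above is (6 + 2 + 2 + 1) / 5 = 2.2 cycles per iteration spent on
    renaming. */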
16091 return 0;
16094 /* Like min_cycles_per_iter, but excluding predicate operations. */
16095 fractional_cost
16096 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
16098 auto *issue_info = base_issue_info ();
16100 fractional_cost cycles = MAX (reduction_latency, 1);
16101 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
16102 cycles = std::max (cycles, { loads + stores,
16103 issue_info->loads_stores_per_cycle });
16104 cycles = std::max (cycles, { general_ops,
16105 issue_info->general_ops_per_cycle });
16106 cycles = std::max (cycles, rename_cycles_per_iter ());
16107 return cycles;
16110 /* Like min_cycles_per_iter, but including only the predicate operations. */
16111 fractional_cost
16112 aarch64_vec_op_count::min_pred_cycles_per_iter () const
16114 if (auto *issue_info = sve_issue_info ())
16115 return { pred_ops, issue_info->pred_ops_per_cycle };
16116 return 0;
16119 /* Estimate the minimum number of cycles needed to issue the operations.
16120 This is a very simplistic model! */
16121 fractional_cost
16122 aarch64_vec_op_count::min_cycles_per_iter () const
16124 return std::max (min_nonpred_cycles_per_iter (),
16125 min_pred_cycles_per_iter ());
16128 /* Dump information about the structure. */
16129 void
16130 aarch64_vec_op_count::dump () const
16132 dump_printf_loc (MSG_NOTE, vect_location,
16133 " load operations = %d\n", loads);
16134 dump_printf_loc (MSG_NOTE, vect_location,
16135 " store operations = %d\n", stores);
16136 dump_printf_loc (MSG_NOTE, vect_location,
16137 " general operations = %d\n", general_ops);
16138 if (sve_issue_info ())
16139 dump_printf_loc (MSG_NOTE, vect_location,
16140 " predicate operations = %d\n", pred_ops);
16141 dump_printf_loc (MSG_NOTE, vect_location,
16142 " reduction latency = %d\n", reduction_latency);
16143 if (auto rcpi = rename_cycles_per_iter ())
16144 dump_printf_loc (MSG_NOTE, vect_location,
16145 " estimated cycles per iteration to rename = %f\n",
16146 rcpi.as_double ());
16147 if (auto pred_cpi = min_pred_cycles_per_iter ())
16149 dump_printf_loc (MSG_NOTE, vect_location,
16150 " estimated min cycles per iteration"
16151 " without predication = %f\n",
16152 min_nonpred_cycles_per_iter ().as_double ());
16153 dump_printf_loc (MSG_NOTE, vect_location,
16154 " estimated min cycles per iteration"
16155 " for predication = %f\n", pred_cpi.as_double ());
16157 if (auto cpi = min_cycles_per_iter ())
16158 dump_printf_loc (MSG_NOTE, vect_location,
16159 " estimated min cycles per iteration = %f\n",
16160 cpi.as_double ());
16163 /* Information about vector code that we're in the process of costing. */
16164 class aarch64_vector_costs : public vector_costs
16166 public:
16167 aarch64_vector_costs (vec_info *, bool);
16169 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
16170 stmt_vec_info stmt_info, slp_tree, tree vectype,
16171 int misalign,
16172 vect_cost_model_location where) override;
16173 void finish_cost (const vector_costs *) override;
16174 bool better_main_loop_than_p (const vector_costs *other) const override;
16176 private:
16177 void record_potential_advsimd_unrolling (loop_vec_info);
16178 void analyze_loop_vinfo (loop_vec_info);
16179 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
16180 aarch64_vec_op_count *);
16181 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
16182 fractional_cost, unsigned int,
16183 unsigned int *, bool *);
16184 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
16185 unsigned int);
16186 bool prefer_unrolled_loop () const;
16187 unsigned int determine_suggested_unroll_factor ();
16189 /* True if we have performed one-time initialization based on the
16190 vec_info. */
16191 bool m_analyzed_vinfo = false;
16193 /* This loop uses an average operation that is not supported by SVE, but is
16194 supported by Advanced SIMD and SVE2. */
16195 bool m_has_avg = false;
16197 /* True if the vector body contains a store to a decl and if the
16198 function is known to have a vld1 from the same decl.
16200 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16201 initializing a vector is:
16203 float f[4] = { elts };
16204 float32x4_t x = vld1q_f32(f);
16206 We should strongly prefer vectorization of the initialization of f,
16207 so that the store to f and the load back can be optimized away,
16208 leaving a vectorization of { elts }. */
16209 bool m_stores_to_vector_load_decl = false;
16211 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16212 In this case the value is the number of insns in the last operation.
16214 On AArch64 vector promotions and demotions require us to first widen or
16215 narrow the input and only after that emit conversion instructions. For
16216 costing this means we need to emit the cost of the final conversions as
16217 well. */
16218 unsigned int m_num_last_promote_demote = 0;
16220 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16221 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16222 SIMD code.
16223 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16224 unsigned int m_vec_flags = 0;
16226 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16227 This means that code such as:
16229 a[0] = x;
16230 a[1] = x;
16232 will be costed as two scalar instructions and two vector instructions
16233 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16234 wins if the costs are equal, because of the fact that the vector costs
16235 include constant initializations whereas the scalar costs don't.
16236 We would therefore tend to vectorize the code above, even though
16237 the scalar version can use a single STP.
16239 We should eventually fix this and model LDP and STP in the main costs;
16240 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16241 Until then, we look specifically for code that does nothing more than
16242 STP-like operations. We cost them on that basis in addition to the
16243 normal latency-based costs.
16245 If the scalar or vector code could be a sequence of STPs +
16246 initialization, this variable counts the cost of the sequence,
16247 with 2 units per instruction. The variable is ~0U for other
16248 kinds of code. */
16249 unsigned int m_stp_sequence_cost = 0;
16251 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16252 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16253 situations, we try to predict whether an Advanced SIMD implementation
16254 of the loop could be completely unrolled and become straight-line code.
16255 If so, it is generally better to use the Advanced SIMD version rather
16256 than length-agnostic SVE, since the SVE loop would execute an unknown
16257 number of times and so could not be completely unrolled in the same way.
16259 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16260 number of Advanced SIMD loop iterations that would be unrolled and
16261 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16262 in the unrolled loop. Both values are zero if we're not applying
16263 the heuristic. */
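/* As an illustration (figures purely illustrative): a loop known to
   execute 32 scalar iterations with an Advanced SIMD VF of 4 would need
   only 8 vector iterations, which the unroller can turn into
   straight-line code, whereas the trip count of the equivalent
   length-agnostic SVE loop is unknown at compile time.  */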
16264 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
16265 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
16267 /* If we're vectorizing a loop that executes a constant number of times,
16268 this variable gives the number of times that the vector loop would
16269 iterate, otherwise it is zero. */
16270 uint64_t m_num_vector_iterations = 0;
16272 /* Used only when vectorizing loops. Estimates the number and kind of
16273 operations that would be needed by one iteration of the scalar
16274 or vector loop. There is one entry for each tuning option of
16275 interest. */
16276 auto_vec<aarch64_vec_op_count, 2> m_ops;
16279 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
16280 bool costing_for_scalar)
16281 : vector_costs (vinfo, costing_for_scalar),
16282 m_vec_flags (costing_for_scalar ? 0
16283 : aarch64_classify_vector_mode (vinfo->vector_mode))
16285 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
16287 m_ops.quick_push ({ issue_info, m_vec_flags });
16288 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16290 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
16291 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
16292 vf_factor });
16297 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
16298 vector_costs *
16299 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
16301 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16304 /* Return true if the current CPU should use the new costs defined
16305 in GCC 11. This should be removed for GCC 12 and above, with the
16306 costs applying to all CPUs instead. */
16307 static bool
16308 aarch64_use_new_vector_costs_p ()
16310 return (aarch64_tune_params.extra_tuning_flags
16311 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16314 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16315 static const simd_vec_cost *
16316 aarch64_simd_vec_costs (tree vectype)
16318 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16319 if (vectype != NULL
16320 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16321 && costs->sve != NULL)
16322 return costs->sve;
16323 return costs->advsimd;
16326 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16327 static const simd_vec_cost *
16328 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16330 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16331 if ((flags & VEC_ANY_SVE) && costs->sve)
16332 return costs->sve;
16333 return costs->advsimd;
16336 /* If STMT_INFO is a memory reference, return the scalar memory type,
16337 otherwise return null. */
16338 static tree
16339 aarch64_dr_type (stmt_vec_info stmt_info)
16341 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16342 return TREE_TYPE (DR_REF (dr));
16343 return NULL_TREE;
16346 /* Decide whether to use the unrolling heuristic described above
16347 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16348 describes the loop that we're vectorizing. */
16349 void
16350 aarch64_vector_costs::
16351 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16353 /* The heuristic only makes sense on targets that have the same
16354 vector throughput for SVE and Advanced SIMD. */
16355 if (!(aarch64_tune_params.extra_tuning_flags
16356 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16357 return;
16359 /* We only want to apply the heuristic if LOOP_VINFO is being
16360 vectorized for SVE. */
16361 if (!(m_vec_flags & VEC_ANY_SVE))
16362 return;
16364 /* Check whether it is possible in principle to use Advanced SIMD
16365 instead. */
16366 if (aarch64_autovec_preference == 2)
16367 return;
16369 /* We don't want to apply the heuristic to outer loops, since it's
16370 harder to track two levels of unrolling. */
16371 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16372 return;
16374 /* Only handle cases in which the number of Advanced SIMD iterations
16375 would be known at compile time but the number of SVE iterations
16376 would not. */
16377 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16378 || aarch64_sve_vg.is_constant ())
16379 return;
16381 /* Guess how many times the Advanced SIMD loop would iterate and make
16382 sure that it is within the complete unrolling limit. Even if the
16383 number of iterations is small enough, the number of statements might
16384 not be, which is why we need to estimate the number of statements too. */
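/* For example (figures purely illustrative): an estimated SVE VQ of 2
   and an SVE costing VF of 8 give an Advanced SIMD VF of CEIL (8, 2) = 4,
   so a loop with 64 known scalar iterations would need 64 / 4 = 16
   unrolled Advanced SIMD iterations, which is then checked against
   param_max_completely_peel_times below.  */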
16385 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16386 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16387 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16388 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16389 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16390 return;
16392 /* Record that we're applying the heuristic and should try to estimate
16393 the number of statements in the Advanced SIMD loop. */
16394 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
16397 /* Do one-time initialization of the aarch64_vector_costs given that we're
16398 costing the loop vectorization described by LOOP_VINFO. */
16399 void
16400 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16402 /* Record the number of times that the vector loop would execute,
16403 if known. */
16404 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16405 auto scalar_niters = max_stmt_executions_int (loop);
16406 if (scalar_niters >= 0)
16408 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16409 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16410 m_num_vector_iterations = scalar_niters / vf;
16411 else
16412 m_num_vector_iterations = CEIL (scalar_niters, vf);
16415 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16416 heuristic described above m_unrolled_advsimd_niters. */
16417 record_potential_advsimd_unrolling (loop_vinfo);
16420 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16421 static int
16422 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16423 tree vectype,
16424 int misalign ATTRIBUTE_UNUSED)
16426 unsigned elements;
16427 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16428 bool fp = false;
16430 if (vectype != NULL)
16431 fp = FLOAT_TYPE_P (vectype);
16433 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16435 switch (type_of_cost)
16437 case scalar_stmt:
16438 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16440 case scalar_load:
16441 return costs->scalar_load_cost;
16443 case scalar_store:
16444 return costs->scalar_store_cost;
16446 case vector_stmt:
16447 return fp ? simd_costs->fp_stmt_cost
16448 : simd_costs->int_stmt_cost;
16450 case vector_load:
16451 return simd_costs->align_load_cost;
16453 case vector_store:
16454 return simd_costs->store_cost;
16456 case vec_to_scalar:
16457 return simd_costs->vec_to_scalar_cost;
16459 case scalar_to_vec:
16460 return simd_costs->scalar_to_vec_cost;
16462 case unaligned_load:
16463 case vector_gather_load:
16464 return simd_costs->unalign_load_cost;
16466 case unaligned_store:
16467 case vector_scatter_store:
16468 return simd_costs->unalign_store_cost;
16470 case cond_branch_taken:
16471 return costs->cond_taken_branch_cost;
16473 case cond_branch_not_taken:
16474 return costs->cond_not_taken_branch_cost;
16476 case vec_perm:
16477 return simd_costs->permute_cost;
16479 case vec_promote_demote:
16480 return fp ? simd_costs->fp_stmt_cost
16481 : simd_costs->int_stmt_cost;
16483 case vec_construct:
16484 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16485 return elements / 2 + 1;
16487 default:
16488 gcc_unreachable ();
16492 /* If an access of kind KIND for STMT_INFO represents one vector of an
16493 LD[234] or ST[234] operation, return the total number of vectors
16494 (2, 3 or 4) involved, otherwise return a value outside that range. */
16495 static int
16496 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16498 if ((kind == vector_load
16499 || kind == unaligned_load
16500 || kind == vector_store
16501 || kind == unaligned_store)
16502 && STMT_VINFO_DATA_REF (stmt_info))
16504 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16505 if (stmt_info
16506 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16507 return DR_GROUP_SIZE (stmt_info);
16509 return 0;
16512 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16513 vectors would produce a series of LDP or STP operations. KIND is the
16514 kind of statement that STMT_INFO represents. */
16515 static bool
16516 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16517 stmt_vec_info stmt_info)
16519 switch (kind)
16521 case vector_load:
16522 case vector_store:
16523 case unaligned_load:
16524 case unaligned_store:
16525 break;
16527 default:
16528 return false;
16531 return is_gimple_assign (stmt_info->stmt);
16534 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16535 or multiply-subtract sequence that might be suitable for fusing into a
16536 single instruction. If VEC_FLAGS is zero, analyze the operation as
16537 a scalar one, otherwise analyze it as an operation on vectors with those
16538 VEC_* flags. */
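/* For instance, the two-statement sequence

     _1 = b_2 * c_3;
     x_4 = a_1 + _1;

   is a candidate for fusing into a single FMLA/MLA (illustrative gimple;
   the SSA names are arbitrary).  */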
16539 static bool
16540 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16541 unsigned int vec_flags)
16543 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16544 if (!assign)
16545 return false;
16546 tree_code code = gimple_assign_rhs_code (assign);
16547 if (code != PLUS_EXPR && code != MINUS_EXPR)
16548 return false;
16550 auto is_mul_result = [&](int i)
16552 tree rhs = gimple_op (assign, i);
16553 /* ??? Should we try to check for a single use as well? */
16554 if (TREE_CODE (rhs) != SSA_NAME)
16555 return false;
16557 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16558 if (!def_stmt_info
16559 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16560 return false;
16561 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16562 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16563 return false;
16565 if (vec_flags & VEC_ADVSIMD)
16567 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16568 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16569 only supports MLA forms, so will require a move if the result
16570 cannot be tied to the accumulator. The most important case in
16571 which this is true is when the accumulator input is invariant. */
16572 rhs = gimple_op (assign, 3 - i);
16573 if (TREE_CODE (rhs) != SSA_NAME)
16574 return false;
16575 def_stmt_info = vinfo->lookup_def (rhs);
16576 if (!def_stmt_info
16577 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
16578 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
16579 return false;
16582 return true;
16585 if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
16586 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16587 multiplication must be on the second operand (to form an FMLS).
16588 But if both operands are multiplications and the second operand
16589 is used more than once, we'll instead negate the second operand
16590 and use it as an accumulator for the first operand. */
16591 return (is_mul_result (2)
16592 && (has_single_use (gimple_assign_rhs2 (assign))
16593 || !is_mul_result (1)));
16595 return is_mul_result (1) || is_mul_result (2);
16598 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16599 expression sequence that might be suitable for fusing into a
16600 single instruction. If VEC_FLAGS is zero, analyze the operation as
16601 a scalar one, otherwise analyze it as an operation on vectors with those
16602 VEC_* flags. */
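/* For example, a sequence such as

     _1 = a_2 < b_3;
     _2 = c_4 < d_5;
     _3 = _1 & _2;

   can implement one of the comparisons as an SVE predicated compare,
   making the AND effectively free (illustrative gimple; the SSA names
   are arbitrary).  */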
16604 static bool
16605 aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
16606 unsigned int vec_flags)
16608 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16609 if (!assign
16610 || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
16611 || !STMT_VINFO_VECTYPE (stmt_info)
16612 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
16613 return false;
16615 for (int i = 1; i < 3; ++i)
16617 tree rhs = gimple_op (assign, i);
16619 if (TREE_CODE (rhs) != SSA_NAME)
16620 continue;
16622 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16623 if (!def_stmt_info
16624 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16625 continue;
16627 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16628 if (!rhs_assign
16629 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
16630 != tcc_comparison)
16631 continue;
16633 if (vec_flags & VEC_ADVSIMD)
16634 return false;
16636 return true;
16638 return false;
16641 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16642 in-loop reduction that SVE supports directly, return its latency in cycles,
16643 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16644 instructions. */
16645 static unsigned int
16646 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16647 stmt_vec_info stmt_info,
16648 const sve_vec_cost *sve_costs)
16650 switch (vect_reduc_type (vinfo, stmt_info))
16652 case EXTRACT_LAST_REDUCTION:
16653 return sve_costs->clast_cost;
16655 case FOLD_LEFT_REDUCTION:
16656 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16658 case E_HFmode:
16659 case E_BFmode:
16660 return sve_costs->fadda_f16_cost;
16662 case E_SFmode:
16663 return sve_costs->fadda_f32_cost;
16665 case E_DFmode:
16666 return sve_costs->fadda_f64_cost;
16668 default:
16669 break;
16671 break;
16674 return 0;
16677 /* STMT_INFO describes a loop-carried operation in the original scalar code
16678 that we are considering implementing as a reduction. Return one of the
16679 following values, depending on VEC_FLAGS:
16681 - If VEC_FLAGS is zero, return the loop carry latency of the original
16682 scalar operation.
16684 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16685 Advanced SIMD implementation.
16687 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16688 SVE implementation. */
16689 static unsigned int
16690 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16691 unsigned int vec_flags)
16693 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16694 const sve_vec_cost *sve_costs = nullptr;
16695 if (vec_flags & VEC_ANY_SVE)
16696 sve_costs = aarch64_tune_params.vec_costs->sve;
16698 /* If the caller is asking for the SVE latency, check for forms of reduction
16699 that only SVE can handle directly. */
16700 if (sve_costs)
16702 unsigned int latency
16703 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16704 if (latency)
16705 return latency;
16708 /* Handle scalar costs. */
16709 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16710 if (vec_flags == 0)
16712 if (is_float)
16713 return vec_costs->scalar_fp_stmt_cost;
16714 return vec_costs->scalar_int_stmt_cost;
16717 /* Otherwise, the loop body just contains normal integer or FP operations,
16718 with a vector reduction outside the loop. */
16719 const simd_vec_cost *simd_costs
16720 = aarch64_simd_vec_costs_for_flags (vec_flags);
16721 if (is_float)
16722 return simd_costs->fp_stmt_cost;
16723 return simd_costs->int_stmt_cost;
16726 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16727 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16728 try to subdivide the target-independent categorization provided by KIND
16729 to get a more accurate cost. */
16730 static fractional_cost
16731 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16732 stmt_vec_info stmt_info,
16733 fractional_cost stmt_cost)
16735 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16736 the extension with the load. */
16737 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16738 return 0;
16740 return stmt_cost;
16743 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16744 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16745 when vectorized would operate on vector type VECTYPE. Try to subdivide
16746 the target-independent categorization provided by KIND to get a more
16747 accurate cost. WHERE specifies where the cost associated with KIND
16748 occurs. */
16749 static fractional_cost
16750 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16751 stmt_vec_info stmt_info, tree vectype,
16752 enum vect_cost_model_location where,
16753 fractional_cost stmt_cost)
16755 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16756 const sve_vec_cost *sve_costs = nullptr;
16757 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16758 sve_costs = aarch64_tune_params.vec_costs->sve;
16760 /* It's generally better to avoid costing inductions, since the induction
16761 will usually be hidden by other operations. This is particularly true
16762 for things like COND_REDUCTIONS. */
16763 if (is_a<gphi *> (stmt_info->stmt))
16764 return 0;
16766 /* Detect cases in which vec_to_scalar is describing the extraction of a
16767 vector element in preparation for a scalar store. The store itself is
16768 costed separately. */
16769 if (vect_is_store_elt_extraction (kind, stmt_info))
16770 return simd_costs->store_elt_extra_cost;
16772 /* Detect SVE gather loads, which are costed as a single scalar_load
16773 for each element. We therefore need to divide the full-instruction
16774 cost by the number of elements in the vector. */
16775 if (kind == scalar_load
16776 && sve_costs
16777 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16779 unsigned int nunits = vect_nunits_for_cost (vectype);
16780 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16781 return { sve_costs->gather_load_x64_cost, nunits };
16782 return { sve_costs->gather_load_x32_cost, nunits };
16785 /* Detect cases in which a scalar_store is really storing one element
16786 in a scatter operation. */
16787 if (kind == scalar_store
16788 && sve_costs
16789 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16790 return sve_costs->scatter_store_elt_cost;
16792 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16793 if (kind == vec_to_scalar
16794 && where == vect_body
16795 && sve_costs)
16797 unsigned int latency
16798 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16799 if (latency)
16800 return latency;
16803 /* Detect cases in which vec_to_scalar represents a single reduction
16804 instruction like FADDP or MAXV. */
16805 if (kind == vec_to_scalar
16806 && where == vect_epilogue
16807 && vect_is_reduction (stmt_info))
16808 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16810 case E_QImode:
16811 return simd_costs->reduc_i8_cost;
16813 case E_HImode:
16814 return simd_costs->reduc_i16_cost;
16816 case E_SImode:
16817 return simd_costs->reduc_i32_cost;
16819 case E_DImode:
16820 return simd_costs->reduc_i64_cost;
16822 case E_HFmode:
16823 case E_BFmode:
16824 return simd_costs->reduc_f16_cost;
16826 case E_SFmode:
16827 return simd_costs->reduc_f32_cost;
16829 case E_DFmode:
16830 return simd_costs->reduc_f64_cost;
16832 default:
16833 break;
16836 /* Otherwise stick with the original categorization. */
16837 return stmt_cost;
16840 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16841 for STMT_INFO, which has cost kind KIND and which when vectorized would
16842 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16843 targets. */
16844 static fractional_cost
16845 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16846 stmt_vec_info stmt_info, tree vectype,
16847 fractional_cost stmt_cost)
16849 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16850 vector register size or number of units. Integer promotions of this
16851 type therefore map to SXT[BHW] or UXT[BHW].
16853 Most loads have extending forms that can do the sign or zero extension
16854 on the fly. Optimistically assume that a load followed by an extension
16855 will fold to this form during combine, and that the extension therefore
16856 comes for free. */
16857 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16858 stmt_cost = 0;
16860 /* For similar reasons, vector_stmt integer truncations are a no-op,
16861 because we can just ignore the unused upper bits of the source. */
16862 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16863 stmt_cost = 0;
16865 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16866 but there are no equivalent instructions for SVE. This means that
16867 (all other things being equal) 128-bit SVE needs twice as many load
16868 and store instructions as Advanced SIMD in order to process vector pairs.
16870 Also, scalar code can often use LDP and STP to access pairs of values,
16871 so it is too simplistic to say that one SVE load or store replaces
16872 VF scalar loads and stores.
16874 Ideally we would account for this in the scalar and Advanced SIMD
16875 costs by making suitable load/store pairs as cheap as a single
16876 load/store. However, that would be a very invasive change and in
16877 practice it tends to stress other parts of the cost model too much.
16878 E.g. stores of scalar constants currently count just a store,
16879 whereas stores of vector constants count a store and a vec_init.
16880 This is an artificial distinction for AArch64, where stores of
16881 nonzero scalar constants need the same kind of register invariant
16882 as vector stores.
16884 An alternative would be to double the cost of any SVE loads and stores
16885 that could be paired in Advanced SIMD (and possibly also paired in
16886 scalar code). But this tends to stress other parts of the cost model
16887 in the same way. It also means that we can fall back to Advanced SIMD
16888 even if full-loop predication would have been useful.
16890 Here we go for a more conservative version: double the costs of SVE
16891 loads and stores if one iteration of the scalar loop processes enough
16892 elements for it to use a whole number of Advanced SIMD LDP or STP
16893 instructions. This makes it very likely that the VF would be 1 for
16894 Advanced SIMD, and so no epilogue should be needed. */
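/* For example (figures purely illustrative): a group of four contiguous
   64-bit elements per scalar iteration covers 256 bits, i.e. exactly two
   Advanced SIMD registers and hence one LDP or STP, so the SVE load or
   store cost is doubled below.  */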
16895 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16897 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16898 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16899 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16900 if (multiple_p (count * elt_bits, 256)
16901 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16902 stmt_cost *= 2;
16905 return stmt_cost;
16908 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16909 and which when vectorized would operate on vector type VECTYPE. Add the
16910 cost of any embedded operations. */
16911 static fractional_cost
16912 aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
16913 stmt_vec_info stmt_info, tree vectype,
16914 unsigned vec_flags, fractional_cost stmt_cost)
16916 if (vectype)
16918 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16920 /* Detect cases in which a vector load or store represents an
16921 LD[234] or ST[234] instruction. */
16922 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16924 case 2:
16925 stmt_cost += simd_costs->ld2_st2_permute_cost;
16926 break;
16928 case 3:
16929 stmt_cost += simd_costs->ld3_st3_permute_cost;
16930 break;
16932 case 4:
16933 stmt_cost += simd_costs->ld4_st4_permute_cost;
16934 break;
16937 gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
16938 if ((kind == scalar_stmt || kind == vector_stmt) && assign)
16940 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
16941 if (!vect_is_reduction (stmt_info)
16942 && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
16943 return 0;
16945 /* For vector boolean ANDs with a compare operand we just need
16946 one insn. */
16947 if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
16948 return 0;
16951 if (kind == vector_stmt || kind == vec_to_scalar)
16952 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16954 if (FLOAT_TYPE_P (cmp_type))
16955 stmt_cost += simd_costs->fp_stmt_cost;
16956 else
16957 stmt_cost += simd_costs->int_stmt_cost;
16961 if (kind == scalar_stmt)
16962 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16964 if (FLOAT_TYPE_P (cmp_type))
16965 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16966 else
16967 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16970 return stmt_cost;
16973 /* Return true if STMT_INFO is part of a reduction that has the form:
16975 r = r op ...;
16976 r = r op ...;
16978 with the single accumulator being read and written multiple times. */
16979 static bool
16980 aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
16982 if (!STMT_VINFO_REDUC_DEF (stmt_info))
16983 return false;
16985 auto reduc_info = info_for_reduction (vinfo, stmt_info);
16986 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
16989 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16990 and they describe an operation in the body of a vector loop. Record issue
16991 information relating to the vector operation in OPS. */
16992 void
16993 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16994 stmt_vec_info stmt_info,
16995 aarch64_vec_op_count *ops)
16997 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16998 if (!base_issue)
16999 return;
17000 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
17001 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
17003 /* Calculate the minimum cycles per iteration imposed by a reduction
17004 operation. */
17005 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17006 && vect_is_reduction (stmt_info))
17008 unsigned int base
17009 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
17010 if (aarch64_force_single_cycle (m_vinfo, stmt_info))
17011 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17012 and then accumulate that, but at the moment the loop-carried
17013 dependency includes all copies. */
17014 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
17015 else
17016 ops->reduction_latency = MAX (ops->reduction_latency, base);
17019 if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))
17021 /* Assume that multiply-adds will become a single operation. */
17022 if (aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
17023 return;
17025 /* Assume that bool AND with compare operands will become a single
17026 operation. */
17027 if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
17028 return;
17032 /* Count the basic operation cost associated with KIND. */
17033 switch (kind)
17035 case cond_branch_taken:
17036 case cond_branch_not_taken:
17037 case vector_gather_load:
17038 case vector_scatter_store:
17039 /* We currently don't expect these to be used in a loop body. */
17040 break;
17042 case vec_perm:
17043 case vec_promote_demote:
17044 case vec_construct:
17045 case vec_to_scalar:
17046 case scalar_to_vec:
17047 case vector_stmt:
17048 case scalar_stmt:
17049 ops->general_ops += count;
17050 break;
17052 case scalar_load:
17053 case vector_load:
17054 case unaligned_load:
17055 ops->loads += count;
17056 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17057 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
17058 break;
17060 case vector_store:
17061 case unaligned_store:
17062 case scalar_store:
17063 ops->stores += count;
17064 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17065 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
17066 break;
17069 /* Add any embedded comparison operations. */
17070 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17071 && vect_embedded_comparison_type (stmt_info))
17072 ops->general_ops += count;
17074 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17075 have only accounted for one. */
17076 if ((kind == vector_stmt || kind == vec_to_scalar)
17077 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
17078 ops->general_ops += count;
17080 /* Count the predicate operations needed by an SVE comparison. */
17081 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
17082 if (tree type = vect_comparison_type (stmt_info))
17084 unsigned int base = (FLOAT_TYPE_P (type)
17085 ? sve_issue->fp_cmp_pred_ops
17086 : sve_issue->int_cmp_pred_ops);
17087 ops->pred_ops += base * count;
17090 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17091 if (simd_issue)
17092 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
17094 case 2:
17095 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
17096 break;
17098 case 3:
17099 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
17100 break;
17102 case 4:
17103 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
17104 break;
17107 /* Add any overhead associated with gather loads and scatter stores. */
17108 if (sve_issue
17109 && (kind == scalar_load || kind == scalar_store)
17110 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
17112 unsigned int pairs = CEIL (count, 2);
17113 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
17114 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
17118 /* Return true if STMT_INFO contains a memory access and if the constant
17119 component of the memory address is aligned to SIZE bytes. */
17120 static bool
17121 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
17122 poly_uint64 size)
17124 if (!STMT_VINFO_DATA_REF (stmt_info))
17125 return false;
17127 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
17128 stmt_info = first_stmt;
17129 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
17130 /* Needed for gathers & scatters, for example. */
17131 if (!constant_offset)
17132 return false;
17134 return multiple_p (wi::to_poly_offset (constant_offset), size);
17137 /* Check if a scalar or vector stmt could be part of a region of code
17138 that does nothing more than store values to memory, in the scalar
17139 case using STP. Return the cost of the stmt if so, counting 2 for
17140 one instruction. Return ~0U otherwise.
17142 The arguments are a subset of those passed to add_stmt_cost. */
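/* For example, the scalar sequence

     a[0] = x;
     a[1] = y;

   would use two STRs that peephole2 can later fuse into a single STP;
   each scalar_store is therefore counted as 1 unit below, giving 2 units
   for the one resulting STP (illustrative example only).  */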
17143 unsigned int
17144 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
17145 stmt_vec_info stmt_info, tree vectype)
17147 /* Code that stores vector constants uses a vector_load to create
17148 the constant. We don't apply the heuristic to that case for two
17149 main reasons:
17151 - At the moment, STPs are only formed via peephole2, and the
17152 constant scalar moves would often come between STRs and so
17153 prevent STP formation.
17155 - The scalar code also has to load the constant somehow, and that
17156 isn't costed. */
17157 switch (kind)
17159 case scalar_to_vec:
17160 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17161 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
17163 case vec_construct:
17164 if (FLOAT_TYPE_P (vectype))
17165 /* Count 1 insn for the maximum number of FP->SIMD INS
17166 instructions. */
17167 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
17169 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17170 maximum number of GPR->SIMD INS instructions. */
17171 return vect_nunits_for_cost (vectype) * 4 * count;
17173 case vector_store:
17174 case unaligned_store:
17175 /* Count 1 insn per vector if we can't form STP Q pairs. */
17176 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17177 return count * 2;
17179 if (stmt_info)
17181 /* Assume we won't be able to use STP if the constant offset
17182 component of the address is misaligned. ??? This could be
17183 removed if we formed STP pairs earlier, rather than relying
17184 on peephole2. */
17185 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
17186 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17187 return count * 2;
17189 return CEIL (count, 2) * 2;
17191 case scalar_store:
17192 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
17194 /* Check for a mode in which STP pairs can be formed. */
17195 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
17196 if (maybe_ne (size, 4) && maybe_ne (size, 8))
17197 return ~0U;
17199 /* Assume we won't be able to use STP if the constant offset
17200 component of the address is misaligned. ??? This could be
17201 removed if we formed STP pairs earlier, rather than relying
17202 on peephole2. */
17203 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17204 return ~0U;
17206 return count;
17208 default:
17209 return ~0U;
17213 unsigned
17214 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
17215 stmt_vec_info stmt_info, slp_tree,
17216 tree vectype, int misalign,
17217 vect_cost_model_location where)
17219 fractional_cost stmt_cost
17220 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
17222 bool in_inner_loop_p = (where == vect_body
17223 && stmt_info
17224 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
17226 /* Do one-time initialization based on the vinfo. */
17227 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17228 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
17230 if (loop_vinfo)
17231 analyze_loop_vinfo (loop_vinfo);
17233 m_analyzed_vinfo = true;
17236 /* Apply the heuristic described above m_stp_sequence_cost. */
17237 if (m_stp_sequence_cost != ~0U)
17239 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
17240 stmt_info, vectype);
17241 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
17244 /* Try to get a more accurate cost by looking at STMT_INFO instead
17245 of just looking at KIND. */
17246 if (stmt_info && aarch64_use_new_vector_costs_p ())
17248 /* If we scalarize a strided store, the vectorizer costs one
17249 vec_to_scalar for each element. However, we can store the first
17250 element using an FP store without a separate extract step. */
17251 if (vect_is_store_elt_extraction (kind, stmt_info))
17252 count -= 1;
17254 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
17255 stmt_info, stmt_cost);
17257 if (vectype && m_vec_flags)
17258 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
17259 stmt_info, vectype,
17260 where, stmt_cost);
17263 /* Do any SVE-specific adjustments to the cost. */
17264 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
17265 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17266 vectype, stmt_cost);
17268 /* Vector promotion and demotion requires us to widen the operation first
17269 and only after that perform the conversion. Unfortunately the mid-end
17270 expects this to be doable as a single operation and doesn't pass on
17271 enough context here for us to tell which operation is happening. To
17272 account for this we count every promote-demote operation twice and if
17273 the previously costed operation was also a promote-demote we reduce
17274 the cost of the currently being costed operation to simulate the final
17275 conversion cost. Note that for SVE we can do better here if the converted
17276 value comes from a load since the widening load would consume the widening
17277 operations. However since we're in stage 3 we can't change the helper
17278 vect_is_extending_load and duplicating the code seems not useful. */
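/* For example (illustrative): two consecutive vec_promote_demote
   FLOAT_EXPRs, each with a COUNT of 1, are charged 2 * 1 - 0 = 2 and
   then 2 * 1 - 1 = 1 operations respectively, per the scheme described
   above.  */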
17279 gassign *assign = NULL;
17280 if (kind == vec_promote_demote
17281 && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
17282 && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
17284 auto new_count = count * 2 - m_num_last_promote_demote;
17285 m_num_last_promote_demote = count;
17286 count = new_count;
17288 else
17289 m_num_last_promote_demote = 0;
17291 if (stmt_info && aarch64_use_new_vector_costs_p ())
17293 /* Account for any extra "embedded" costs that apply additively
17294 to the base cost calculated above. */
17295 stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17296 vectype, m_vec_flags, stmt_cost);
17298 /* If we're recording a nonzero vector loop body cost for the
17299 innermost loop, also estimate the operations that would need
17300 to be issued by all relevant implementations of the loop. */
17301 if (loop_vinfo
17302 && (m_costing_for_scalar || where == vect_body)
17303 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
17304 && stmt_cost != 0)
17305 for (auto &ops : m_ops)
17306 count_ops (count, kind, stmt_info, &ops);
17308 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17309 estimate the number of statements in the unrolled Advanced SIMD
17310 loop. For simplicity, we assume that one iteration of the
17311 Advanced SIMD loop would need the same number of statements
17312 as one iteration of the SVE loop. */
17313 if (where == vect_body && m_unrolled_advsimd_niters)
17314 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
17316 /* Detect the use of an averaging operation. */
17317 gimple *stmt = stmt_info->stmt;
17318 if (is_gimple_call (stmt)
17319 && gimple_call_internal_p (stmt))
17321 switch (gimple_call_internal_fn (stmt))
17323 case IFN_AVG_FLOOR:
17324 case IFN_AVG_CEIL:
17325 m_has_avg = true;
17326 default:
17327 break;
17332 /* If the statement stores to a decl that is known to be the argument
17333 to a vld1 in the same function, ignore the store for costing purposes.
17334 See the comment above m_stores_to_vector_load_decl for more details. */
17335 if (stmt_info
17336 && (kind == vector_store || kind == unaligned_store)
17337 && aarch64_accesses_vector_load_decl_p (stmt_info))
17339 stmt_cost = 0;
17340 m_stores_to_vector_load_decl = true;
17343 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
17346 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17347 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17348 says that we should prefer the Advanced SIMD loop. */
17349 bool
17350 aarch64_vector_costs::prefer_unrolled_loop () const
17352 if (!m_unrolled_advsimd_stmts)
17353 return false;
17355 if (dump_enabled_p ())
17356 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
17357 " unrolled Advanced SIMD loop = "
17358 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
17359 m_unrolled_advsimd_stmts);
17361 /* The balance here is tricky. On the one hand, we can't be sure whether
17362 the code is vectorizable with Advanced SIMD or not. However, even if
17363 it isn't vectorizable with Advanced SIMD, there's a possibility that
17364 the scalar code could also be unrolled. Some of the code might then
17365 benefit from SLP, or from using LDP and STP. We therefore apply
17366 the heuristic regardless of can_use_advsimd_p. */
17367 return (m_unrolled_advsimd_stmts
17368 && (m_unrolled_advsimd_stmts
17369 <= (unsigned int) param_max_completely_peeled_insns));
17372 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
17373 how fast the SVE code can be issued and compare it to the equivalent value
17374 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
17375 also compare it to the issue rate of Advanced SIMD code
17376 (ADVSIMD_CYCLES_PER_ITER).
17378 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17379 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17380 is true if we think the loop body is too expensive. */
17382 fractional_cost
17383 aarch64_vector_costs::
17384 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
17385 fractional_cost scalar_cycles_per_iter,
17386 unsigned int orig_body_cost, unsigned int *body_cost,
17387 bool *should_disparage)
17389 if (dump_enabled_p ())
17390 ops->dump ();
17392 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
17393 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
17395 /* If the scalar version of the loop could issue at least as
17396 quickly as the predicate parts of the SVE loop, make the SVE loop
17397 prohibitively expensive. In this case vectorization is adding an
17398 overhead that the original scalar code didn't have.
17400 This is mostly intended to detect cases in which WHILELOs dominate
17401 for very tight loops, which is something that normal latency-based
17402 costs would not model. Adding this kind of cliffedge would be
17403 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17404 code in the caller handles that case in a more conservative way. */
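/* A typical case (illustrative) is a very tight loop whose SVE form
   needs a WHILELO for each iteration: if the handful of scalar
   instructions could issue in no more cycles than those predicate
   operations, vectorization only adds overhead, so the body cost is
   raised below.  */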
17405 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17406 if (scalar_cycles_per_iter < sve_estimate)
17408 unsigned int min_cost
17409 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17410 if (*body_cost < min_cost)
17412 if (dump_enabled_p ())
17413 dump_printf_loc (MSG_NOTE, vect_location,
17414 "Increasing body cost to %d because the"
17415 " scalar code could issue within the limit"
17416 " imposed by predicate operations\n",
17417 min_cost);
17418 *body_cost = min_cost;
17419 *should_disparage = true;
17423 return sve_cycles_per_iter;
17426 unsigned int
17427 aarch64_vector_costs::determine_suggested_unroll_factor ()
17429 bool sve = m_vec_flags & VEC_ANY_SVE;
17430 /* If we are trying to unroll an Advanced SIMD main loop that contains
17431 an averaging operation that we do not support with SVE and we might use a
17432 predicated epilogue, we need to be conservative and block unrolling as
17433 this might lead to a less optimal loop for the first and only epilogue
17434 using the original loop's vectorization factor.
17435 TODO: Remove this constraint when we add support for multiple epilogue
17436 vectorization. */
17437 if (!sve && !TARGET_SVE2 && m_has_avg)
17438 return 1;
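/* The per-subtune calculation below caps the unroll factor so that the
   unrolled body still fits the issue rates.  For example (figures purely
   illustrative): with a reduction latency of 4, 1 store per iteration
   and 2 stores issuable per cycle, the store check gives
   CEIL (4 * 2, 1) = 8, which is then limited by aarch64_vect_unroll_limit
   and by the load and general-operation checks.  */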
17440 unsigned int max_unroll_factor = 1;
17441 for (auto vec_ops : m_ops)
17443 aarch64_simd_vec_issue_info const *vec_issue
17444 = vec_ops.simd_issue_info ();
17445 if (!vec_issue)
17446 return 1;
17447 /* Limit unroll factor to a value adjustable by the user, the default
17448 value is 4. */
17449 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17450 unsigned int factor
17451 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17452 unsigned int temp;
17454 /* Sanity check, this should never happen. */
17455 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17456 return 1;
17458 /* Check stores. */
17459 if (vec_ops.stores > 0)
17461 temp = CEIL (factor * vec_issue->stores_per_cycle,
17462 vec_ops.stores);
17463 unroll_factor = MIN (unroll_factor, temp);
17466 /* Check loads + stores. */
17467 if (vec_ops.loads > 0)
17469 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17470 vec_ops.loads + vec_ops.stores);
17471 unroll_factor = MIN (unroll_factor, temp);
17474 /* Check general ops. */
17475 if (vec_ops.general_ops > 0)
17477 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17478 vec_ops.general_ops);
17479 unroll_factor = MIN (unroll_factor, temp);
17481 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17484 /* Make sure unroll factor is power of 2. */
17485 return 1 << ceil_log2 (max_unroll_factor);
17488 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17489 and return the new cost. */
17490 unsigned int
17491 aarch64_vector_costs::
17492 adjust_body_cost (loop_vec_info loop_vinfo,
17493 const aarch64_vector_costs *scalar_costs,
17494 unsigned int body_cost)
17496 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17497 return body_cost;
17499 const auto &scalar_ops = scalar_costs->m_ops[0];
17500 const auto &vector_ops = m_ops[0];
17501 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17502 unsigned int orig_body_cost = body_cost;
17503 bool should_disparage = false;
17505 if (dump_enabled_p ())
17506 dump_printf_loc (MSG_NOTE, vect_location,
17507 "Original vector body cost = %d\n", body_cost);
17509 fractional_cost scalar_cycles_per_iter
17510 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17512 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17514 if (dump_enabled_p ())
17516 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17517 dump_printf_loc (MSG_NOTE, vect_location,
17518 "Vector loop iterates at most %wd times\n",
17519 m_num_vector_iterations);
17520 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17521 scalar_ops.dump ();
17522 dump_printf_loc (MSG_NOTE, vect_location,
17523 " estimated cycles per vector iteration"
17524 " (for VF %d) = %f\n",
17525 estimated_vf, scalar_cycles_per_iter.as_double ());
17528 if (vector_ops.sve_issue_info ())
17530 if (dump_enabled_p ())
17531 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17532 vector_cycles_per_iter
17533 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17534 orig_body_cost, &body_cost, &should_disparage);
17536 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17538 /* Also take Neoverse V1 tuning into account, doubling the
17539 scalar and Advanced SIMD estimates to account for the
17540 doubling in SVE vector length. */
17541 if (dump_enabled_p ())
17542 dump_printf_loc (MSG_NOTE, vect_location,
17543 "Neoverse V1 estimate:\n");
17544 auto vf_factor = m_ops[1].vf_factor ();
17545 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17546 orig_body_cost, &body_cost, &should_disparage);
17549 else
17551 if (dump_enabled_p ())
17553 dump_printf_loc (MSG_NOTE, vect_location,
17554 "Vector issue estimate:\n");
17555 vector_ops.dump ();
17559 /* Decide whether to stick to latency-based costs or whether to try to
17560 take issue rates into account. */
17561 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17562 if (m_vec_flags & VEC_ANY_SVE)
17563 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17565 if (m_num_vector_iterations >= 1
17566 && m_num_vector_iterations < threshold)
17568 if (dump_enabled_p ())
17569 dump_printf_loc (MSG_NOTE, vect_location,
17570 "Low iteration count, so using pure latency"
17571 " costs\n");
17573 /* Increase the cost of the vector code if it looks like the scalar code
17574 could issue more quickly. These values are only rough estimates,
17575 so minor differences should only result in minor changes. */
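/* For instance (figures purely illustrative): a vector estimate of 3
   cycles per iteration against a VF-scaled scalar estimate of 2 scales
   the body cost by 3/2 below.  */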
17576 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17578 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17579 scalar_cycles_per_iter);
17580 if (dump_enabled_p ())
17581 dump_printf_loc (MSG_NOTE, vect_location,
17582 "Increasing body cost to %d because scalar code"
17583 " would issue more quickly\n", body_cost);
17585 /* In general, it's expected that the proposed vector code would be able
17586 to issue more quickly than the original scalar code. This should
17587 already be reflected to some extent in the latency-based costs.
17589 However, the latency-based costs effectively assume that the scalar
17590 code and the vector code execute serially, which tends to underplay
17591 one important case: if the real (non-serialized) execution time of
17592 a scalar iteration is dominated by loop-carried dependencies,
17593 and if the vector code is able to reduce both the length of
17594 the loop-carried dependencies *and* the number of cycles needed
17595 to issue the code in general, we can be more confident that the
17596 vector code is an improvement, even if adding the other (non-loop-carried)
17597 latencies tends to hide this saving. We therefore reduce the cost of the
17598 vector loop body in proportion to the saving. */
17599 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17600 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17601 && scalar_cycles_per_iter > vector_cycles_per_iter
17602 && !should_disparage)
17604 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17605 scalar_cycles_per_iter);
17606 if (dump_enabled_p ())
17607 dump_printf_loc (MSG_NOTE, vect_location,
17608 "Decreasing body cost to %d account for smaller"
17609 " reduction latency\n", body_cost);
17612 return body_cost;
17615 void
17616 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17618 /* Record the issue information for any SVE WHILE instructions that the
17619 loop needs. */
17620 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17621 if (!m_ops.is_empty ()
17622 && loop_vinfo
17623 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
17625 unsigned int num_masks = 0;
17626 rgroup_controls *rgm;
17627 unsigned int num_vectors_m1;
17628 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
17629 num_vectors_m1, rgm)
17630 if (rgm->type)
17631 num_masks += num_vectors_m1 + 1;
17632 for (auto &ops : m_ops)
17633 if (auto *issue = ops.sve_issue_info ())
17634 ops.pred_ops += num_masks * issue->while_pred_ops;
17637 auto *scalar_costs
17638 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17639 if (loop_vinfo
17640 && m_vec_flags
17641 && aarch64_use_new_vector_costs_p ())
17643 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17644 m_costs[vect_body]);
17645 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17648 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17649 the scalar code in the event of a tie, since there is more chance
17650 of scalar code being optimized with surrounding operations.
17652 In addition, if the vector body is a simple store to a decl that
17653 is elsewhere loaded using vld1, strongly prefer the vector form,
17654 to the extent of giving the prologue a zero cost. See the comment
17655 above m_stores_to_vector_load_decl for details. */
17656 if (!loop_vinfo
17657 && scalar_costs
17658 && m_stp_sequence_cost != ~0U)
17660 if (m_stores_to_vector_load_decl)
17661 m_costs[vect_prologue] = 0;
17662 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17663 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17666 vector_costs::finish_cost (scalar_costs);
17669 bool
17670 aarch64_vector_costs::
17671 better_main_loop_than_p (const vector_costs *uncast_other) const
17673 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17675 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17676 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17678 if (dump_enabled_p ())
17679 dump_printf_loc (MSG_NOTE, vect_location,
17680 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17681 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17682 vect_vf_for_cost (this_loop_vinfo),
17683 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17684 vect_vf_for_cost (other_loop_vinfo));
17686 /* Apply the unrolling heuristic described above
17687 m_unrolled_advsimd_niters. */
17688 if (bool (m_unrolled_advsimd_stmts)
17689 != bool (other->m_unrolled_advsimd_stmts))
17691 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17692 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17693 if (this_prefer_unrolled != other_prefer_unrolled)
17695 if (dump_enabled_p ())
17696 dump_printf_loc (MSG_NOTE, vect_location,
17697 "Preferring Advanced SIMD loop because"
17698 " it can be unrolled\n");
17699 return other_prefer_unrolled;
17703 for (unsigned int i = 0; i < m_ops.length (); ++i)
17705 if (dump_enabled_p ())
17707 if (i)
17708 dump_printf_loc (MSG_NOTE, vect_location,
17709 "Reconsidering with subtuning %d\n", i);
17710 dump_printf_loc (MSG_NOTE, vect_location,
17711 "Issue info for %s loop:\n",
17712 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17713 this->m_ops[i].dump ();
17714 dump_printf_loc (MSG_NOTE, vect_location,
17715 "Issue info for %s loop:\n",
17716 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17717 other->m_ops[i].dump ();
17720 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17721 * this->m_ops[i].vf_factor ());
17722 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17723 * other->m_ops[i].vf_factor ());
17725 /* If it appears that one loop could process the same amount of data
17726 in fewer cycles, prefer that loop over the other one. */
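/* Cross-multiplying by the other loop's estimated VF makes the two
   quantities directly comparable: cpi_this * vf_other < cpi_other * vf_this
   is equivalent to cpi_this / vf_this < cpi_other / vf_other, i.e. to
   comparing estimated cycles per element.  */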
17727 fractional_cost this_cost
17728 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17729 fractional_cost other_cost
17730 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17731 if (dump_enabled_p ())
17733 dump_printf_loc (MSG_NOTE, vect_location,
17734 "Weighted cycles per iteration of %s loop ~= %f\n",
17735 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17736 this_cost.as_double ());
17737 dump_printf_loc (MSG_NOTE, vect_location,
17738 "Weighted cycles per iteration of %s loop ~= %f\n",
17739 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17740 other_cost.as_double ());
17742 if (this_cost != other_cost)
17744 if (dump_enabled_p ())
17745 dump_printf_loc (MSG_NOTE, vect_location,
17746 "Preferring loop with lower cycles"
17747 " per iteration\n");
17748 return this_cost < other_cost;
17751 /* If the issue rate of SVE code is limited by predicate operations
17752 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17753 and if Advanced SIMD code could issue within the limit imposed
17754 by the predicate operations, the predicate operations are adding an
17755 overhead that the original code didn't have and so we should prefer
17756 the Advanced SIMD version. */
17757 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17758 const aarch64_vec_op_count &b) -> bool
17760 if (a.pred_ops == 0
17761 && (b.min_pred_cycles_per_iter ()
17762 > b.min_nonpred_cycles_per_iter ()))
17764 if (dump_enabled_p ())
17765 dump_printf_loc (MSG_NOTE, vect_location,
17766 "Preferring Advanced SIMD loop since"
17767 " SVE loop is predicate-limited\n");
17768 return true;
17770 return false;
17772 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17773 return true;
17774 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17775 return false;
17778 return vector_costs::better_main_loop_than_p (other);
17781 static void initialize_aarch64_code_model (struct gcc_options *);
17783 /* Parse the TO_PARSE string and put the architecture struct that it
17784 selects into RES and the architectural features into ISA_FLAGS.
17785 Return an aarch_parse_opt_result describing the parse result.
17786 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17787 When the TO_PARSE string contains an invalid extension,
17788 a copy of the string is created and stored in INVALID_EXTENSION. */
17790 static enum aarch_parse_opt_result
17791 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17792 aarch64_feature_flags *isa_flags,
17793 std::string *invalid_extension)
17795 const char *ext;
17796 const struct processor *arch;
17797 size_t len;
17799 ext = strchr (to_parse, '+');
17801 if (ext != NULL)
17802 len = ext - to_parse;
17803 else
17804 len = strlen (to_parse);
17806 if (len == 0)
17807 return AARCH_PARSE_MISSING_ARG;
17810 /* Loop through the list of supported ARCHes to find a match. */
17811 for (arch = all_architectures; arch->name != NULL; arch++)
17813 if (strlen (arch->name) == len
17814 && strncmp (arch->name, to_parse, len) == 0)
17816 auto isa_temp = arch->flags;
17818 if (ext != NULL)
17820 /* TO_PARSE string contains at least one extension. */
17821 enum aarch_parse_opt_result ext_res
17822 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17824 if (ext_res != AARCH_PARSE_OK)
17825 return ext_res;
17827 /* Extension parsing was successful. Confirm the result
17828 arch and ISA flags. */
17829 *res = arch;
17830 *isa_flags = isa_temp;
17831 return AARCH_PARSE_OK;
17835 /* ARCH name not found in list. */
17836 return AARCH_PARSE_INVALID_ARG;
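/* Illustrative example (editor's note, assumed values): for a string such as
   "armv8.2-a+crc", the '+' is found with strchr, the leading "armv8.2-a" is
   matched against all_architectures, and "+crc" is then handed to
   aarch64_parse_extension, with any invalid extension name being copied
   into INVALID_EXTENSION.  */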
17839 /* Parse the TO_PARSE string and put the CPU that it selects into RES
17840 and the architectural features into ISA_FLAGS. Return an aarch_parse_opt_result
17841 describing the parse result. If there is an error parsing, RES and
17842 ISA_FLAGS are left unchanged.
17843 When the TO_PARSE string contains an invalid extension,
17844 a copy of the string is created and stored in INVALID_EXTENSION. */
17846 static enum aarch_parse_opt_result
17847 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17848 aarch64_feature_flags *isa_flags,
17849 std::string *invalid_extension)
17851 const char *ext;
17852 const struct processor *cpu;
17853 size_t len;
17855 ext = strchr (to_parse, '+');
17857 if (ext != NULL)
17858 len = ext - to_parse;
17859 else
17860 len = strlen (to_parse);
17862 if (len == 0)
17863 return AARCH_PARSE_MISSING_ARG;
17866 /* Loop through the list of supported CPUs to find a match. */
17867 for (cpu = all_cores; cpu->name != NULL; cpu++)
17869 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17871 auto isa_temp = cpu->flags;
17873 if (ext != NULL)
17875 /* TO_PARSE string contains at least one extension. */
17876 enum aarch_parse_opt_result ext_res
17877 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17879 if (ext_res != AARCH_PARSE_OK)
17880 return ext_res;
17882 /* Extension parsing was successful. Confirm the result
17883 cpu and ISA flags. */
17884 *res = cpu;
17885 *isa_flags = isa_temp;
17886 return AARCH_PARSE_OK;
17890 /* CPU name not found in list. */
17891 return AARCH_PARSE_INVALID_ARG;
17894 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17895 Return an aarch_parse_opt_result describing the parse result.
17896 If the parsing fails, RES does not change. */
17898 static enum aarch_parse_opt_result
17899 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17901 const struct processor *cpu;
17903 /* Loop through the list of supported CPUs to find a match. */
17904 for (cpu = all_cores; cpu->name != NULL; cpu++)
17906 if (strcmp (cpu->name, to_parse) == 0)
17908 *res = cpu;
17909 return AARCH_PARSE_OK;
17913 /* CPU name not found in list. */
17914 return AARCH_PARSE_INVALID_ARG;
17917 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17918 described in FLAG. If it is, return the index bit for that fusion type.
17919 If not, error (printing OPTION_NAME) and return zero. */
17921 static unsigned int
17922 aarch64_parse_one_option_token (const char *token,
17923 size_t length,
17924 const struct aarch64_flag_desc *flag,
17925 const char *option_name)
17927 for (; flag->name != NULL; flag++)
17929 if (length == strlen (flag->name)
17930 && !strncmp (flag->name, token, length))
17931 return flag->flag;
17934 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17935 return 0;
17938 /* Parse OPTION which is a comma-separated list of flags to enable.
17939 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17940 default state we inherit from the CPU tuning structures. OPTION_NAME
17941 gives the top-level option we are parsing in the -moverride string,
17942 for use in error messages. */
17944 static unsigned int
17945 aarch64_parse_boolean_options (const char *option,
17946 const struct aarch64_flag_desc *flags,
17947 unsigned int initial_state,
17948 const char *option_name)
17950 const char separator = '.';
17951 const char* specs = option;
17952 const char* ntoken = option;
17953 unsigned int found_flags = initial_state;
17955 while ((ntoken = strchr (specs, separator)))
17957 size_t token_length = ntoken - specs;
17958 unsigned token_ops = aarch64_parse_one_option_token (specs,
17959 token_length,
17960 flags,
17961 option_name);
17962 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17963 in the token stream, reset the supported operations. So:
17965 adrp+add.cmp+branch.none.adrp+add
17967 would have the result of turning on only adrp+add fusion. */
17968 if (!token_ops)
17969 found_flags = 0;
17971 found_flags |= token_ops;
17972 specs = ++ntoken;
17975 /* The string ended with a trailing separator (or was empty), so it is ill-formed. */
17976 if (!(*specs))
17978 error ("%qs string ill-formed", option_name);
17979 return 0;
17982 /* We still have one more token to parse. */
17983 size_t token_length = strlen (specs);
17984 unsigned token_ops = aarch64_parse_one_option_token (specs,
17985 token_length,
17986 flags,
17987 option_name);
17988 if (!token_ops)
17989 found_flags = 0;
17991 found_flags |= token_ops;
17992 return found_flags;
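/* Illustrative sketch of a call (not in the sources), reusing the example
   from the comment above:

     unsigned int ops
       = aarch64_parse_boolean_options ("adrp+add.cmp+branch.none.adrp+add",
					aarch64_fusible_pairs, 0, "fuse=");

   The "none" token resets the accumulated flags, so OPS would end up with
   only the adrp+add fusion bit set.  */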
17995 /* Support for overriding instruction fusion. */
17997 static void
17998 aarch64_parse_fuse_string (const char *fuse_string,
17999 struct tune_params *tune)
18001 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
18002 aarch64_fusible_pairs,
18003 tune->fusible_ops,
18004 "fuse=");
18007 /* Support for overriding other tuning flags. */
18009 static void
18010 aarch64_parse_tune_string (const char *tune_string,
18011 struct tune_params *tune)
18013 tune->extra_tuning_flags
18014 = aarch64_parse_boolean_options (tune_string,
18015 aarch64_tuning_flags,
18016 tune->extra_tuning_flags,
18017 "tune=");
18020 /* Parse the sve_width tuning moverride string in TUNE_STRING.
18021 Accept the valid SVE vector widths allowed by
18022 aarch64_sve_vector_bits_enum and use it to override sve_width
18023 in TUNE. */
18025 static void
18026 aarch64_parse_sve_width_string (const char *tune_string,
18027 struct tune_params *tune)
18029 int width = -1;
18031 int n = sscanf (tune_string, "%d", &width);
18032 if (n == EOF)
18034 error ("invalid format for %<sve_width%>");
18035 return;
18037 switch (width)
18039 case SVE_128:
18040 case SVE_256:
18041 case SVE_512:
18042 case SVE_1024:
18043 case SVE_2048:
18044 break;
18045 default:
18046 error ("invalid %<sve_width%> value: %d", width);
18048 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
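/* Illustrative example (editor's note): a width that is not one of the listed
   SVE_* values, say 384, falls through to the default case above and is
   rejected with "invalid sve_width value: 384"; a valid value such as 256 is
   cast directly to aarch64_sve_vector_bits_enum.  */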
18051 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
18052 we understand. If it is, extract the option string and hand it off to
18053 the appropriate parsing function. */
18055 void
18056 aarch64_parse_one_override_token (const char* token,
18057 size_t length,
18058 struct tune_params *tune)
18060 const struct aarch64_tuning_override_function *fn
18061 = aarch64_tuning_override_functions;
18063 const char *option_part = strchr (token, '=');
18064 if (!option_part)
18066 error ("tuning string missing in option (%s)", token);
18067 return;
18070 /* Get the length of the option name. */
18071 length = option_part - token;
18072 /* Skip the '=' to get to the option string. */
18073 option_part++;
18075 for (; fn->name != NULL; fn++)
18077 if (!strncmp (fn->name, token, length))
18079 fn->parse_override (option_part, tune);
18080 return;
18084 error ("unknown tuning option (%s)",token);
18085 return;
18088 /* Validate and clamp the requested TLS size against the code model in OPTS. */
18090 static void
18091 initialize_aarch64_tls_size (struct gcc_options *opts)
18093 if (aarch64_tls_size == 0)
18094 aarch64_tls_size = 24;
18096 switch (opts->x_aarch64_cmodel_var)
18098 case AARCH64_CMODEL_TINY:
18099 /* Both the default and maximum TLS size allowed under tiny are 1M, which
18100 needs two instructions to address, so we clamp the size to 24 bits. */
18101 if (aarch64_tls_size > 24)
18102 aarch64_tls_size = 24;
18103 break;
18104 case AARCH64_CMODEL_SMALL:
18105 /* The maximum TLS size allowed under small is 4G. */
18106 if (aarch64_tls_size > 32)
18107 aarch64_tls_size = 32;
18108 break;
18109 case AARCH64_CMODEL_LARGE:
18110 /* The maximum TLS size allowed under large is 16E.
18111 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset for now. */
18112 if (aarch64_tls_size > 48)
18113 aarch64_tls_size = 48;
18114 break;
18115 default:
18116 gcc_unreachable ();
18119 return;
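/* Illustrative example (editor's note): with the tiny code model, a requested
   TLS size of 32 bits is clamped to 24 bits above, matching the 1M limit that
   can be addressed with two instructions; under the small model the same
   request would be left at 32 bits.  */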
18122 /* Return the CPU corresponding to the enum CPU. */
18124 static const struct processor *
18125 aarch64_get_tune_cpu (enum aarch64_processor cpu)
18127 gcc_assert (cpu != aarch64_none);
18129 return &all_cores[cpu];
18132 /* Return the architecture corresponding to the enum ARCH. */
18134 static const struct processor *
18135 aarch64_get_arch (enum aarch64_arch arch)
18137 gcc_assert (arch != aarch64_no_arch);
18139 return &all_architectures[arch];
18142 /* Parse STRING looking for options in the format:
18143 string :: option:string
18144 option :: name=substring
18145 name :: {a-z}
18146 substring :: defined by option. */
18148 static void
18149 aarch64_parse_override_string (const char* input_string,
18150 struct tune_params* tune)
18152 const char separator = ':';
18153 size_t string_length = strlen (input_string) + 1;
18154 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
18155 char *string = string_root;
18156 strncpy (string, input_string, string_length);
18157 string[string_length - 1] = '\0';
18159 char* ntoken = string;
18161 while ((ntoken = strchr (string, separator)))
18163 size_t token_length = ntoken - string;
18165 /* NUL-terminate this token so it can be parsed as a standalone string. */
18165 *ntoken = '\0';
18166 aarch64_parse_one_override_token (string, token_length, tune);
18167 string = ++ntoken;
18170 /* One last option to parse. */
18171 aarch64_parse_one_override_token (string, strlen (string), tune);
18172 free (string_root);
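/* Illustrative example (hypothetical option string, built only from the
   override names documented in this file): an input such as
   "fuse=adrp+add.cmp+branch:sve_width=256" is split on ':' into two tokens,
   each of which is handed to aarch64_parse_one_override_token.  */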
18175 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18176 are best for a generic target with the currently-enabled architecture
18177 extensions. */
18178 static void
18179 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
18181 /* Neoverse V1 is the only core that is known to benefit from
18182 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18183 point enabling it for SVE2 and above. */
18184 if (TARGET_SVE2)
18185 current_tune.extra_tuning_flags
18186 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
18189 static void
18190 aarch64_override_options_after_change_1 (struct gcc_options *opts)
18192 /* PR 70044: We have to be careful about being called multiple times for the
18193 same function. This means all changes should be repeatable. */
18195 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18196 Disable the frame pointer flag so the mid-end will not use a frame
18197 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18198 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18199 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18200 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
18201 if (opts->x_flag_omit_frame_pointer == 0)
18202 opts->x_flag_omit_frame_pointer = 2;
18204 /* If not optimizing for size, set the default
18205 alignment to what the target wants. */
18206 if (!opts->x_optimize_size)
18208 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
18209 opts->x_str_align_loops = aarch64_tune_params.loop_align;
18210 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
18211 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
18212 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
18213 opts->x_str_align_functions = aarch64_tune_params.function_align;
18216 /* We default to no pc-relative literal loads. */
18218 aarch64_pcrelative_literal_loads = false;
18220 /* If -mpc-relative-literal-loads is set on the command line, this
18221 implies that the user asked for PC relative literal loads. */
18222 if (opts->x_pcrelative_literal_loads == 1)
18223 aarch64_pcrelative_literal_loads = true;
18225 /* In the tiny memory model it makes no sense to disallow PC relative
18226 literal pool loads. */
18227 if (aarch64_cmodel == AARCH64_CMODEL_TINY
18228 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
18229 aarch64_pcrelative_literal_loads = true;
18231 /* When enabling the lower precision Newton series for the square root, also
18232 enable it for the reciprocal square root, since the latter is an
18233 intermediary step for the former. */
18234 if (flag_mlow_precision_sqrt)
18235 flag_mrecip_low_precision_sqrt = true;
18238 /* 'Unpack' the internal tuning structs and update the options
18239 in OPTS. The caller must have set up selected_tune and selected_arch
18240 as all the other target-specific codegen decisions are
18241 derived from them. */
18243 void
18244 aarch64_override_options_internal (struct gcc_options *opts)
18246 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
18247 aarch64_tune_flags = tune->flags;
18248 aarch64_tune = tune->sched_core;
18249 /* Make a copy of the tuning parameters attached to the core, which
18250 we may later overwrite. */
18251 aarch64_tune_params = *(tune->tune);
18252 if (tune->tune == &generic_tunings)
18253 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
18255 if (opts->x_aarch64_override_tune_string)
18256 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
18257 &aarch64_tune_params);
18259 if (opts->x_aarch64_ldp_policy_param)
18260 aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;
18262 if (opts->x_aarch64_stp_policy_param)
18263 aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;
18265 /* This target defaults to strict volatile bitfields. */
18266 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
18267 opts->x_flag_strict_volatile_bitfields = 1;
18269 if (aarch64_stack_protector_guard == SSP_GLOBAL
18270 && opts->x_aarch64_stack_protector_guard_offset_str)
18272 error ("incompatible options %<-mstack-protector-guard=global%> and "
18273 "%<-mstack-protector-guard-offset=%s%>",
18274 aarch64_stack_protector_guard_offset_str);
18277 if (aarch64_stack_protector_guard == SSP_SYSREG
18278 && !(opts->x_aarch64_stack_protector_guard_offset_str
18279 && opts->x_aarch64_stack_protector_guard_reg_str))
18281 error ("both %<-mstack-protector-guard-offset%> and "
18282 "%<-mstack-protector-guard-reg%> must be used "
18283 "with %<-mstack-protector-guard=sysreg%>");
18286 if (opts->x_aarch64_stack_protector_guard_reg_str)
18288 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
18289 error ("specify a system register with a small string length");
18292 if (opts->x_aarch64_stack_protector_guard_offset_str)
18294 char *end;
18295 const char *str = aarch64_stack_protector_guard_offset_str;
18296 errno = 0;
18297 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
18298 if (!*str || *end || errno)
18299 error ("%qs is not a valid offset in %qs", str,
18300 "-mstack-protector-guard-offset=");
18301 aarch64_stack_protector_guard_offset = offs;
18304 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
18305 && !fixed_regs[R18_REGNUM])
18306 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18308 if ((opts->x_aarch64_isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
18309 && !(opts->x_aarch64_isa_flags & AARCH64_FL_SME))
18311 if (opts->x_aarch64_isa_flags & AARCH64_FL_SM_ON)
18312 error ("streaming functions require the ISA extension %qs", "sme");
18313 else
18314 error ("functions with SME state require the ISA extension %qs",
18315 "sme");
18316 inform (input_location, "you can enable %qs using the command-line"
18317 " option %<-march%>, or by using the %<target%>"
18318 " attribute or pragma", "sme");
18319 opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
18320 auto new_flags = (opts->x_aarch64_asm_isa_flags
18321 | feature_deps::SME ().enable);
18322 aarch64_set_asm_isa_flags (opts, new_flags);
18325 initialize_aarch64_code_model (opts);
18326 initialize_aarch64_tls_size (opts);
18327 aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;
18329 int queue_depth = 0;
18330 switch (aarch64_tune_params.autoprefetcher_model)
18332 case tune_params::AUTOPREFETCHER_OFF:
18333 queue_depth = -1;
18334 break;
18335 case tune_params::AUTOPREFETCHER_WEAK:
18336 queue_depth = 0;
18337 break;
18338 case tune_params::AUTOPREFETCHER_STRONG:
18339 queue_depth = max_insn_queue_index + 1;
18340 break;
18341 default:
18342 gcc_unreachable ();
18345 /* We don't mind passing in global_options_set here as we don't use
18346 the *options_set structs anyway. */
18347 SET_OPTION_IF_UNSET (opts, &global_options_set,
18348 param_sched_autopref_queue_depth, queue_depth);
18350 /* Set up parameters to be used in prefetching algorithm. Do not
18351 override the defaults unless we are tuning for a core we have
18352 researched values for. */
18353 if (aarch64_tune_params.prefetch->num_slots > 0)
18354 SET_OPTION_IF_UNSET (opts, &global_options_set,
18355 param_simultaneous_prefetches,
18356 aarch64_tune_params.prefetch->num_slots);
18357 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
18358 SET_OPTION_IF_UNSET (opts, &global_options_set,
18359 param_l1_cache_size,
18360 aarch64_tune_params.prefetch->l1_cache_size);
18361 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18362 SET_OPTION_IF_UNSET (opts, &global_options_set,
18363 param_l1_cache_line_size,
18364 aarch64_tune_params.prefetch->l1_cache_line_size);
18366 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18368 SET_OPTION_IF_UNSET (opts, &global_options_set,
18369 param_destruct_interfere_size,
18370 aarch64_tune_params.prefetch->l1_cache_line_size);
18371 SET_OPTION_IF_UNSET (opts, &global_options_set,
18372 param_construct_interfere_size,
18373 aarch64_tune_params.prefetch->l1_cache_line_size);
18375 else
18377 /* For a generic AArch64 target, cover the current range of cache line
18378 sizes. */
18379 SET_OPTION_IF_UNSET (opts, &global_options_set,
18380 param_destruct_interfere_size,
18381 256);
18382 SET_OPTION_IF_UNSET (opts, &global_options_set,
18383 param_construct_interfere_size,
18384 64);
18387 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
18388 SET_OPTION_IF_UNSET (opts, &global_options_set,
18389 param_l2_cache_size,
18390 aarch64_tune_params.prefetch->l2_cache_size);
18391 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
18392 SET_OPTION_IF_UNSET (opts, &global_options_set,
18393 param_prefetch_dynamic_strides, 0);
18394 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
18395 SET_OPTION_IF_UNSET (opts, &global_options_set,
18396 param_prefetch_minimum_stride,
18397 aarch64_tune_params.prefetch->minimum_stride);
18399 /* Use the alternative scheduling-pressure algorithm by default. */
18400 SET_OPTION_IF_UNSET (opts, &global_options_set,
18401 param_sched_pressure_algorithm,
18402 SCHED_PRESSURE_MODEL);
18404 /* Validate the guard size. */
18405 int guard_size = param_stack_clash_protection_guard_size;
18407 if (guard_size != 12 && guard_size != 16)
18408 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18409 "size. Given value %d (%llu KB) is out of range",
18410 guard_size, (1ULL << guard_size) / 1024ULL);
18412 /* Enforce that interval is the same size as size so the mid-end does the
18413 right thing. */
18414 SET_OPTION_IF_UNSET (opts, &global_options_set,
18415 param_stack_clash_protection_probe_interval,
18416 guard_size);
18418 /* The maybe_set calls won't update the value if the user has explicitly set
18419 one, which means we need to validate that the probing interval and guard size
18420 are equal. */
18421 int probe_interval
18422 = param_stack_clash_protection_probe_interval;
18423 if (guard_size != probe_interval)
18424 error ("stack clash guard size %<%d%> must be equal to probing interval "
18425 "%<%d%>", guard_size, probe_interval);
18427 /* Enable sw prefetching at specified optimization level for
18428 CPUS that have prefetch. Lower optimization level threshold by 1
18429 when profiling is enabled. */
18430 if (opts->x_flag_prefetch_loop_arrays < 0
18431 && !opts->x_optimize_size
18432 && aarch64_tune_params.prefetch->default_opt_level >= 0
18433 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18434 opts->x_flag_prefetch_loop_arrays = 1;
18436 /* Avoid loop-dependent FMA chains. */
18437 if (aarch64_tune_params.extra_tuning_flags
18438 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
18439 SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
18440 512);
18442 /* Consider fully pipelined FMA in reassociation. */
18443 if (aarch64_tune_params.extra_tuning_flags
18444 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
18445 SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
18448 aarch64_override_options_after_change_1 (opts);
18451 /* Print a hint with a suggestion for a core or architecture name that
18452 most closely resembles what the user passed in STR. ARCH is true if
18453 the user is asking for an architecture name. ARCH is false if the user
18454 is asking for a core name. */
18456 static void
18457 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
18459 auto_vec<const char *> candidates;
18460 const struct processor *entry = arch ? all_architectures : all_cores;
18461 for (; entry->name != NULL; entry++)
18462 candidates.safe_push (entry->name);
18464 #ifdef HAVE_LOCAL_CPU_DETECT
18465 /* Also add "native" as a possible value. */
18466 if (arch)
18467 candidates.safe_push ("native");
18468 #endif
18470 char *s;
18471 const char *hint = candidates_list_and_hint (str, s, candidates);
18472 if (hint)
18473 inform (input_location, "valid arguments are: %s;"
18474 " did you mean %qs?", s, hint);
18475 else
18476 inform (input_location, "valid arguments are: %s", s);
18478 XDELETEVEC (s);
18481 /* Print a hint with a suggestion for a core name that most closely resembles
18482 what the user passed in STR. */
18484 inline static void
18485 aarch64_print_hint_for_core (const char *str)
18487 aarch64_print_hint_for_core_or_arch (str, false);
18490 /* Print a hint with a suggestion for an architecture name that most closely
18491 resembles what the user passed in STR. */
18493 inline static void
18494 aarch64_print_hint_for_arch (const char *str)
18496 aarch64_print_hint_for_core_or_arch (str, true);
18500 /* Print a hint with a suggestion for an extension name
18501 that most closely resembles what the user passed in STR. */
18503 void
18504 aarch64_print_hint_for_extensions (const std::string &str)
18506 auto_vec<const char *> candidates;
18507 aarch64_get_all_extension_candidates (&candidates);
18508 char *s;
18509 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18510 if (hint)
18511 inform (input_location, "valid arguments are: %s;"
18512 " did you mean %qs?", s, hint);
18513 else
18514 inform (input_location, "valid arguments are: %s", s);
18516 XDELETEVEC (s);
18519 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18520 specified in STR and throw errors if appropriate. Put the results, if
18521 they are valid, in RES and ISA_FLAGS. Return whether the option is
18522 valid. */
18524 static bool
18525 aarch64_validate_mcpu (const char *str, const struct processor **res,
18526 aarch64_feature_flags *isa_flags)
18528 std::string invalid_extension;
18529 enum aarch_parse_opt_result parse_res
18530 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18532 if (parse_res == AARCH_PARSE_OK)
18533 return true;
18535 switch (parse_res)
18537 case AARCH_PARSE_MISSING_ARG:
18538 error ("missing cpu name in %<-mcpu=%s%>", str);
18539 break;
18540 case AARCH_PARSE_INVALID_ARG:
18541 error ("unknown value %qs for %<-mcpu%>", str);
18542 aarch64_print_hint_for_core (str);
18543 /* A common user error is confusing -march and -mcpu.
18544 If the -mcpu string matches a known architecture then suggest
18545 -march=. */
18546 parse_res = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18547 if (parse_res == AARCH_PARSE_OK)
18548 inform (input_location, "did you mean %<-march=%s%>?", str);
18549 break;
18550 case AARCH_PARSE_INVALID_FEATURE:
18551 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18552 invalid_extension.c_str (), str);
18553 aarch64_print_hint_for_extensions (invalid_extension);
18554 break;
18555 default:
18556 gcc_unreachable ();
18559 return false;
18562 /* Straight line speculation indicators. */
18563 enum aarch64_sls_hardening_type
18565 SLS_NONE = 0,
18566 SLS_RETBR = 1,
18567 SLS_BLR = 2,
18568 SLS_ALL = 3,
18570 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18572 /* Return whether we should mitigate Straight Line Speculation for the RET
18573 and BR instructions. */
18574 bool
18575 aarch64_harden_sls_retbr_p (void)
18577 return aarch64_sls_hardening & SLS_RETBR;
18580 /* Return whether we should mitigate Straight Line Speculation for the BLR
18581 instruction. */
18582 bool
18583 aarch64_harden_sls_blr_p (void)
18585 return aarch64_sls_hardening & SLS_BLR;
18588 /* For now we only allow setting these options globally; in the future we may
18589 allow setting them per function. */
18590 static void
18591 aarch64_validate_sls_mitigation (const char *const_str)
18593 char *token_save = NULL;
18594 char *str = NULL;
18596 if (strcmp (const_str, "none") == 0)
18598 aarch64_sls_hardening = SLS_NONE;
18599 return;
18601 if (strcmp (const_str, "all") == 0)
18603 aarch64_sls_hardening = SLS_ALL;
18604 return;
18607 char *str_root = xstrdup (const_str);
18608 str = strtok_r (str_root, ",", &token_save);
18609 if (!str)
18610 error ("invalid argument given to %<-mharden-sls=%>");
18612 int temp = SLS_NONE;
18613 while (str)
18615 if (strcmp (str, "blr") == 0)
18616 temp |= SLS_BLR;
18617 else if (strcmp (str, "retbr") == 0)
18618 temp |= SLS_RETBR;
18619 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18621 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18622 break;
18624 else
18626 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18627 break;
18629 str = strtok_r (NULL, ",", &token_save);
18631 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18632 free (str_root);
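/* Illustrative example (editor's note): "-mharden-sls=retbr,blr" sets both
   SLS_RETBR and SLS_BLR above, which is equivalent to SLS_ALL, so both
   aarch64_harden_sls_retbr_p and aarch64_harden_sls_blr_p then return
   true.  */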
18635 /* Validate a command-line -march option. Parse the arch and extensions
18636 (if any) specified in STR and throw errors if appropriate. Put the
18637 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18638 option is valid. */
18640 static bool
18641 aarch64_validate_march (const char *str, const struct processor **res,
18642 aarch64_feature_flags *isa_flags)
18644 std::string invalid_extension;
18645 enum aarch_parse_opt_result parse_res
18646 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18648 if (parse_res == AARCH_PARSE_OK)
18649 return true;
18651 switch (parse_res)
18653 case AARCH_PARSE_MISSING_ARG:
18654 error ("missing arch name in %<-march=%s%>", str);
18655 break;
18656 case AARCH_PARSE_INVALID_ARG:
18657 error ("unknown value %qs for %<-march%>", str);
18658 aarch64_print_hint_for_arch (str);
18659 /* A common user error is confusing -march and -mcpu.
18660 If the -march string matches a known CPU suggest -mcpu. */
18661 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18662 if (parse_res == AARCH_PARSE_OK)
18663 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18664 break;
18665 case AARCH_PARSE_INVALID_FEATURE:
18666 error ("invalid feature modifier %qs in %<-march=%s%>",
18667 invalid_extension.c_str (), str);
18668 aarch64_print_hint_for_extensions (invalid_extension);
18669 break;
18670 default:
18671 gcc_unreachable ();
18674 return false;
18677 /* Validate a command-line -mtune option. Parse the cpu
18678 specified in STR and throw errors if appropriate. Put the
18679 result, if it is valid, in RES. Return whether the option is
18680 valid. */
18682 static bool
18683 aarch64_validate_mtune (const char *str, const struct processor **res)
18685 enum aarch_parse_opt_result parse_res
18686 = aarch64_parse_tune (str, res);
18688 if (parse_res == AARCH_PARSE_OK)
18689 return true;
18691 switch (parse_res)
18693 case AARCH_PARSE_MISSING_ARG:
18694 error ("missing cpu name in %<-mtune=%s%>", str);
18695 break;
18696 case AARCH_PARSE_INVALID_ARG:
18697 error ("unknown value %qs for %<-mtune%>", str);
18698 aarch64_print_hint_for_core (str);
18699 break;
18700 default:
18701 gcc_unreachable ();
18703 return false;
18706 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18708 static poly_uint16
18709 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18711 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18712 on big-endian targets, so we would need to forbid subregs that convert
18713 from one to the other. By default a reinterpret sequence would then
18714 involve a store to memory in one mode and a load back in the other.
18715 Even if we optimize that sequence using reverse instructions,
18716 it would still be a significant potential overhead.
18718 For now, it seems better to generate length-agnostic code for that
18719 case instead. */
18720 if (value == SVE_SCALABLE
18721 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18722 return poly_uint16 (2, 2);
18723 else
18724 return (int) value / 64;
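/* Illustrative example (editor's note): -msve-vector-bits=256 maps to
   SVE_256, and 256 / 64 gives VG = 4, i.e. four 64-bit granules per SVE
   vector; SVE_SCALABLE (and SVE_128 on big-endian) instead yields the
   length-agnostic poly_uint16 (2, 2), meaning 2 + 2x granules.  */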
18727 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18728 aarch64_isa_flags accordingly. */
18730 void
18731 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18733 aarch64_set_asm_isa_flags (&global_options, flags);
18736 static void
18737 aarch64_handle_no_branch_protection (void)
18739 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18740 aarch_enable_bti = 0;
18743 static void
18744 aarch64_handle_standard_branch_protection (void)
18746 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18747 aarch64_ra_sign_key = AARCH64_KEY_A;
18748 aarch_enable_bti = 1;
18751 static void
18752 aarch64_handle_pac_ret_protection (void)
18754 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18755 aarch64_ra_sign_key = AARCH64_KEY_A;
18758 static void
18759 aarch64_handle_pac_ret_leaf (void)
18761 aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
18764 static void
18765 aarch64_handle_pac_ret_b_key (void)
18767 aarch64_ra_sign_key = AARCH64_KEY_B;
18770 static void
18771 aarch64_handle_bti_protection (void)
18773 aarch_enable_bti = 1;
18776 static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
18777 { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
18778 { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
18779 { NULL, false, NULL, NULL, 0 }
18782 static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
18784 { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
18785 { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
18786 { "pac-ret", false, aarch64_handle_pac_ret_protection,
18787 aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
18788 { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
18789 { NULL, false, NULL, NULL, 0 }
18792 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18793 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18794 tuning structs. In particular it must set selected_tune and
18795 aarch64_asm_isa_flags that define the available ISA features and tuning
18796 decisions. It must also set selected_arch as this will be used to
18797 output the .arch asm tags for each function. */
18799 static void
18800 aarch64_override_options (void)
18802 aarch64_feature_flags cpu_isa = 0;
18803 aarch64_feature_flags arch_isa = 0;
18804 aarch64_set_asm_isa_flags (0);
18806 const struct processor *cpu = NULL;
18807 const struct processor *arch = NULL;
18808 const struct processor *tune = NULL;
18810 if (aarch64_harden_sls_string)
18811 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18813 if (aarch64_branch_protection_string)
18814 aarch_validate_mbranch_protection (aarch64_branch_protect_types,
18815 aarch64_branch_protection_string,
18816 "-mbranch-protection=");
18818 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18819 If either of -march or -mtune is given, they override their
18820 respective component of -mcpu. */
18821 if (aarch64_cpu_string)
18822 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18824 if (aarch64_arch_string)
18825 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18827 if (aarch64_tune_string)
18828 aarch64_validate_mtune (aarch64_tune_string, &tune);
18830 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18831 SUBTARGET_OVERRIDE_OPTIONS;
18832 #endif
18834 auto isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
18835 if (cpu && arch)
18837 /* If both -mcpu and -march are specified, warn if they are not
18838 feature compatible. Not being feature compatible means that including
18839 the cpu features would end up disabling an architecture feature. In
18840 other words the cpu features need to be a strict superset of the arch
18841 features, and if so we prefer the -march ISA flags. */
18842 auto full_arch_flags = arch->flags | arch_isa;
18843 auto full_cpu_flags = cpu->flags | cpu_isa;
18844 if (~full_cpu_flags & full_arch_flags)
18846 std::string ext_diff
18847 = aarch64_get_extension_string_for_isa_flags (full_arch_flags,
18848 full_cpu_flags);
18849 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18850 "and resulted in options %<%s%> being added",
18851 aarch64_cpu_string,
18852 aarch64_arch_string,
18853 ext_diff.c_str ());
18856 selected_arch = arch->arch;
18857 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18859 else if (cpu)
18861 selected_arch = cpu->arch;
18862 aarch64_set_asm_isa_flags (cpu_isa | isa_mode);
18864 else if (arch)
18866 cpu = &all_cores[arch->ident];
18867 selected_arch = arch->arch;
18868 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18870 else
18872 /* No -mcpu or -march specified, so use the default CPU. */
18873 cpu = &all_cores[TARGET_CPU_DEFAULT];
18874 selected_arch = cpu->arch;
18875 aarch64_set_asm_isa_flags (cpu->flags | isa_mode);
18878 selected_tune = tune ? tune->ident : cpu->ident;
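/* Illustrative summary (editor's note, symbolic names): "-mcpu=X" on its own
   selects X's architecture and tunes for X; adding "-mtune=Y" keeps the
   architecture derived from X but switches selected_tune to Y; adding
   "-march=Z" instead makes Z's ISA flags win, with the warning above if X's
   features are not a superset of Z's.  */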
18880 if (aarch_enable_bti == 2)
18882 #ifdef TARGET_ENABLE_BTI
18883 aarch_enable_bti = 1;
18884 #else
18885 aarch_enable_bti = 0;
18886 #endif
18889 /* Return address signing is currently not supported for ILP32 targets. For
18890 LP64 targets use the configured option in the absence of a command-line
18891 option for -mbranch-protection. */
18892 if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
18894 #ifdef TARGET_ENABLE_PAC_RET
18895 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18896 #else
18897 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18898 #endif
18901 #ifndef HAVE_AS_MABI_OPTION
18902 /* The compiler may have been configured with 2.23.* binutils, which does
18903 not have support for ILP32. */
18904 if (TARGET_ILP32)
18905 error ("assembler does not support %<-mabi=ilp32%>");
18906 #endif
18908 /* Convert -msve-vector-bits to a VG count. */
18909 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18911 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
18912 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18914 /* The pass to insert speculation tracking runs before
18915 shrink-wrapping and the latter does not know how to update the
18916 tracking status. So disable it in this case. */
18917 if (aarch64_track_speculation)
18918 flag_shrink_wrap = 0;
18920 aarch64_override_options_internal (&global_options);
18922 /* Save these options as the default ones in case we push and pop them later
18923 while processing functions with potential target attributes. */
18924 target_option_default_node = target_option_current_node
18925 = build_target_option_node (&global_options, &global_options_set);
18928 /* Implement targetm.override_options_after_change. */
18930 static void
18931 aarch64_override_options_after_change (void)
18933 aarch64_override_options_after_change_1 (&global_options);
18936 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18937 static char *
18938 aarch64_offload_options (void)
18940 if (TARGET_ILP32)
18941 return xstrdup ("-foffload-abi=ilp32");
18942 else
18943 return xstrdup ("-foffload-abi=lp64");
18946 static struct machine_function *
18947 aarch64_init_machine_status (void)
18949 struct machine_function *machine;
18950 machine = ggc_cleared_alloc<machine_function> ();
18951 return machine;
18954 void
18955 aarch64_init_expanders (void)
18957 init_machine_status = aarch64_init_machine_status;
18960 /* Select the final code model in AARCH64_CMODEL, taking PIC and ABI restrictions into account. */
18961 static void
18962 initialize_aarch64_code_model (struct gcc_options *opts)
18964 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18965 switch (opts->x_aarch64_cmodel_var)
18967 case AARCH64_CMODEL_TINY:
18968 if (opts->x_flag_pic)
18969 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18970 break;
18971 case AARCH64_CMODEL_SMALL:
18972 if (opts->x_flag_pic)
18974 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18975 aarch64_cmodel = (flag_pic == 2
18976 ? AARCH64_CMODEL_SMALL_PIC
18977 : AARCH64_CMODEL_SMALL_SPIC);
18978 #else
18979 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18980 #endif
18982 break;
18983 case AARCH64_CMODEL_LARGE:
18984 if (opts->x_flag_pic)
18985 sorry ("code model %qs with %<-f%s%>", "large",
18986 opts->x_flag_pic > 1 ? "PIC" : "pic");
18987 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18988 sorry ("code model %qs not supported in ilp32 mode", "large");
18989 break;
18990 case AARCH64_CMODEL_TINY_PIC:
18991 case AARCH64_CMODEL_SMALL_PIC:
18992 case AARCH64_CMODEL_SMALL_SPIC:
18993 gcc_unreachable ();
18997 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18998 using the information saved in PTR. */
19000 static void
19001 aarch64_option_restore (struct gcc_options *opts,
19002 struct gcc_options * /* opts_set */,
19003 struct cl_target_option * /* ptr */)
19005 aarch64_override_options_internal (opts);
19008 /* Implement TARGET_OPTION_PRINT. */
19010 static void
19011 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
19013 const struct processor *cpu
19014 = aarch64_get_tune_cpu (ptr->x_selected_tune);
19015 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
19016 std::string extension
19017 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
19018 arch->flags);
19020 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
19021 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
19022 arch->name, extension.c_str ());
19025 static GTY(()) tree aarch64_previous_fndecl;
19027 void
19028 aarch64_reset_previous_fndecl (void)
19030 aarch64_previous_fndecl = NULL;
19033 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19034 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19035 make sure optab availability predicates are recomputed when necessary. */
19037 void
19038 aarch64_save_restore_target_globals (tree new_tree)
19040 if (TREE_TARGET_GLOBALS (new_tree))
19041 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
19042 else if (new_tree == target_option_default_node)
19043 restore_target_globals (&default_target_globals);
19044 else
19045 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
19048 /* Return the target_option_node for FNDECL, or the current options
19049 if FNDECL is null. */
19051 static tree
19052 aarch64_fndecl_options (tree fndecl)
19054 if (!fndecl)
19055 return target_option_current_node;
19057 if (tree options = DECL_FUNCTION_SPECIFIC_TARGET (fndecl))
19058 return options;
19060 return target_option_default_node;
19063 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
19064 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
19065 of the function, if such exists. This function may be called multiple
19066 times on a single function so use aarch64_previous_fndecl to avoid
19067 setting up identical state. */
19069 static void
19070 aarch64_set_current_function (tree fndecl)
19072 tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
19073 tree new_tree = aarch64_fndecl_options (fndecl);
19075 auto new_isa_mode = (fndecl
19076 ? aarch64_fndecl_isa_mode (fndecl)
19077 : AARCH64_FL_DEFAULT_ISA_MODE);
19078 auto isa_flags = TREE_TARGET_OPTION (new_tree)->x_aarch64_isa_flags;
19080 static bool reported_zt0_p;
19081 if (!reported_zt0_p
19082 && !(isa_flags & AARCH64_FL_SME2)
19083 && fndecl
19084 && aarch64_fndecl_has_state (fndecl, "zt0"))
19086 error ("functions with %qs state require the ISA extension %qs",
19087 "zt0", "sme2");
19088 inform (input_location, "you can enable %qs using the command-line"
19089 " option %<-march%>, or by using the %<target%>"
19090 " attribute or pragma", "sme2");
19091 reported_zt0_p = true;
19094 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
19095 the default have been handled by aarch64_save_restore_target_globals from
19096 aarch64_pragma_target_parse. */
19097 if (old_tree == new_tree
19098 && (!fndecl || aarch64_previous_fndecl)
19099 && (isa_flags & AARCH64_FL_ISA_MODES) == new_isa_mode)
19101 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19102 return;
19105 aarch64_previous_fndecl = fndecl;
19107 /* First set the target options. */
19108 cl_target_option_restore (&global_options, &global_options_set,
19109 TREE_TARGET_OPTION (new_tree));
19111 /* The ISA mode can vary based on function type attributes and
19112 function declaration attributes. Make sure that the target
19113 options correctly reflect these attributes. */
19114 if ((isa_flags & AARCH64_FL_ISA_MODES) != new_isa_mode)
19116 auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
19117 aarch64_set_asm_isa_flags (base_flags | new_isa_mode);
19119 aarch64_override_options_internal (&global_options);
19120 new_tree = build_target_option_node (&global_options,
19121 &global_options_set);
19122 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;
19124 tree new_optimize = build_optimization_node (&global_options,
19125 &global_options_set);
19126 if (new_optimize != optimization_default_node)
19127 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19130 aarch64_save_restore_target_globals (new_tree);
19132 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19135 /* Enum describing the various ways we can handle attributes.
19136 In many cases we can reuse the generic option handling machinery. */
19138 enum aarch64_attr_opt_type
19140 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
19141 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
19142 aarch64_attr_enum, /* Attribute sets an enum variable. */
19143 aarch64_attr_custom /* Attribute requires a custom handling function. */
19146 /* All the information needed to handle a target attribute.
19147 NAME is the name of the attribute.
19148 ATTR_TYPE specifies the type of behavior of the attribute as described
19149 in the definition of enum aarch64_attr_opt_type.
19150 ALLOW_NEG is true if the attribute supports a "no-" form.
19151 HANDLER is the function that takes the attribute string as an argument.
19152 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
19153 OPT_NUM is the enum specifying the option that the attribute modifies.
19154 This is needed for attributes that mirror the behavior of a command-line
19155 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
19156 aarch64_attr_enum. */
19158 struct aarch64_attribute_info
19160 const char *name;
19161 enum aarch64_attr_opt_type attr_type;
19162 bool allow_neg;
19163 bool (*handler) (const char *);
19164 enum opt_code opt_num;
19167 /* Handle the ARCH_STR argument to the arch= target attribute. */
19169 static bool
19170 aarch64_handle_attr_arch (const char *str)
19172 const struct processor *tmp_arch = NULL;
19173 std::string invalid_extension;
19174 aarch64_feature_flags tmp_flags;
19175 enum aarch_parse_opt_result parse_res
19176 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
19178 if (parse_res == AARCH_PARSE_OK)
19180 gcc_assert (tmp_arch);
19181 selected_arch = tmp_arch->arch;
19182 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19183 return true;
19186 switch (parse_res)
19188 case AARCH_PARSE_MISSING_ARG:
19189 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19190 break;
19191 case AARCH_PARSE_INVALID_ARG:
19192 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
19193 aarch64_print_hint_for_arch (str);
19194 break;
19195 case AARCH_PARSE_INVALID_FEATURE:
19196 error ("invalid feature modifier %s of value %qs in "
19197 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19198 aarch64_print_hint_for_extensions (invalid_extension);
19199 break;
19200 default:
19201 gcc_unreachable ();
19204 return false;
19207 /* Handle the argument CPU_STR to the cpu= target attribute. */
19209 static bool
19210 aarch64_handle_attr_cpu (const char *str)
19212 const struct processor *tmp_cpu = NULL;
19213 std::string invalid_extension;
19214 aarch64_feature_flags tmp_flags;
19215 enum aarch_parse_opt_result parse_res
19216 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
19218 if (parse_res == AARCH_PARSE_OK)
19220 gcc_assert (tmp_cpu);
19221 selected_tune = tmp_cpu->ident;
19222 selected_arch = tmp_cpu->arch;
19223 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19224 return true;
19227 switch (parse_res)
19229 case AARCH_PARSE_MISSING_ARG:
19230 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19231 break;
19232 case AARCH_PARSE_INVALID_ARG:
19233 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
19234 aarch64_print_hint_for_core (str);
19235 break;
19236 case AARCH_PARSE_INVALID_FEATURE:
19237 error ("invalid feature modifier %qs of value %qs in "
19238 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19239 aarch64_print_hint_for_extensions (invalid_extension);
19240 break;
19241 default:
19242 gcc_unreachable ();
19245 return false;
19248 /* Handle the argument STR to the branch-protection= attribute. */
19250 static bool
19251 aarch64_handle_attr_branch_protection (const char* str)
19253 return aarch_validate_mbranch_protection (aarch64_branch_protect_types, str,
19254 "target(\"branch-protection=\")");
19257 /* Handle the argument STR to the tune= target attribute. */
19259 static bool
19260 aarch64_handle_attr_tune (const char *str)
19262 const struct processor *tmp_tune = NULL;
19263 enum aarch_parse_opt_result parse_res
19264 = aarch64_parse_tune (str, &tmp_tune);
19266 if (parse_res == AARCH_PARSE_OK)
19268 gcc_assert (tmp_tune);
19269 selected_tune = tmp_tune->ident;
19270 return true;
19273 switch (parse_res)
19275 case AARCH_PARSE_INVALID_ARG:
19276 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
19277 aarch64_print_hint_for_core (str);
19278 break;
19279 default:
19280 gcc_unreachable ();
19283 return false;
19286 /* Parse an architecture extension target attribute string specified in STR.
19287 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19288 if successful. Update aarch64_isa_flags to reflect the ISA features
19289 modified. */
19291 static bool
19292 aarch64_handle_attr_isa_flags (char *str)
19294 enum aarch_parse_opt_result parse_res;
19295 auto isa_flags = aarch64_asm_isa_flags;
19297 /* We allow "+nothing" in the beginning to clear out all architectural
19298 features if the user wants to handpick specific features. */
19299 if (strncmp ("+nothing", str, 8) == 0)
19301 isa_flags = AARCH64_ISA_MODE;
19302 str += 8;
19305 std::string invalid_extension;
19306 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
19308 if (parse_res == AARCH_PARSE_OK)
19310 aarch64_set_asm_isa_flags (isa_flags);
19311 return true;
19314 switch (parse_res)
19316 case AARCH_PARSE_MISSING_ARG:
19317 error ("missing value in %<target()%> pragma or attribute");
19318 break;
19320 case AARCH_PARSE_INVALID_FEATURE:
19321 error ("invalid feature modifier %qs of value %qs in "
19322 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19323 break;
19325 default:
19326 gcc_unreachable ();
19329 return false;
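/* Illustrative example (editor's note): an attribute string such as
   "+nothing+fp" first resets the feature set to just the ISA mode via the
   "+nothing" handling above and then re-enables the fp extension through
   aarch64_parse_extension.  */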
19332 /* The target attributes that we support. On top of these we also support just
19333 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19334 handled explicitly in aarch64_process_one_target_attr. */
19336 static const struct aarch64_attribute_info aarch64_attributes[] =
19338 { "general-regs-only", aarch64_attr_mask, false, NULL,
19339 OPT_mgeneral_regs_only },
19340 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
19341 OPT_mfix_cortex_a53_835769 },
19342 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
19343 OPT_mfix_cortex_a53_843419 },
19344 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
19345 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
19346 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
19347 OPT_momit_leaf_frame_pointer },
19348 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
19349 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
19350 OPT_march_ },
19351 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
19352 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
19353 OPT_mtune_ },
19354 { "branch-protection", aarch64_attr_custom, false,
19355 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
19356 { "sign-return-address", aarch64_attr_enum, false, NULL,
19357 OPT_msign_return_address_ },
19358 { "outline-atomics", aarch64_attr_bool, true, NULL,
19359 OPT_moutline_atomics},
19360 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
19363 /* Parse ARG_STR, which contains the definition of one target attribute.
19364 Show appropriate errors if any, or return true if the attribute is valid. */
19366 static bool
19367 aarch64_process_one_target_attr (char *arg_str)
19369 bool invert = false;
19371 size_t len = strlen (arg_str);
19373 if (len == 0)
19375 error ("malformed %<target()%> pragma or attribute");
19376 return false;
19379 char *str_to_check = (char *) alloca (len + 1);
19380 strcpy (str_to_check, arg_str);
19382 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19383 It is easier to detect and handle it explicitly here rather than going
19384 through the machinery for the rest of the target attributes in this
19385 function. */
19386 if (*str_to_check == '+')
19387 return aarch64_handle_attr_isa_flags (str_to_check);
19389 if (len > 3 && startswith (str_to_check, "no-"))
19391 invert = true;
19392 str_to_check += 3;
19394 char *arg = strchr (str_to_check, '=');
19396 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19397 and point ARG to "foo". */
19398 if (arg)
19400 *arg = '\0';
19401 arg++;
19403 const struct aarch64_attribute_info *p_attr;
19404 bool found = false;
19405 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
19407 /* If the names don't match up, or the user has given an argument
19408 to an attribute that doesn't accept one, or didn't give an argument
19409 to an attribute that expects one, fail to match. */
19410 if (strcmp (str_to_check, p_attr->name) != 0)
19411 continue;
19413 found = true;
19414 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
19415 || p_attr->attr_type == aarch64_attr_enum;
19417 if (attr_need_arg_p ^ (arg != NULL))
19419 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
19420 return false;
19423 /* If the name matches but the attribute does not allow "no-" versions
19424 then we can't match. */
19425 if (invert && !p_attr->allow_neg)
19427 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
19428 return false;
19431 switch (p_attr->attr_type)
19433 /* Has a custom handler registered.
19434 For example, cpu=, arch=, tune=. */
19435 case aarch64_attr_custom:
19436 gcc_assert (p_attr->handler);
19437 if (!p_attr->handler (arg))
19438 return false;
19439 break;
19441 /* Either set or unset a boolean option. */
19442 case aarch64_attr_bool:
19444 struct cl_decoded_option decoded;
19446 generate_option (p_attr->opt_num, NULL, !invert,
19447 CL_TARGET, &decoded);
19448 aarch64_handle_option (&global_options, &global_options_set,
19449 &decoded, input_location);
19450 break;
19452 /* Set or unset a bit in the target_flags. aarch64_handle_option
19453 should know what mask to apply given the option number. */
19454 case aarch64_attr_mask:
19456 struct cl_decoded_option decoded;
19457 /* We only need to specify the option number.
19458 aarch64_handle_option will know which mask to apply. */
19459 decoded.opt_index = p_attr->opt_num;
19460 decoded.value = !invert;
19461 aarch64_handle_option (&global_options, &global_options_set,
19462 &decoded, input_location);
19463 break;
19465 /* Use the option setting machinery to set an option to an enum. */
19466 case aarch64_attr_enum:
19468 gcc_assert (arg);
19469 bool valid;
19470 int value;
19471 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19472 &value, CL_TARGET);
19473 if (valid)
19475 set_option (&global_options, NULL, p_attr->opt_num, value,
19476 NULL, DK_UNSPECIFIED, input_location,
19477 global_dc);
19479 else
19481 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19483 break;
19485 default:
19486 gcc_unreachable ();
19490 /* If we reached here we either have found an attribute and validated
19491 it or didn't match any. If we matched an attribute but its arguments
19492 were malformed we will have returned false already. */
19493 return found;
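/* Illustrative example (editor's note): for the attribute string
   "no-omit-leaf-frame-pointer", the "no-" prefix sets INVERT, the remainder
   matches the aarch64_attr_bool entry for "omit-leaf-frame-pointer" (which
   allows negation), and generate_option is then called with !INVERT,
   i.e. the option is turned off.  */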
19496 /* Count how many times the character C appears in
19497 NULL-terminated string STR. */
19499 static unsigned int
19500 num_occurences_in_str (char c, char *str)
19502 unsigned int res = 0;
19503 while (*str != '\0')
19505 if (*str == c)
19506 res++;
19508 str++;
19511 return res;
19514 /* Parse the tree in ARGS that contains the target attribute information
19515 and update the global target options space. */
19517 bool
19518 aarch64_process_target_attr (tree args)
19520 if (TREE_CODE (args) == TREE_LIST)
19524 tree head = TREE_VALUE (args);
19525 if (head)
19527 if (!aarch64_process_target_attr (head))
19528 return false;
19530 args = TREE_CHAIN (args);
19531 } while (args);
19533 return true;
19536 if (TREE_CODE (args) != STRING_CST)
19538 error ("attribute %<target%> argument not a string");
19539 return false;
19542 size_t len = strlen (TREE_STRING_POINTER (args));
19543 char *str_to_check = (char *) alloca (len + 1);
19544 strcpy (str_to_check, TREE_STRING_POINTER (args));
19546 if (len == 0)
19548 error ("malformed %<target()%> pragma or attribute");
19549 return false;
19552 /* Used to catch empty entries between commas, e.g.
19553 attribute ((target ("attr1,,attr2"))). */
19554 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19556 /* Handle multiple target attributes separated by ','. */
19557 char *token = strtok_r (str_to_check, ",", &str_to_check);
19559 unsigned int num_attrs = 0;
19560 while (token)
19562 num_attrs++;
19563 if (!aarch64_process_one_target_attr (token))
19565 /* Check if token is possibly an arch extension without
19566 leading '+'. */
19567 aarch64_feature_flags isa_temp = 0;
19568 auto with_plus = std::string ("+") + token;
19569 enum aarch_parse_opt_result ext_res
19570 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19572 if (ext_res == AARCH_PARSE_OK)
19573 error ("arch extension %<%s%> should be prefixed by %<+%>",
19574 token);
19575 else
19576 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19577 return false;
19580 token = strtok_r (NULL, ",", &str_to_check);
19583 if (num_attrs != num_commas + 1)
19585 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19586 return false;
19589 return true;
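/* For illustration, the strings handled above follow the documented AArch64
   "target" attribute syntax: comma-separated tokens such as "arch=<arch>",
   "cpu=<cpu>", "tune=<cpu>" and "+<extension>", e.g.

     __attribute__ ((target ("arch=armv8.2-a+sve,tune=cortex-a75")))
     void f (void);

   A bare extension name must carry its leading '+' (e.g. "+crc"); the
   diagnostic above suggests exactly that when the prefix is missing. */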
19592 static bool aarch64_process_target_version_attr (tree args);
19594 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19595 process attribute ((target ("..."))). */
19597 static bool
19598 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19600 struct cl_target_option cur_target;
19601 bool ret;
19602 tree old_optimize;
19603 tree new_target, new_optimize;
19604 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19606 /* If what we're processing is the current pragma string then the
19607 target option node is already stored in target_option_current_node
19608 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19609 having to re-parse the string. This is especially useful to keep
19610 arm_neon.h compile times down since that header contains a lot
19611 of intrinsics enclosed in pragmas. */
19612 if (!existing_target && args == current_target_pragma)
19614 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19615 return true;
19617 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19619 old_optimize
19620 = build_optimization_node (&global_options, &global_options_set);
19621 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19623 /* If the function changed the optimization levels as well as setting
19624 target options, start with the optimizations specified. */
19625 if (func_optimize && func_optimize != old_optimize)
19626 cl_optimization_restore (&global_options, &global_options_set,
19627 TREE_OPTIMIZATION (func_optimize));
19629 /* Save the current target options to restore at the end. */
19630 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19632 /* If fndecl already has some target attributes applied to it, unpack
19633 them so that we add this attribute on top of them, rather than
19634 overwriting them. */
19635 if (existing_target)
19637 struct cl_target_option *existing_options
19638 = TREE_TARGET_OPTION (existing_target);
19640 if (existing_options)
19641 cl_target_option_restore (&global_options, &global_options_set,
19642 existing_options);
19644 else
19645 cl_target_option_restore (&global_options, &global_options_set,
19646 TREE_TARGET_OPTION (target_option_current_node));
19648 ret = aarch64_process_target_attr (args);
19649 if (ret)
19651 tree version_attr = lookup_attribute ("target_version",
19652 DECL_ATTRIBUTES (fndecl));
19653 if (version_attr != NULL_TREE)
19655 /* Reapply any target_version attribute after target attribute.
19656 This should be equivalent to applying the target_version once
19657 after processing all target attributes. */
19658 tree version_args = TREE_VALUE (version_attr);
19659 ret = aarch64_process_target_version_attr (version_args);
19663 /* Set up any additional state. */
19664 if (ret)
19666 aarch64_override_options_internal (&global_options);
19667 new_target = build_target_option_node (&global_options,
19668 &global_options_set);
19670 else
19671 new_target = NULL;
19673 new_optimize = build_optimization_node (&global_options,
19674 &global_options_set);
19676 if (fndecl && ret)
19678 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19680 if (old_optimize != new_optimize)
19681 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19684 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19686 if (old_optimize != new_optimize)
19687 cl_optimization_restore (&global_options, &global_options_set,
19688 TREE_OPTIMIZATION (old_optimize));
19689 return ret;
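/* Because the code above restores the options from an existing target node
   (or from target_option_current_node) before parsing ARGS, an attribute is
   layered on top of the surrounding state rather than replacing it. For
   example, something like

     #pragma GCC target ("arch=armv8.2-a")
     __attribute__ ((target ("+sve")))
     void g (void);

   should leave g compiled for Armv8.2-A with SVE enabled in addition. */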
19692 typedef unsigned long long aarch64_fmv_feature_mask;
19694 typedef struct
19696 const char *name;
19697 aarch64_fmv_feature_mask feature_mask;
19698 aarch64_feature_flags opt_flags;
19699 } aarch64_fmv_feature_datum;
19701 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19702 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19704 /* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
19705 feature_deps name. */
19706 #define FEAT_RDMA FEAT_RDM
19708 /* FMV features are listed in priority order, to make it easier to sort target
19709 strings. */
19710 static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
19711 #include "config/aarch64/aarch64-option-extensions.def"
19714 /* Parse a function multiversioning feature string STR, as found in a
19715 target_version or target_clones attribute.
19717 If ISA_FLAGS is nonnull, then update it with the specified architecture
19718 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19719 representing the set of features explicitly specified in the feature string.
19720 Return an aarch_parse_opt_result describing the result.
19722 When STR contains an invalid or duplicate extension, a copy of
19723 the extension string is created and stored in INVALID_EXTENSION. */
19725 static enum aarch_parse_opt_result
19726 aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
19727 aarch64_fmv_feature_mask *feature_mask,
19728 std::string *invalid_extension)
19730 if (feature_mask)
19731 *feature_mask = 0ULL;
19733 if (strcmp (str, "default") == 0)
19734 return AARCH_PARSE_OK;
19736 while (str != NULL && *str != 0)
19738 const char *ext;
19739 size_t len;
19741 ext = strchr (str, '+');
19743 if (ext != NULL)
19744 len = ext - str;
19745 else
19746 len = strlen (str);
19748 if (len == 0)
19749 return AARCH_PARSE_MISSING_ARG;
19751 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19752 int i;
19753 for (i = 0; i < num_features; i++)
19755 if (strlen (aarch64_fmv_feature_data[i].name) == len
19756 && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
19758 if (isa_flags)
19759 *isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
19760 if (feature_mask)
19762 auto old_feature_mask = *feature_mask;
19763 *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
19764 if (*feature_mask == old_feature_mask)
19766 /* Duplicate feature. */
19767 if (invalid_extension)
19768 *invalid_extension = std::string (str, len);
19769 return AARCH_PARSE_DUPLICATE_FEATURE;
19772 break;
19776 if (i == num_features)
19778 /* Feature not found in list. */
19779 if (invalid_extension)
19780 *invalid_extension = std::string (str, len);
19781 return AARCH_PARSE_INVALID_FEATURE;
19784 str = ext;
19785 if (str)
19786 /* Skip over the next '+'. */
19787 str++;
19790 return AARCH_PARSE_OK;
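/* As an example of the strings accepted here: "default" selects the default
   version, while other versions list one or more FMV feature names joined
   by '+', as in

     __attribute__ ((target_version ("sve2"))) int f (void);
     __attribute__ ((target_version ("dotprod+fp16"))) int f (void);

   Repeating a feature (e.g. "sve+sve") is reported as a duplicate, and an
   unknown name as an invalid feature, via the callers' diagnostics. */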
19793 /* Parse the tree in ARGS that contains the target_version attribute
19794 information and update the global target options space. */
19796 static bool
19797 aarch64_process_target_version_attr (tree args)
19799 if (TREE_CODE (args) == TREE_LIST)
19801 if (TREE_CHAIN (args))
19803 error ("attribute %<target_version%> has multiple values");
19804 return false;
19806 args = TREE_VALUE (args);
19809 if (!args || TREE_CODE (args) != STRING_CST)
19811 error ("attribute %<target_version%> argument not a string");
19812 return false;
19815 const char *str = TREE_STRING_POINTER (args);
19817 enum aarch_parse_opt_result parse_res;
19818 auto isa_flags = aarch64_asm_isa_flags;
19820 std::string invalid_extension;
19821 parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
19822 &invalid_extension);
19824 if (parse_res == AARCH_PARSE_OK)
19826 aarch64_set_asm_isa_flags (isa_flags);
19827 return true;
19830 switch (parse_res)
19832 case AARCH_PARSE_MISSING_ARG:
19833 error ("missing value in %<target_version%> attribute");
19834 break;
19836 case AARCH_PARSE_INVALID_FEATURE:
19837 error ("invalid feature modifier %qs of value %qs in "
19838 "%<target_version%> attribute", invalid_extension.c_str (),
19839 str);
19840 break;
19842 case AARCH_PARSE_DUPLICATE_FEATURE:
19843 error ("duplicate feature modifier %qs of value %qs in "
19844 "%<target_version%> attribute", invalid_extension.c_str (),
19845 str);
19846 break;
19848 default:
19849 gcc_unreachable ();
19852 return false;
19855 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
19856 process attribute ((target_version ("..."))). */
19858 static bool
19859 aarch64_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
19861 struct cl_target_option cur_target;
19862 bool ret;
19863 tree new_target;
19864 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19866 /* Save the current target options to restore at the end. */
19867 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19869 /* If fndecl already has some target attributes applied to it, unpack
19870 them so that we add this attribute on top of them, rather than
19871 overwriting them. */
19872 if (existing_target)
19874 struct cl_target_option *existing_options
19875 = TREE_TARGET_OPTION (existing_target);
19877 if (existing_options)
19878 cl_target_option_restore (&global_options, &global_options_set,
19879 existing_options);
19881 else
19882 cl_target_option_restore (&global_options, &global_options_set,
19883 TREE_TARGET_OPTION (target_option_current_node));
19885 ret = aarch64_process_target_version_attr (args);
19887 /* Set up any additional state. */
19888 if (ret)
19890 aarch64_override_options_internal (&global_options);
19891 new_target = build_target_option_node (&global_options,
19892 &global_options_set);
19894 else
19895 new_target = NULL;
19897 if (fndecl && ret)
19898 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19900 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19902 return ret;
19905 /* This parses the attribute arguments to target_version in DECL and returns the
19906 feature mask required to select those targets. No adjustments are made to
19907 add or remove redundant feature requirements. */
19909 static aarch64_fmv_feature_mask
19910 get_feature_mask_for_version (tree decl)
19912 tree version_attr = lookup_attribute ("target_version",
19913 DECL_ATTRIBUTES (decl));
19914 if (version_attr == NULL)
19915 return 0;
19917 const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
19918 (version_attr)));
19919 enum aarch_parse_opt_result parse_res;
19920 aarch64_fmv_feature_mask feature_mask;
19922 parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
19923 NULL);
19925 /* We should have detected any errors before getting here. */
19926 gcc_assert (parse_res == AARCH_PARSE_OK);
19928 return feature_mask;
19931 /* Compare priorities of two feature masks. Return:
19932 1: mask1 is higher priority
19933 -1: mask2 is higher priority
19934 0: masks are equal. */
19936 static int
19937 compare_feature_masks (aarch64_fmv_feature_mask mask1,
19938 aarch64_fmv_feature_mask mask2)
19940 int pop1 = popcount_hwi (mask1);
19941 int pop2 = popcount_hwi (mask2);
19942 if (pop1 > pop2)
19943 return 1;
19944 if (pop2 > pop1)
19945 return -1;
19947 auto diff_mask = mask1 ^ mask2;
19948 if (diff_mask == 0ULL)
19949 return 0;
19950 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19951 for (int i = num_features - 1; i >= 0; i--)
19953 auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
19954 if (diff_mask & bit_mask)
19955 return (mask1 & bit_mask) ? 1 : -1;
19957 gcc_unreachable();
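/* For instance, a mask with bits for {sve, sve2} (population count 2)
   outranks a mask with only {crc} (population count 1); when two masks have
   the same population count, the one containing the differing feature that
   appears latest in aarch64_fmv_feature_data (i.e. the highest-priority
   feature) wins. */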
19960 /* Compare priorities of two version decls. */
19962 int
19963 aarch64_compare_version_priority (tree decl1, tree decl2)
19965 auto mask1 = get_feature_mask_for_version (decl1);
19966 auto mask2 = get_feature_mask_for_version (decl2);
19968 return compare_feature_masks (mask1, mask2);
19971 /* Build the struct __ifunc_arg_t type:
19973 struct __ifunc_arg_t
19975 unsigned long _size; // Size of the struct, so it can grow.
19976 unsigned long _hwcap;
19977 unsigned long _hwcap2;
19981 static tree
19982 build_ifunc_arg_type ()
19984 tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
19985 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19986 get_identifier ("_size"),
19987 long_unsigned_type_node);
19988 tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19989 get_identifier ("_hwcap"),
19990 long_unsigned_type_node);
19991 tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19992 get_identifier ("_hwcap2"),
19993 long_unsigned_type_node);
19995 DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
19996 DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
19997 DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
19999 TYPE_FIELDS (ifunc_arg_type) = field1;
20000 DECL_CHAIN (field1) = field2;
20001 DECL_CHAIN (field2) = field3;
20003 layout_type (ifunc_arg_type);
20005 tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
20006 tree pointer_type = build_pointer_type (const_type);
20008 return pointer_type;
20011 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20012 suffixes. */
20014 tree
20015 aarch64_mangle_decl_assembler_name (tree decl, tree id)
20017 /* For function version, add the target suffix to the assembler name. */
20018 if (TREE_CODE (decl) == FUNCTION_DECL
20019 && DECL_FUNCTION_VERSIONED (decl))
20021 aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);
20023 std::string name = IDENTIFIER_POINTER (id);
20025 /* For the default version, append ".default". */
20026 if (feature_mask == 0ULL)
20028 name += ".default";
20029 return get_identifier (name.c_str());
20032 name += "._";
20034 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
20035 for (int i = 0; i < num_features; i++)
20037 if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
20039 name += "M";
20040 name += aarch64_fmv_feature_data[i].name;
20044 if (DECL_ASSEMBLER_NAME_SET_P (decl))
20045 SET_DECL_RTL (decl, NULL);
20047 id = get_identifier (name.c_str());
20049 return id;
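/* To illustrate the scheme implemented above: for a versioned function whose
   base assembler name is "foo", the default version becomes "foo.default",
   while a version declared with target_version ("sve") becomes "foo._Msve".
   When several features are required, each one contributes an "M<name>"
   chunk, emitted in the order the features appear in
   aarch64_fmv_feature_data rather than in the attribute string. */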
20052 /* Return an identifier formed by appending SUFFIX to the base assembler
20053 name of a versioned function. The base name is the default version's
20054 assembler name with any existing ".default" suffix stripped off. */
20056 static tree
20057 get_suffixed_assembler_name (tree default_decl, const char *suffix)
20059 std::string name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl));
20061 auto size = name.size ();
20062 if (size >= 8 && name.compare (size - 8, 8, ".default") == 0)
20063 name.resize (size - 8);
20064 name += suffix;
20065 return get_identifier (name.c_str());
20068 /* Make the resolver function decl to dispatch the versions of
20069 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20070 ifunc alias that will point to the created resolver. Create an
20071 empty basic block in the resolver and store the pointer in
20072 EMPTY_BB. Return the decl of the resolver function. */
20074 static tree
20075 make_resolver_func (const tree default_decl,
20076 const tree ifunc_alias_decl,
20077 basic_block *empty_bb)
20079 tree decl, type, t;
20081 /* Create resolver function name based on default_decl. We need to remove an
20082 existing ".default" suffix if this has already been appended. */
20083 tree decl_name = get_suffixed_assembler_name (default_decl, ".resolver");
20084 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
20086 /* The resolver function should have signature
20087 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20088 type = build_function_type_list (ptr_type_node,
20089 uint64_type_node,
20090 build_ifunc_arg_type (),
20091 NULL_TREE);
20093 decl = build_fn_decl (resolver_name, type);
20094 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
20096 DECL_NAME (decl) = decl_name;
20097 TREE_USED (decl) = 1;
20098 DECL_ARTIFICIAL (decl) = 1;
20099 DECL_IGNORED_P (decl) = 1;
20100 TREE_PUBLIC (decl) = 0;
20101 DECL_UNINLINABLE (decl) = 1;
20103 /* Resolver is not external, body is generated. */
20104 DECL_EXTERNAL (decl) = 0;
20105 DECL_EXTERNAL (ifunc_alias_decl) = 0;
20107 DECL_CONTEXT (decl) = NULL_TREE;
20108 DECL_INITIAL (decl) = make_node (BLOCK);
20109 DECL_STATIC_CONSTRUCTOR (decl) = 0;
20111 if (DECL_COMDAT_GROUP (default_decl)
20112 || TREE_PUBLIC (default_decl))
20114 /* In this case, each translation unit with a call to this
20115 versioned function will put out a resolver. Ensure it
20116 is comdat to keep just one copy. */
20117 DECL_COMDAT (decl) = 1;
20118 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
20120 else
20121 TREE_PUBLIC (ifunc_alias_decl) = 0;
20123 /* Build result decl and add to function_decl. */
20124 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
20125 DECL_CONTEXT (t) = decl;
20126 DECL_ARTIFICIAL (t) = 1;
20127 DECL_IGNORED_P (t) = 1;
20128 DECL_RESULT (decl) = t;
20130 /* Build parameter decls and add to function_decl. */
20131 tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20132 get_identifier ("hwcap"),
20133 uint64_type_node);
20134 tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20135 get_identifier ("arg"),
20136 build_ifunc_arg_type());
20137 DECL_CONTEXT (arg1) = decl;
20138 DECL_CONTEXT (arg2) = decl;
20139 DECL_ARTIFICIAL (arg1) = 1;
20140 DECL_ARTIFICIAL (arg2) = 1;
20141 DECL_IGNORED_P (arg1) = 1;
20142 DECL_IGNORED_P (arg2) = 1;
20143 DECL_ARG_TYPE (arg1) = uint64_type_node;
20144 DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
20145 DECL_ARGUMENTS (decl) = arg1;
20146 TREE_CHAIN (arg1) = arg2;
20148 gimplify_function_tree (decl);
20149 push_cfun (DECL_STRUCT_FUNCTION (decl));
20150 *empty_bb = init_lowered_empty_function (decl, false,
20151 profile_count::uninitialized ());
20153 cgraph_node::add_new_function (decl, true);
20154 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
20156 pop_cfun ();
20158 gcc_assert (ifunc_alias_decl != NULL);
20159 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20160 DECL_ATTRIBUTES (ifunc_alias_decl)
20161 = make_attribute ("ifunc", resolver_name,
20162 DECL_ATTRIBUTES (ifunc_alias_decl));
20164 /* Create the alias for dispatch to resolver here. */
20165 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
20166 return decl;
20169 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20170 to return a pointer to VERSION_DECL if none of the feature bits specified in
20171 FEATURE_MASK are set in MASK_VAR. This function will be called during
20172 version dispatch to decide which function version to execute. It returns
20173 the basic block at the end, to which more conditions can be added. */
20174 static basic_block
20175 add_condition_to_bb (tree function_decl, tree version_decl,
20176 aarch64_fmv_feature_mask feature_mask,
20177 tree mask_var, basic_block new_bb)
20179 gimple *return_stmt;
20180 tree convert_expr, result_var;
20181 gimple *convert_stmt;
20182 gimple *if_else_stmt;
20184 basic_block bb1, bb2, bb3;
20185 edge e12, e23;
20187 gimple_seq gseq;
20189 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
20191 gcc_assert (new_bb != NULL);
20192 gseq = bb_seq (new_bb);
20194 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
20195 build_fold_addr_expr (version_decl));
20196 result_var = create_tmp_var (ptr_type_node);
20197 convert_stmt = gimple_build_assign (result_var, convert_expr);
20198 return_stmt = gimple_build_return (result_var);
20200 if (feature_mask == 0ULL)
20202 /* Default version. */
20203 gimple_seq_add_stmt (&gseq, convert_stmt);
20204 gimple_seq_add_stmt (&gseq, return_stmt);
20205 set_bb_seq (new_bb, gseq);
20206 gimple_set_bb (convert_stmt, new_bb);
20207 gimple_set_bb (return_stmt, new_bb);
20208 pop_cfun ();
20209 return new_bb;
20212 tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
20213 tree and_expr = build2 (BIT_AND_EXPR,
20214 long_long_unsigned_type_node,
20215 mask_var,
20216 build_int_cst (long_long_unsigned_type_node,
20217 feature_mask));
20218 gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
20219 gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
20220 gimple_set_bb (and_stmt, new_bb);
20221 gimple_seq_add_stmt (&gseq, and_stmt);
20223 tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
20224 if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
20225 NULL_TREE, NULL_TREE);
20226 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
20227 gimple_set_bb (if_else_stmt, new_bb);
20228 gimple_seq_add_stmt (&gseq, if_else_stmt);
20230 gimple_seq_add_stmt (&gseq, convert_stmt);
20231 gimple_seq_add_stmt (&gseq, return_stmt);
20232 set_bb_seq (new_bb, gseq);
20234 bb1 = new_bb;
20235 e12 = split_block (bb1, if_else_stmt);
20236 bb2 = e12->dest;
20237 e12->flags &= ~EDGE_FALLTHRU;
20238 e12->flags |= EDGE_TRUE_VALUE;
20240 e23 = split_block (bb2, return_stmt);
20242 gimple_set_bb (convert_stmt, bb2);
20243 gimple_set_bb (return_stmt, bb2);
20245 bb3 = e23->dest;
20246 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
20248 remove_edge (e23);
20249 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
20251 pop_cfun ();
20253 return bb3;
20256 /* This function generates the dispatch function for
20257 multi-versioned functions. DISPATCH_DECL is the function which will
20258 contain the dispatch logic. FNDECLS are the function choices for
20259 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20260 in DISPATCH_DECL in which the dispatch code is generated. */
20262 static int
20263 dispatch_function_versions (tree dispatch_decl,
20264 void *fndecls_p,
20265 basic_block *empty_bb)
20267 gimple *ifunc_cpu_init_stmt;
20268 gimple_seq gseq;
20269 vec<tree> *fndecls;
20271 gcc_assert (dispatch_decl != NULL
20272 && fndecls_p != NULL
20273 && empty_bb != NULL);
20275 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
20277 gseq = bb_seq (*empty_bb);
20278 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20279 constructors, so explicitly call __init_cpu_features_resolver here. */
20280 tree init_fn_type = build_function_type_list (void_type_node,
20281 long_unsigned_type_node,
20282 build_ifunc_arg_type(),
20283 NULL);
20284 tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
20285 tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
20286 init_fn_id, init_fn_type);
20287 tree arg1 = DECL_ARGUMENTS (dispatch_decl);
20288 tree arg2 = TREE_CHAIN (arg1);
20289 ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
20290 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
20291 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
20293 /* Build the struct type for __aarch64_cpu_features. */
20294 tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
20295 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20296 get_identifier ("features"),
20297 long_long_unsigned_type_node);
20298 DECL_FIELD_CONTEXT (field1) = global_type;
20299 TYPE_FIELDS (global_type) = field1;
20300 layout_type (global_type);
20302 tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
20303 get_identifier ("__aarch64_cpu_features"),
20304 global_type);
20305 DECL_EXTERNAL (global_var) = 1;
20306 tree mask_var = create_tmp_var (long_long_unsigned_type_node);
20308 tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
20309 global_var, field1, NULL_TREE);
20310 gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
20311 gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
20312 gimple_set_bb (component_stmt, *empty_bb);
20313 gimple_seq_add_stmt (&gseq, component_stmt);
20315 tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
20316 gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
20317 gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
20318 gimple_set_bb (not_stmt, *empty_bb);
20319 gimple_seq_add_stmt (&gseq, not_stmt);
20321 set_bb_seq (*empty_bb, gseq);
20323 pop_cfun ();
20325 /* fndecls_p is actually a vector. */
20326 fndecls = static_cast<vec<tree> *> (fndecls_p);
20328 /* At least one more version other than the default. */
20329 unsigned int num_versions = fndecls->length ();
20330 gcc_assert (num_versions >= 2);
20332 struct function_version_info
20334 tree version_decl;
20335 aarch64_fmv_feature_mask feature_mask;
20336 } *function_versions;
20338 function_versions = (struct function_version_info *)
20339 XNEWVEC (struct function_version_info, (num_versions));
20341 unsigned int actual_versions = 0;
20343 for (tree version_decl : *fndecls)
20345 aarch64_fmv_feature_mask feature_mask;
20346 /* Get attribute string, parse it and find the right features. */
20347 feature_mask = get_feature_mask_for_version (version_decl);
20348 function_versions [actual_versions].version_decl = version_decl;
20349 function_versions [actual_versions].feature_mask = feature_mask;
20350 actual_versions++;
20353 auto compare_feature_version_info = [](const void *p1, const void *p2) {
20354 const function_version_info v1 = *(const function_version_info *)p1;
20355 const function_version_info v2 = *(const function_version_info *)p2;
20356 return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
20359 /* Sort the versions according to descending order of dispatch priority. */
20360 qsort (function_versions, actual_versions,
20361 sizeof (struct function_version_info), compare_feature_version_info);
20363 for (unsigned int i = 0; i < actual_versions; ++i)
20364 *empty_bb = add_condition_to_bb (dispatch_decl,
20365 function_versions[i].version_decl,
20366 function_versions[i].feature_mask,
20367 mask_var,
20368 *empty_bb);
20370 free (function_versions);
20371 return 0;
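/* Roughly speaking, the GIMPLE built above and by add_condition_to_bb
   corresponds to a resolver body of the form

     __init_cpu_features_resolver (hwcap, arg);
     t = ~__aarch64_cpu_features.features;
     if ((t & MASK_OF_HIGHEST_PRIORITY_VERSION) == 0)
       return <that version>;
     if ((t & MASK_OF_NEXT_VERSION) == 0)
       return <next version>;
     ...
     return <default version>;

   where each mask is the feature mask computed from that version's
   target_version/target_clones string. */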
20374 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20376 tree
20377 aarch64_generate_version_dispatcher_body (void *node_p)
20379 tree resolver_decl;
20380 basic_block empty_bb;
20381 tree default_ver_decl;
20382 struct cgraph_node *versn;
20383 struct cgraph_node *node;
20385 struct cgraph_function_version_info *node_version_info = NULL;
20386 struct cgraph_function_version_info *versn_info = NULL;
20388 node = (cgraph_node *)node_p;
20390 node_version_info = node->function_version ();
20391 gcc_assert (node->dispatcher_function
20392 && node_version_info != NULL);
20394 if (node_version_info->dispatcher_resolver)
20395 return node_version_info->dispatcher_resolver;
20397 /* The first version in the chain corresponds to the default version. */
20398 default_ver_decl = node_version_info->next->this_node->decl;
20400 /* node is going to be an alias, so remove the finalized bit. */
20401 node->definition = false;
20403 resolver_decl = make_resolver_func (default_ver_decl,
20404 node->decl, &empty_bb);
20406 node_version_info->dispatcher_resolver = resolver_decl;
20408 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
20410 auto_vec<tree, 2> fn_ver_vec;
20412 for (versn_info = node_version_info->next; versn_info;
20413 versn_info = versn_info->next)
20415 versn = versn_info->this_node;
20416 /* Check for virtual functions here again, as by this time it should
20417 have been determined if this function needs a vtable index or
20418 not. This happens for methods in derived classes that override
20419 virtual methods in base classes but are not explicitly marked as
20420 virtual. */
20421 if (DECL_VINDEX (versn->decl))
20422 sorry ("virtual function multiversioning not supported");
20424 fn_ver_vec.safe_push (versn->decl);
20427 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
20428 cgraph_edge::rebuild_edges ();
20429 pop_cfun ();
20431 /* Fix up symbol names. First we need to obtain the base name, which may
20432 have already been mangled. */
20433 tree base_name = get_suffixed_assembler_name (default_ver_decl, "");
20435 /* We need to redo the version mangling on the non-default versions for the
20436 target_clones case. Redoing the mangling for the target_version case is
20437 redundant but does no harm. We need to skip the default version, because
20438 expand_clones will append ".default" later; fortunately that suffix is the
20439 one we want anyway. */
20440 for (versn_info = node_version_info->next->next; versn_info;
20441 versn_info = versn_info->next)
20443 tree version_decl = versn_info->this_node->decl;
20444 tree name = aarch64_mangle_decl_assembler_name (version_decl,
20445 base_name);
20446 symtab->change_decl_assembler_name (version_decl, name);
20449 /* We also need to use the base name for the ifunc declaration. */
20450 symtab->change_decl_assembler_name (node->decl, base_name);
20452 return resolver_decl;
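/* As a concrete picture of the result: for a function foo with a default
   version and an "sve" version, this ends up emitting foo.default and
   foo._Msve (the version bodies), a comdat foo.resolver containing the
   dispatch code built above, and foo itself as an ifunc whose resolver is
   foo.resolver, so ordinary calls to foo bind through the ifunc. */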
20455 /* Make a dispatcher declaration for the multi-versioned function DECL.
20456 Calls to DECL function will be replaced with calls to the dispatcher
20457 by the front-end. Returns the decl of the dispatcher function. */
20459 tree
20460 aarch64_get_function_versions_dispatcher (void *decl)
20462 tree fn = (tree) decl;
20463 struct cgraph_node *node = NULL;
20464 struct cgraph_node *default_node = NULL;
20465 struct cgraph_function_version_info *node_v = NULL;
20466 struct cgraph_function_version_info *first_v = NULL;
20468 tree dispatch_decl = NULL;
20470 struct cgraph_function_version_info *default_version_info = NULL;
20472 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
20474 node = cgraph_node::get (fn);
20475 gcc_assert (node != NULL);
20477 node_v = node->function_version ();
20478 gcc_assert (node_v != NULL);
20480 if (node_v->dispatcher_resolver != NULL)
20481 return node_v->dispatcher_resolver;
20483 /* Find the default version and make it the first node. */
20484 first_v = node_v;
20485 /* Go to the beginning of the chain. */
20486 while (first_v->prev != NULL)
20487 first_v = first_v->prev;
20488 default_version_info = first_v;
20489 while (default_version_info != NULL)
20491 if (get_feature_mask_for_version
20492 (default_version_info->this_node->decl) == 0ULL)
20493 break;
20494 default_version_info = default_version_info->next;
20497 /* If there is no default node, just return NULL. */
20498 if (default_version_info == NULL)
20499 return NULL;
20501 /* Make default info the first node. */
20502 if (first_v != default_version_info)
20504 default_version_info->prev->next = default_version_info->next;
20505 if (default_version_info->next)
20506 default_version_info->next->prev = default_version_info->prev;
20507 first_v->prev = default_version_info;
20508 default_version_info->next = first_v;
20509 default_version_info->prev = NULL;
20512 default_node = default_version_info->this_node;
20514 if (targetm.has_ifunc_p ())
20516 struct cgraph_function_version_info *it_v = NULL;
20517 struct cgraph_node *dispatcher_node = NULL;
20518 struct cgraph_function_version_info *dispatcher_version_info = NULL;
20520 /* Right now, the dispatching is done via ifunc. */
20521 dispatch_decl = make_dispatcher_decl (default_node->decl);
20522 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
20524 dispatcher_node = cgraph_node::get_create (dispatch_decl);
20525 gcc_assert (dispatcher_node != NULL);
20526 dispatcher_node->dispatcher_function = 1;
20527 dispatcher_version_info
20528 = dispatcher_node->insert_new_function_version ();
20529 dispatcher_version_info->next = default_version_info;
20530 dispatcher_node->definition = 1;
20532 /* Set the dispatcher for all the versions. */
20533 it_v = default_version_info;
20534 while (it_v != NULL)
20536 it_v->dispatcher_resolver = dispatch_decl;
20537 it_v = it_v->next;
20540 else
20542 error_at (DECL_SOURCE_LOCATION (default_node->decl),
20543 "multiversioning needs %<ifunc%> which is not supported "
20544 "on this target");
20547 return dispatch_decl;
20550 /* This function returns true if FN1 and FN2 are versions of the same function,
20551 that is, the target_version attributes of the function decls are different.
20552 This assumes that FN1 and FN2 have the same signature. */
20554 bool
20555 aarch64_common_function_versions (tree fn1, tree fn2)
20557 if (TREE_CODE (fn1) != FUNCTION_DECL
20558 || TREE_CODE (fn2) != FUNCTION_DECL)
20559 return false;
20561 return (aarch64_compare_version_priority (fn1, fn2) != 0);
20564 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20565 rather than an opt-in list. */
20567 static bool
20568 aarch64_function_attribute_inlinable_p (const_tree fndecl)
20570 /* A function that has local SME state cannot be inlined into its caller,
20571 since we only support managing PSTATE.ZA switches at function scope. */
20572 return (!aarch64_fndecl_has_new_state (fndecl, "za")
20573 && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
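/* For example, a function defined with the ACLE SME keyword attribute
   __arm_new("za") creates new ZA state, so it is reported as non-inlinable
   here; the same applies to new ZT0 state. */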
20576 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20577 tri-bool options (yes, no, don't care) and the default value is
20578 DEF, determine whether to reject inlining. */
20580 static bool
20581 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
20582 int dont_care, int def)
20584 /* If the callee doesn't care, always allow inlining. */
20585 if (callee == dont_care)
20586 return true;
20588 /* If the caller doesn't care, always allow inlining. */
20589 if (caller == dont_care)
20590 return true;
20592 /* Otherwise, allow inlining if either the callee and caller values
20593 agree, or if the callee is using the default value. */
20594 return (callee == caller || callee == def);
20597 /* Bit allocations for ipa_fn_summary::target_info. */
20599 /* Set if the function contains a stmt that relies on the function's
20600 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20601 Not meaningful for streaming-compatible functions. */
20602 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
20604 /* Set if the function clobbers ZA or ZT0 (one bit for each). Not meaningful for functions that
20605 have ZA state. */
20606 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
20607 constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20609 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20611 static bool
20612 aarch64_need_ipa_fn_target_info (const_tree, unsigned int &)
20614 /* We could in principle skip this for streaming-compatible functions
20615 that have ZA state, but that's a rare combination. */
20616 return true;
20619 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20621 static bool
20622 aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
20624 if (auto *ga = dyn_cast<const gasm *> (stmt))
20626 /* We don't know what the asm does, so conservatively assume that
20627 it requires the function's current SM mode. */
20628 info |= AARCH64_IPA_SM_FIXED;
20629 for (unsigned int i = 0; i < gimple_asm_nclobbers (ga); ++i)
20631 tree op = gimple_asm_clobber_op (ga, i);
20632 const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
20633 if (strcmp (clobber, "za") == 0)
20634 info |= AARCH64_IPA_CLOBBERS_ZA;
20635 if (strcmp (clobber, "zt0") == 0)
20636 info |= AARCH64_IPA_CLOBBERS_ZT0;
20639 if (auto *call = dyn_cast<const gcall *> (stmt))
20641 if (gimple_call_builtin_p (call, BUILT_IN_MD))
20643 /* The attributes on AArch64 builtins are supposed to be accurate.
20644 If the function isn't marked streaming-compatible then it
20645 needs whichever SM mode it selects. */
20646 tree decl = gimple_call_fndecl (call);
20647 if (aarch64_fndecl_pstate_sm (decl) != 0)
20648 info |= AARCH64_IPA_SM_FIXED;
20651 return true;
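/* For instance, an inline asm that lists "za" in its clobbers, such as

     asm volatile ("..." ::: "za");

   causes AARCH64_IPA_CLOBBERS_ZA (and AARCH64_IPA_SM_FIXED, since any asm
   is assumed to depend on the function's current PSTATE.SM) to be recorded
   for the containing function. */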
20654 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20655 to inline CALLEE into CALLER based on target-specific info.
20656 Make sure that the caller and callee have compatible architectural
20657 features. Then go through the other possible target attributes
20658 and see if they can block inlining. Try not to reject always_inline
20659 callees unless they are incompatible architecturally. */
20661 static bool
20662 aarch64_can_inline_p (tree caller, tree callee)
20664 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
20665 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
20667 struct cl_target_option *caller_opts
20668 = TREE_TARGET_OPTION (caller_tree ? caller_tree
20669 : target_option_default_node);
20671 struct cl_target_option *callee_opts
20672 = TREE_TARGET_OPTION (callee_tree ? callee_tree
20673 : target_option_default_node);
20675 /* Callee's ISA flags should be a subset of the caller's. */
20676 auto caller_asm_isa = (caller_opts->x_aarch64_asm_isa_flags
20677 & ~AARCH64_FL_ISA_MODES);
20678 auto callee_asm_isa = (callee_opts->x_aarch64_asm_isa_flags
20679 & ~AARCH64_FL_ISA_MODES);
20680 if (callee_asm_isa & ~caller_asm_isa)
20681 return false;
20683 auto caller_isa = (caller_opts->x_aarch64_isa_flags
20684 & ~AARCH64_FL_ISA_MODES);
20685 auto callee_isa = (callee_opts->x_aarch64_isa_flags
20686 & ~AARCH64_FL_ISA_MODES);
20687 if (callee_isa & ~caller_isa)
20688 return false;
20690 /* Return true if the callee might have target_info property PROPERTY.
20691 The answer must be true unless we have positive proof to the contrary. */
20692 auto callee_has_property = [&](unsigned int property)
20694 if (ipa_fn_summaries)
20695 if (auto *summary = ipa_fn_summaries->get (cgraph_node::get (callee)))
20696 if (!(summary->target_info & property))
20697 return false;
20698 return true;
20701 /* Streaming-compatible code can be inlined into functions with any
20702 PSTATE.SM mode. Otherwise the caller and callee must agree on
20703 PSTATE.SM mode, unless we can prove that the callee is naturally
20704 streaming-compatible. */
20705 auto caller_sm = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20706 auto callee_sm = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20707 if (callee_sm
20708 && caller_sm != callee_sm
20709 && callee_has_property (AARCH64_IPA_SM_FIXED))
20710 return false;
20712 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20713 functions from being inlined into others. We also need to prevent
20714 inlining of shared-ZA functions into functions without ZA state,
20715 since this is an error condition.
20717 The only other problematic case for ZA is inlining a function that
20718 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20719 auto caller_za = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20720 auto callee_za = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20721 if (!caller_za && callee_za)
20722 return false;
20723 if (!callee_za
20724 && aarch64_fndecl_has_state (caller, "za")
20725 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
20726 return false;
20727 if (!callee_za
20728 && aarch64_fndecl_has_state (caller, "zt0")
20729 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
20730 return false;
20732 /* Allow inlining a non-strict-aligned callee into a strict-aligned
20733 caller, but not the other way around. */
20734 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
20735 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
20736 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
20737 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
20738 return false;
20740 bool always_inline = lookup_attribute ("always_inline",
20741 DECL_ATTRIBUTES (callee));
20743 /* If the architectural features match up and the callee is always_inline
20744 then the other attributes don't matter. */
20745 if (always_inline)
20746 return true;
20748 if (caller_opts->x_aarch64_cmodel_var
20749 != callee_opts->x_aarch64_cmodel_var)
20750 return false;
20752 if (caller_opts->x_aarch64_tls_dialect
20753 != callee_opts->x_aarch64_tls_dialect)
20754 return false;
20756 /* Honour explicit requests to workaround errata. */
20757 if (!aarch64_tribools_ok_for_inlining_p (
20758 caller_opts->x_aarch64_fix_a53_err835769,
20759 callee_opts->x_aarch64_fix_a53_err835769,
20760 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
20761 return false;
20763 if (!aarch64_tribools_ok_for_inlining_p (
20764 caller_opts->x_aarch64_fix_a53_err843419,
20765 callee_opts->x_aarch64_fix_a53_err843419,
20766 2, TARGET_FIX_ERR_A53_843419))
20767 return false;
20769 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20770 caller and callee and they don't match up, reject inlining. */
20771 if (!aarch64_tribools_ok_for_inlining_p (
20772 caller_opts->x_flag_omit_leaf_frame_pointer,
20773 callee_opts->x_flag_omit_leaf_frame_pointer,
20774 2, 1))
20775 return false;
20777 /* If the callee has specific tuning overrides, respect them. */
20778 if (callee_opts->x_aarch64_override_tune_string != NULL
20779 && caller_opts->x_aarch64_override_tune_string == NULL)
20780 return false;
20782 /* If the user specified tuning override strings for the
20783 caller and callee and they don't match up, reject inlining.
20784 We just do a string compare here, we don't analyze the meaning
20785 of the string, as it would be too costly for little gain. */
20786 if (callee_opts->x_aarch64_override_tune_string
20787 && caller_opts->x_aarch64_override_tune_string
20788 && (strcmp (callee_opts->x_aarch64_override_tune_string,
20789 caller_opts->x_aarch64_override_tune_string) != 0))
20790 return false;
20792 return true;
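/* As an illustration of the ISA-subset rule above: a callee such as

     __attribute__ ((target ("+sve")))
     static inline int use_sve (void) { ... }

   is not inlined into a caller that does not itself enable SVE (via the
   command line, a pragma or its own target attribute), even if the callee
   is marked always_inline, because that check precedes the always_inline
   shortcut. */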
20795 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
20796 hasn't been initialized already. */
20798 arm_pcs
20799 aarch64_tlsdesc_abi_id ()
20801 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
20802 if (!tlsdesc_abi.initialized_p ())
20804 HARD_REG_SET full_reg_clobbers;
20805 CLEAR_HARD_REG_SET (full_reg_clobbers);
20806 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
20807 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
20808 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
20809 SET_HARD_REG_BIT (full_reg_clobbers, regno);
20810 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
20812 return ARM_PCS_TLSDESC;
20815 /* Return true if SYMBOL_REF X binds locally. */
20817 static bool
20818 aarch64_symbol_binds_local_p (const_rtx x)
20820 return (SYMBOL_REF_DECL (x)
20821 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
20822 : SYMBOL_REF_LOCAL_P (x));
20825 /* Return true if SYMBOL_REF X is thread-local. */
20826 static bool
20827 aarch64_tls_symbol_p (rtx x)
20829 if (! TARGET_HAVE_TLS)
20830 return false;
20832 x = strip_salt (x);
20833 if (!SYMBOL_REF_P (x))
20834 return false;
20836 return SYMBOL_REF_TLS_MODEL (x) != 0;
20839 /* Classify a TLS symbol into one of the TLS kinds. */
20840 enum aarch64_symbol_type
20841 aarch64_classify_tls_symbol (rtx x)
20843 enum tls_model tls_kind = tls_symbolic_operand_type (x);
20845 switch (tls_kind)
20847 case TLS_MODEL_GLOBAL_DYNAMIC:
20848 case TLS_MODEL_LOCAL_DYNAMIC:
20849 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
20851 case TLS_MODEL_INITIAL_EXEC:
20852 switch (aarch64_cmodel)
20854 case AARCH64_CMODEL_TINY:
20855 case AARCH64_CMODEL_TINY_PIC:
20856 return SYMBOL_TINY_TLSIE;
20857 default:
20858 return SYMBOL_SMALL_TLSIE;
20861 case TLS_MODEL_LOCAL_EXEC:
20862 if (aarch64_tls_size == 12)
20863 return SYMBOL_TLSLE12;
20864 else if (aarch64_tls_size == 24)
20865 return SYMBOL_TLSLE24;
20866 else if (aarch64_tls_size == 32)
20867 return SYMBOL_TLSLE32;
20868 else if (aarch64_tls_size == 48)
20869 return SYMBOL_TLSLE48;
20870 else
20871 gcc_unreachable ();
20873 case TLS_MODEL_EMULATED:
20874 case TLS_MODEL_NONE:
20875 return SYMBOL_FORCE_TO_MEM;
20877 default:
20878 gcc_unreachable ();
20882 /* Return the correct method for accessing X + OFFSET, where X is either
20883 a SYMBOL_REF or LABEL_REF. */
20885 enum aarch64_symbol_type
20886 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
20888 x = strip_salt (x);
20890 if (LABEL_REF_P (x))
20892 switch (aarch64_cmodel)
20894 case AARCH64_CMODEL_LARGE:
20895 return SYMBOL_FORCE_TO_MEM;
20897 case AARCH64_CMODEL_TINY_PIC:
20898 case AARCH64_CMODEL_TINY:
20899 return SYMBOL_TINY_ABSOLUTE;
20901 case AARCH64_CMODEL_SMALL_SPIC:
20902 case AARCH64_CMODEL_SMALL_PIC:
20903 case AARCH64_CMODEL_SMALL:
20904 return SYMBOL_SMALL_ABSOLUTE;
20906 default:
20907 gcc_unreachable ();
20911 if (SYMBOL_REF_P (x))
20913 if (aarch64_tls_symbol_p (x))
20914 return aarch64_classify_tls_symbol (x);
20916 switch (aarch64_cmodel)
20918 case AARCH64_CMODEL_TINY_PIC:
20919 case AARCH64_CMODEL_TINY:
20920 /* With -fPIC non-local symbols use the GOT. For orthogonality
20921 always use the GOT for extern weak symbols. */
20922 if ((flag_pic || SYMBOL_REF_WEAK (x))
20923 && !aarch64_symbol_binds_local_p (x))
20924 return SYMBOL_TINY_GOT;
20926 /* When we retrieve symbol + offset address, we have to make sure
20927 the offset does not cause overflow of the final address. But
20928 we have no way of knowing the address of the symbol at compile time,
20929 so we can't accurately say if the distance between the PC and
20930 symbol + offset is outside the addressable range of +/-1MB in the
20931 TINY code model. So we limit the maximum offset to +/-64KB and
20932 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
20933 If offset_within_block_p is true we allow larger offsets. */
20934 if (!(IN_RANGE (offset, -0x10000, 0x10000)
20935 || offset_within_block_p (x, offset)))
20936 return SYMBOL_FORCE_TO_MEM;
20938 return SYMBOL_TINY_ABSOLUTE;
20941 case AARCH64_CMODEL_SMALL_SPIC:
20942 case AARCH64_CMODEL_SMALL_PIC:
20943 case AARCH64_CMODEL_SMALL:
20944 if ((flag_pic || SYMBOL_REF_WEAK (x))
20945 && !aarch64_symbol_binds_local_p (x))
20946 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
20947 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
20949 /* Same reasoning as the tiny code model, but the offset cap here is
20950 1MB, allowing +/-3.9GB for the offset to the symbol. */
20951 if (!(IN_RANGE (offset, -0x100000, 0x100000)
20952 || offset_within_block_p (x, offset)))
20953 return SYMBOL_FORCE_TO_MEM;
20955 return SYMBOL_SMALL_ABSOLUTE;
20957 case AARCH64_CMODEL_LARGE:
20958 /* This is alright even in PIC code as the constant
20959 pool reference is always PC relative and within
20960 the same translation unit. */
20961 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
20962 return SYMBOL_SMALL_ABSOLUTE;
20963 else
20964 return SYMBOL_FORCE_TO_MEM;
20966 default:
20967 gcc_unreachable ();
20971 /* By default push everything into the constant pool. */
20972 return SYMBOL_FORCE_TO_MEM;
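/* As a worked example of the offset limits above: with -mcmodel=tiny,
   &sym + 0x4000 (16KiB) stays within the +/-64KiB cap and can be
   materialized as SYMBOL_TINY_ABSOLUTE, whereas &sym + 0x20000 (128KiB)
   is forced to the constant pool unless the offset is known to stay within
   sym's own block (offset_within_block_p). */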
20975 bool
20976 aarch64_constant_address_p (rtx x)
20978 return (CONSTANT_P (x) && memory_address_p (DImode, x));
20981 bool
20982 aarch64_legitimate_pic_operand_p (rtx x)
20984 poly_int64 offset;
20985 x = strip_offset_and_salt (x, &offset);
20986 if (SYMBOL_REF_P (x))
20987 return false;
20989 return true;
20992 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
20993 that should be rematerialized rather than spilled. */
20995 static bool
20996 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
20998 /* Support CSE and rematerialization of common constants. */
20999 if (CONST_INT_P (x)
21000 || CONST_DOUBLE_P (x))
21001 return true;
21003 /* Only accept variable-length vector constants if they can be
21004 handled directly.
21006 ??? It would be possible (but complex) to handle rematerialization
21007 of other constants via secondary reloads. */
21008 if (!GET_MODE_SIZE (mode).is_constant ())
21009 return aarch64_simd_valid_immediate (x, NULL);
21011 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21012 least be forced to memory and loaded from there. */
21013 if (CONST_VECTOR_P (x))
21014 return !targetm.cannot_force_const_mem (mode, x);
21016 /* Do not allow vector struct mode constants for Advanced SIMD.
21017 We could support 0 and -1 easily, but they need support in
21018 aarch64-simd.md. */
21019 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21020 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21021 return false;
21023 if (GET_CODE (x) == HIGH)
21024 x = XEXP (x, 0);
21026 /* Accept polynomial constants that can be calculated by using the
21027 destination of a move as the sole temporary. Constants that
21028 require a second temporary cannot be rematerialized (they can't be
21029 forced to memory and also aren't legitimate constants). */
21030 poly_int64 offset;
21031 if (poly_int_rtx_p (x, &offset))
21032 return aarch64_offset_temporaries (false, offset) <= 1;
21034 /* If an offset is being added to something else, we need to allow the
21035 base to be moved into the destination register, meaning that there
21036 are no free temporaries for the offset. */
21037 x = strip_offset_and_salt (x, &offset);
21038 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
21039 return false;
21041 /* Do not allow const (plus (anchor_symbol, const_int)). */
21042 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
21043 return false;
21045 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21046 so spilling them is better than rematerialization. */
21047 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
21048 return true;
21050 /* Label references are always constant. */
21051 if (LABEL_REF_P (x))
21052 return true;
21054 return false;
21057 rtx
21058 aarch64_load_tp (rtx target)
21060 if (!target
21061 || GET_MODE (target) != Pmode
21062 || !register_operand (target, Pmode))
21063 target = gen_reg_rtx (Pmode);
21065 /* Can return in any reg. */
21066 emit_insn (gen_aarch64_load_tp_hard (target));
21067 return target;
21070 /* On AAPCS systems, this is the "struct __va_list". */
21071 static GTY(()) tree va_list_type;
21073 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
21074 Return the type to use as __builtin_va_list.
21076 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
21078 struct __va_list
21080 void *__stack;
21081 void *__gr_top;
21082 void *__vr_top;
21083 int __gr_offs;
21084 int __vr_offs;
21085 }; */
21087 static tree
21088 aarch64_build_builtin_va_list (void)
21090 tree va_list_name;
21091 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21093 /* Create the type. */
21094 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
21095 /* Give it the required name. */
21096 va_list_name = build_decl (BUILTINS_LOCATION,
21097 TYPE_DECL,
21098 get_identifier ("__va_list"),
21099 va_list_type);
21100 DECL_ARTIFICIAL (va_list_name) = 1;
21101 TYPE_NAME (va_list_type) = va_list_name;
21102 TYPE_STUB_DECL (va_list_type) = va_list_name;
21104 /* Create the fields. */
21105 f_stack = build_decl (BUILTINS_LOCATION,
21106 FIELD_DECL, get_identifier ("__stack"),
21107 ptr_type_node);
21108 f_grtop = build_decl (BUILTINS_LOCATION,
21109 FIELD_DECL, get_identifier ("__gr_top"),
21110 ptr_type_node);
21111 f_vrtop = build_decl (BUILTINS_LOCATION,
21112 FIELD_DECL, get_identifier ("__vr_top"),
21113 ptr_type_node);
21114 f_groff = build_decl (BUILTINS_LOCATION,
21115 FIELD_DECL, get_identifier ("__gr_offs"),
21116 integer_type_node);
21117 f_vroff = build_decl (BUILTINS_LOCATION,
21118 FIELD_DECL, get_identifier ("__vr_offs"),
21119 integer_type_node);
21121 /* Tell the tree-stdarg pass about our internal offset fields.
21122 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
21123 purposes, to identify whether the code is updating the va_list internal
21124 offset fields in an irregular way. */
21125 va_list_gpr_counter_field = f_groff;
21126 va_list_fpr_counter_field = f_vroff;
21128 DECL_ARTIFICIAL (f_stack) = 1;
21129 DECL_ARTIFICIAL (f_grtop) = 1;
21130 DECL_ARTIFICIAL (f_vrtop) = 1;
21131 DECL_ARTIFICIAL (f_groff) = 1;
21132 DECL_ARTIFICIAL (f_vroff) = 1;
21134 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
21135 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
21136 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
21137 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
21138 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
21140 TYPE_FIELDS (va_list_type) = f_stack;
21141 DECL_CHAIN (f_stack) = f_grtop;
21142 DECL_CHAIN (f_grtop) = f_vrtop;
21143 DECL_CHAIN (f_vrtop) = f_groff;
21144 DECL_CHAIN (f_groff) = f_vroff;
21146 /* Compute its layout. */
21147 layout_type (va_list_type);
21149 return va_list_type;
21152 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21153 static void
21154 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
21156 const CUMULATIVE_ARGS *cum;
21157 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21158 tree stack, grtop, vrtop, groff, vroff;
21159 tree t;
21160 int gr_save_area_size = cfun->va_list_gpr_size;
21161 int vr_save_area_size = cfun->va_list_fpr_size;
21162 int vr_offset;
21164 cum = &crtl->args.info;
21165 if (cfun->va_list_gpr_size)
21166 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
21167 cfun->va_list_gpr_size);
21168 if (cfun->va_list_fpr_size)
21169 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
21170 * UNITS_PER_VREG, cfun->va_list_fpr_size);
21172 if (!TARGET_FLOAT)
21174 gcc_assert (cum->aapcs_nvrn == 0);
21175 vr_save_area_size = 0;
21178 f_stack = TYPE_FIELDS (va_list_type_node);
21179 f_grtop = DECL_CHAIN (f_stack);
21180 f_vrtop = DECL_CHAIN (f_grtop);
21181 f_groff = DECL_CHAIN (f_vrtop);
21182 f_vroff = DECL_CHAIN (f_groff);
21184 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
21185 NULL_TREE);
21186 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
21187 NULL_TREE);
21188 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
21189 NULL_TREE);
21190 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
21191 NULL_TREE);
21192 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
21193 NULL_TREE);
21195 /* Emit code to initialize STACK, which points to the next varargs stack
21196 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21197 by named arguments. STACK is 8-byte aligned. */
21198 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
21199 if (cum->aapcs_stack_size > 0)
21200 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
21201 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
21202 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21204 /* Emit code to initialize GRTOP, the top of the GR save area.
21205 virtual_incoming_args_rtx should have been 16 byte aligned. */
21206 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
21207 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
21208 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21210 /* Emit code to initialize VRTOP, the top of the VR save area.
21211 This address is gr_save_area_bytes below GRTOP, rounded
21212 down to the next 16-byte boundary. */
21213 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
21214 vr_offset = ROUND_UP (gr_save_area_size,
21215 STACK_BOUNDARY / BITS_PER_UNIT);
21217 if (vr_offset)
21218 t = fold_build_pointer_plus_hwi (t, -vr_offset);
21219 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
21220 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21222 /* Emit code to initialize GROFF, the offset from GRTOP of the
21223 next GPR argument. */
21224 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
21225 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
21226 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21228 /* Likewise emit code to initialize VROFF, the offset from VRTOP
21229 of the next VR argument. */
21230 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
21231 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
21232 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
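/* A worked example of the values set up above, assuming FP/SIMD is enabled
   and the full save areas are used: for "int f (int a, int b, ...)" the two
   named arguments use x0 and x1, so gr_save_area_size is (8 - 2) * 8 = 48
   and vr_save_area_size is 8 * 16 = 128. __gr_offs is therefore initialized
   to -48 and __vr_offs to -128, while __stack and __gr_top both point at
   the incoming-argument area and __vr_top sits 48 bytes below it. */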
21235 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21237 static tree
21238 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
21239 gimple_seq *post_p ATTRIBUTE_UNUSED)
21241 tree addr;
21242 bool indirect_p;
21243 bool is_ha; /* is HFA or HVA. */
21244 bool dw_align; /* double-word align. */
21245 machine_mode ag_mode = VOIDmode;
21246 int nregs;
21247 machine_mode mode;
21249 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21250 tree stack, f_top, f_off, off, arg, roundup, on_stack;
21251 HOST_WIDE_INT size, rsize, adjust, align;
21252 tree t, u, cond1, cond2;
21254 indirect_p = pass_va_arg_by_reference (type);
21255 if (indirect_p)
21256 type = build_pointer_type (type);
21258 mode = TYPE_MODE (type);
21260 f_stack = TYPE_FIELDS (va_list_type_node);
21261 f_grtop = DECL_CHAIN (f_stack);
21262 f_vrtop = DECL_CHAIN (f_grtop);
21263 f_groff = DECL_CHAIN (f_vrtop);
21264 f_vroff = DECL_CHAIN (f_groff);
21266 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
21267 f_stack, NULL_TREE);
21268 size = int_size_in_bytes (type);
21270 unsigned int abi_break_gcc_9;
21271 unsigned int abi_break_gcc_13;
21272 unsigned int abi_break_gcc_14;
21273 align
21274 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
21275 &abi_break_gcc_13, &abi_break_gcc_14)
21276 / BITS_PER_UNIT;
21278 dw_align = false;
21279 adjust = 0;
21280 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
21281 &is_ha, false))
21283 /* No frontends can create types with variable-sized modes, so we
21284 shouldn't be asked to pass or return them. */
21285 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
21287 /* TYPE passed in fp/simd registers. */
21288 if (!TARGET_FLOAT)
21289 aarch64_err_no_fpadvsimd (mode);
21291 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
21292 unshare_expr (valist), f_vrtop, NULL_TREE);
21293 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
21294 unshare_expr (valist), f_vroff, NULL_TREE);
21296 rsize = nregs * UNITS_PER_VREG;
21298 if (is_ha)
21300 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
21301 adjust = UNITS_PER_VREG - ag_size;
21303 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21304 && size < UNITS_PER_VREG)
21306 adjust = UNITS_PER_VREG - size;
21309 else
21311 /* TYPE passed in general registers. */
21312 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
21313 unshare_expr (valist), f_grtop, NULL_TREE);
21314 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
21315 unshare_expr (valist), f_groff, NULL_TREE);
21316 rsize = ROUND_UP (size, UNITS_PER_WORD);
21317 nregs = rsize / UNITS_PER_WORD;
21319 if (align <= 8
21320 && abi_break_gcc_13
21321 && warn_psabi
21322 && !bitint_or_aggr_of_bitint_p (type))
21323 inform (input_location, "parameter passing for argument of type "
21324 "%qT changed in GCC 13.1", type);
21326 if (warn_psabi
21327 && abi_break_gcc_14
21328 && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8)
21329 && !bitint_or_aggr_of_bitint_p (type))
21330 inform (input_location, "parameter passing for argument of type "
21331 "%qT changed in GCC 14.1", type);
21333 if (align > 8)
21335 if (abi_break_gcc_9
21336 && warn_psabi
21337 && !bitint_or_aggr_of_bitint_p (type))
21338 inform (input_location, "parameter passing for argument of type "
21339 "%qT changed in GCC 9.1", type);
21340 dw_align = true;
21343 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21344 && size < UNITS_PER_WORD)
21346 adjust = UNITS_PER_WORD - size;
21350 /* Get a local temporary for the field value. */
21351 off = get_initialized_tmp_var (f_off, pre_p, NULL);
21353 /* Emit code to branch if off >= 0. */
21354 t = build2 (GE_EXPR, boolean_type_node, off,
21355 build_int_cst (TREE_TYPE (off), 0));
21356 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
21358 if (dw_align)
21360 /* Emit: offs = (offs + 15) & -16. */
21361 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21362 build_int_cst (TREE_TYPE (off), 15));
21363 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
21364 build_int_cst (TREE_TYPE (off), -16));
21365 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
21367 else
21368 roundup = NULL;
21370 /* Update ap.__[g|v]r_offs */
21371 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21372 build_int_cst (TREE_TYPE (off), rsize));
21373 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
21375 /* String up. */
21376 if (roundup)
21377 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21379 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21380 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
21381 build_int_cst (TREE_TYPE (f_off), 0));
21382 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
21384 /* String up: make sure the assignment happens before the use. */
21385 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
21386 COND_EXPR_ELSE (cond1) = t;
21388 /* Prepare the trees handling the argument that is passed on the stack;
21389 the top level node will store in ON_STACK. */
21390 arg = get_initialized_tmp_var (stack, pre_p, NULL);
21391 if (align > 8)
21393 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21394 t = fold_build_pointer_plus_hwi (arg, 15);
21395 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21396 build_int_cst (TREE_TYPE (t), -16));
21397 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
21399 else
21400 roundup = NULL;
21401 /* Advance ap.__stack */
21402 t = fold_build_pointer_plus_hwi (arg, size + 7);
21403 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21404 build_int_cst (TREE_TYPE (t), -8));
21405 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
21406 /* String up roundup and advance. */
21407 if (roundup)
21408 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21409 /* String up with arg */
21410 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
21411 /* Big-endianness related address adjustment. */
21412 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21413 && size < UNITS_PER_WORD)
21415 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
21416 size_int (UNITS_PER_WORD - size));
21417 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
21420 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
21421 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
21423 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21424 t = off;
21425 if (adjust)
21426 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
21427 build_int_cst (TREE_TYPE (off), adjust));
21429 t = fold_convert (sizetype, t);
21430 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
21432 if (is_ha)
21434 /* type ha; // treat as "struct {ftype field[n];}"
21435 ... [computing offs]
21436 for (i = 0; i < nregs; ++i, offs += 16)
21437 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21438 return ha; */
21439 int i;
21440 tree tmp_ha, field_t, field_ptr_t;
21442 /* Declare a local variable. */
21443 tmp_ha = create_tmp_var_raw (type, "ha");
21444 gimple_add_tmp_var (tmp_ha);
21446 /* Establish the base type. */
21447 switch (ag_mode)
21449 case E_SFmode:
21450 field_t = float_type_node;
21451 field_ptr_t = float_ptr_type_node;
21452 break;
21453 case E_DFmode:
21454 field_t = double_type_node;
21455 field_ptr_t = double_ptr_type_node;
21456 break;
21457 case E_TFmode:
21458 field_t = long_double_type_node;
21459 field_ptr_t = long_double_ptr_type_node;
21460 break;
21461 case E_SDmode:
21462 field_t = dfloat32_type_node;
21463 field_ptr_t = build_pointer_type (dfloat32_type_node);
21464 break;
21465 case E_DDmode:
21466 field_t = dfloat64_type_node;
21467 field_ptr_t = build_pointer_type (dfloat64_type_node);
21468 break;
21469 case E_TDmode:
21470 field_t = dfloat128_type_node;
21471 field_ptr_t = build_pointer_type (dfloat128_type_node);
21472 break;
21473 case E_HFmode:
21474 field_t = aarch64_fp16_type_node;
21475 field_ptr_t = aarch64_fp16_ptr_type_node;
21476 break;
21477 case E_BFmode:
21478 field_t = bfloat16_type_node;
21479 field_ptr_t = aarch64_bf16_ptr_type_node;
21480 break;
21481 case E_V2SImode:
21482 case E_V4SImode:
21484 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
21485 field_t = build_vector_type_for_mode (innertype, ag_mode);
21486 field_ptr_t = build_pointer_type (field_t);
21488 break;
21489 default:
21490 gcc_assert (0);
21493 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
21494 TREE_ADDRESSABLE (tmp_ha) = 1;
21495 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
21496 addr = t;
21497 t = fold_convert (field_ptr_t, addr);
21498 t = build2 (MODIFY_EXPR, field_t,
21499 build1 (INDIRECT_REF, field_t, tmp_ha),
21500 build1 (INDIRECT_REF, field_t, t));
21502 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21503 for (i = 1; i < nregs; ++i)
21505 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
21506 u = fold_convert (field_ptr_t, addr);
21507 u = build2 (MODIFY_EXPR, field_t,
21508 build2 (MEM_REF, field_t, tmp_ha,
21509 build_int_cst (field_ptr_t,
21510 (i *
21511 int_size_in_bytes (field_t)))),
21512 build1 (INDIRECT_REF, field_t, u));
21513 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
21516 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
21517 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
21520 COND_EXPR_ELSE (cond2) = t;
21521 addr = fold_convert (build_pointer_type (type), cond1);
21522 addr = build_va_arg_indirect_ref (addr);
21524 if (indirect_p)
21525 addr = build_va_arg_indirect_ref (addr);
21527 return addr;
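/* Illustrative sketch of what the trees built above expand to (not a
   verbatim transcript of the generated GIMPLE):

     off = ap.__gr_offs;                 // __vr_offs for FP/SIMD candidates
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;           // plus any big-endian adjustment
     goto done;
   on_stack:
     addr = ap.__stack;                  // realigned to 16 if align > 8
     ap.__stack = (char *) (((intptr_t) addr + size + 7) & -8);
   done:
     result = *(type *) addr;            // extra dereference if indirect  */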
21530 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21532 static void
21533 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
21534 const function_arg_info &arg,
21535 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
21537 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
21538 CUMULATIVE_ARGS local_cum;
21539 int gr_saved = cfun->va_list_gpr_size;
21540 int vr_saved = cfun->va_list_fpr_size;
21542 /* The caller has advanced CUM up to, but not beyond, the last named
21543 argument. Advance a local copy of CUM past the last "real" named
21544 argument, to find out how many registers are left over. */
21545 local_cum = *cum;
21546 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
21547 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
21549 /* Find out how many registers we need to save.
21550 Honor the tree-stdarg analysis results. */
21551 if (cfun->va_list_gpr_size)
21552 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
21553 cfun->va_list_gpr_size / UNITS_PER_WORD);
21554 if (cfun->va_list_fpr_size)
21555 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
21556 cfun->va_list_fpr_size / UNITS_PER_VREG);
21558 if (!TARGET_FLOAT)
21560 gcc_assert (local_cum.aapcs_nvrn == 0);
21561 vr_saved = 0;
21564 if (!no_rtl)
21566 if (gr_saved > 0)
21568 rtx ptr, mem;
21570 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21571 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
21572 - gr_saved * UNITS_PER_WORD);
21573 mem = gen_frame_mem (BLKmode, ptr);
21574 set_mem_alias_set (mem, get_varargs_alias_set ());
21576 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
21577 mem, gr_saved);
21579 if (vr_saved > 0)
21581 /* We can't use move_block_from_reg, because it will use
21582 the wrong mode, storing D regs only. */
21583 machine_mode mode = TImode;
21584 int off, i, vr_start;
21586 /* Set OFF to the offset from virtual_incoming_args_rtx of
21587 the first vector register. The VR save area lies below
21588 the GR one, and is aligned to 16 bytes. */
21589 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
21590 STACK_BOUNDARY / BITS_PER_UNIT);
21591 off -= vr_saved * UNITS_PER_VREG;
21593 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
21594 for (i = 0; i < vr_saved; ++i)
21596 rtx ptr, mem;
21598 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
21599 mem = gen_frame_mem (mode, ptr);
21600 set_mem_alias_set (mem, get_varargs_alias_set ());
21601 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
21602 off += UNITS_PER_VREG;
21607 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21608 any complication of having crtl->args.pretend_args_size changed. */
21609 cfun->machine->frame.saved_varargs_size
21610 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
21611 STACK_BOUNDARY / BITS_PER_UNIT)
21612 + vr_saved * UNITS_PER_VREG);
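/* Worked example (illustrative): if the named arguments of a variadic
   function consume two X registers and one V register, and the stdarg
   analysis does not shrink the save areas, then gr_saved = 8 - 2 = 6 and
   vr_saved = 8 - 1 = 7, so saved_varargs_size is
   ROUND_UP (6 * 8, 16) + 7 * 16 = 48 + 112 = 160 bytes.  */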
21615 static void
21616 aarch64_conditional_register_usage (void)
21618 int i;
21619 if (!TARGET_FLOAT)
21621 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
21623 fixed_regs[i] = 1;
21624 call_used_regs[i] = 1;
21625 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21628 if (!TARGET_SVE)
21629 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
21631 fixed_regs[i] = 1;
21632 call_used_regs[i] = 1;
21635 /* Only allow these registers to be accessed via special patterns. */
21636 CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
21637 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
21638 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
21639 for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
21640 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21642 /* When tracking speculation, we need a couple of call-clobbered registers
21643 to track the speculation state. It would be nice to just use
21644 IP0 and IP1, but currently there are numerous places that just
21645 assume these registers are free for other uses (e.g. pointer
21646 authentication). */
21647 if (aarch64_track_speculation)
21649 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
21650 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
21651 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21652 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21656 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21658 bool
21659 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
21661 /* For records we're passed a FIELD_DECL, for arrays we're passed
21662 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21663 const_tree type = TREE_TYPE (field_or_array);
21665 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21666 For structures, the "multiple" case is indicated by MODE being
21667 VOIDmode. */
21668 unsigned int num_zr, num_pr;
21669 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
21671 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
21672 return !simple_cst_equal (TYPE_SIZE (field_or_array),
21673 TYPE_SIZE (type));
21674 return mode == VOIDmode;
21677 return default_member_type_forces_blk (field_or_array, mode);
21680 /* Bitmasks that indicate whether earlier versions of GCC would have
21681 taken a different path through the ABI logic. This should result in
21682 a -Wpsabi warning if the earlier path led to a different ABI decision.
21684 WARN_PSABI_EMPTY_CXX17_BASE
21685 Indicates that the type includes an artificial empty C++17 base field
21686 that, prior to GCC 10.1, would prevent the type from being treated as
21687 a HFA or HVA. See PR94383 for details.
21689 WARN_PSABI_NO_UNIQUE_ADDRESS
21690 Indicates that the type includes an empty [[no_unique_address]] field
21691 that, prior to GCC 10.1, would prevent the type from being treated as
21692 a HFA or HVA. */
21693 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
21694 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
21695 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
21697 /* Walk down the type tree of TYPE counting consecutive base elements.
21698 If *MODEP is VOIDmode, then set it to the first valid floating point
21699 type. If a non-floating point type is found, or if a floating point
21700 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21701 otherwise return the count in the sub-tree.
21703 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21704 function has changed its behavior relative to earlier versions of GCC.
21705 Normally the argument should be nonnull and point to a zero-initialized
21706 variable. The function then records whether the ABI decision might
21707 be affected by a known fix to the ABI logic, setting the associated
21708 WARN_PSABI_* bits if so.
21710 When the argument is instead a null pointer, the function tries to
21711 simulate the behavior of GCC before all such ABI fixes were made.
21712 This is useful to check whether the function returns something
21713 different after the ABI fixes. */
21714 static int
21715 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
21716 unsigned int *warn_psabi_flags)
21718 machine_mode mode;
21719 HOST_WIDE_INT size;
21721 if (aarch64_sve::builtin_type_p (type))
21722 return -1;
21724 switch (TREE_CODE (type))
21726 case REAL_TYPE:
21727 mode = TYPE_MODE (type);
21728 if (mode != DFmode && mode != SFmode
21729 && mode != TFmode && mode != HFmode
21730 && mode != SDmode && mode != DDmode && mode != TDmode)
21731 return -1;
21733 if (*modep == VOIDmode)
21734 *modep = mode;
21736 if (*modep == mode)
21737 return 1;
21739 break;
21741 case COMPLEX_TYPE:
21742 mode = TYPE_MODE (TREE_TYPE (type));
21743 if (mode != DFmode && mode != SFmode
21744 && mode != TFmode && mode != HFmode)
21745 return -1;
21747 if (*modep == VOIDmode)
21748 *modep = mode;
21750 if (*modep == mode)
21751 return 2;
21753 break;
21755 case VECTOR_TYPE:
21756 /* Use V2SImode and V4SImode as representatives of all 64-bit
21757 and 128-bit vector types. */
21758 size = int_size_in_bytes (type);
21759 switch (size)
21761 case 8:
21762 mode = V2SImode;
21763 break;
21764 case 16:
21765 mode = V4SImode;
21766 break;
21767 default:
21768 return -1;
21771 if (*modep == VOIDmode)
21772 *modep = mode;
21774 /* Vector modes are considered to be opaque: two vectors are
21775 equivalent for the purposes of being homogeneous aggregates
21776 if they are the same size. */
21777 if (*modep == mode)
21778 return 1;
21780 break;
21782 case ARRAY_TYPE:
21784 int count;
21785 tree index = TYPE_DOMAIN (type);
21787 /* Can't handle incomplete types nor sizes that are not
21788 fixed. */
21789 if (!COMPLETE_TYPE_P (type)
21790 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21791 return -1;
21793 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
21794 warn_psabi_flags);
21795 if (count == -1
21796 || !index
21797 || !TYPE_MAX_VALUE (index)
21798 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
21799 || !TYPE_MIN_VALUE (index)
21800 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
21801 || count < 0)
21802 return -1;
21804 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
21805 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
21807 /* There must be no padding. */
21808 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21809 count * GET_MODE_BITSIZE (*modep)))
21810 return -1;
21812 return count;
21815 case RECORD_TYPE:
21817 int count = 0;
21818 int sub_count;
21819 tree field;
21821 /* Can't handle incomplete types nor sizes that are not
21822 fixed. */
21823 if (!COMPLETE_TYPE_P (type)
21824 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21825 return -1;
21827 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21829 if (TREE_CODE (field) != FIELD_DECL)
21830 continue;
21832 if (DECL_FIELD_ABI_IGNORED (field))
21834 /* See whether this is something that earlier versions of
21835 GCC failed to ignore. */
21836 unsigned int flag;
21837 if (lookup_attribute ("no_unique_address",
21838 DECL_ATTRIBUTES (field)))
21839 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
21840 else if (cxx17_empty_base_field_p (field))
21841 flag = WARN_PSABI_EMPTY_CXX17_BASE;
21842 else
21843 /* No compatibility problem. */
21844 continue;
21846 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
21847 if (warn_psabi_flags)
21849 *warn_psabi_flags |= flag;
21850 continue;
21853 /* A zero-width bitfield may affect layout in some
21854 circumstances, but adds no members. The determination
21855 of whether or not a type is an HFA is performed after
21856 layout is complete, so if the type still looks like an
21857 HFA afterwards, it is still classed as one. This is
21858 potentially an ABI break for the hard-float ABI. */
21859 else if (DECL_BIT_FIELD (field)
21860 && integer_zerop (DECL_SIZE (field)))
21862 /* Prior to GCC 12 these fields were stripped early,
21863 hiding them from the back-end entirely and
21864 resulting in the correct behaviour for argument
21865 passing. Simulate that old behaviour without
21866 generating a warning. */
21867 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
21868 continue;
21869 if (warn_psabi_flags)
21871 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
21872 continue;
21876 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21877 warn_psabi_flags);
21878 if (sub_count < 0)
21879 return -1;
21880 count += sub_count;
21883 /* There must be no padding. */
21884 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21885 count * GET_MODE_BITSIZE (*modep)))
21886 return -1;
21888 return count;
21891 case UNION_TYPE:
21892 case QUAL_UNION_TYPE:
21894 /* These aren't very interesting except in a degenerate case. */
21895 int count = 0;
21896 int sub_count;
21897 tree field;
21899 /* Can't handle incomplete types nor sizes that are not
21900 fixed. */
21901 if (!COMPLETE_TYPE_P (type)
21902 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21903 return -1;
21905 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21907 if (TREE_CODE (field) != FIELD_DECL)
21908 continue;
21910 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21911 warn_psabi_flags);
21912 if (sub_count < 0)
21913 return -1;
21914 count = count > sub_count ? count : sub_count;
21917 /* There must be no padding. */
21918 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21919 count * GET_MODE_BITSIZE (*modep)))
21920 return -1;
21922 return count;
21925 default:
21926 break;
21929 return -1;
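/* Example (illustrative): for struct { float x, y, z; } the walk above
   leaves *MODEP == SFmode and returns 3, making the struct a candidate
   homogeneous aggregate; struct { float x; double y; } returns -1 because
   the second field does not match *MODEP.  */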
21932 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
21933 type as described in AAPCS64 \S 4.1.2.
21935 See the comment above aarch64_composite_type_p for the notes on MODE. */
21937 static bool
21938 aarch64_short_vector_p (const_tree type,
21939 machine_mode mode)
21941 poly_int64 size = -1;
21943 if (type && VECTOR_TYPE_P (type))
21945 if (aarch64_sve::builtin_type_p (type))
21946 return false;
21947 size = int_size_in_bytes (type);
21949 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
21950 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
21952 /* The containing "else if" is too loose: it means that we look at TYPE
21953 if the type is a vector type (good), but that we otherwise ignore TYPE
21954 and look only at the mode. This is wrong because the type describes
21955 the language-level information whereas the mode is purely an internal
21956 GCC concept. We can therefore reach here for types that are not
21957 vectors in the AAPCS64 sense.
21959 We can't "fix" that for the traditional Advanced SIMD vector modes
21960 without breaking backwards compatibility. However, there's no such
21961 baggage for the structure modes, which were introduced in GCC 12. */
21962 if (aarch64_advsimd_struct_mode_p (mode))
21963 return false;
21965 /* For similar reasons, rely only on the type, not the mode, when
21966 processing SVE types. */
21967 if (type && aarch64_some_values_include_pst_objects_p (type))
21968 /* Leave later code to report an error if SVE is disabled. */
21969 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
21970 else
21971 size = GET_MODE_SIZE (mode);
21973 if (known_eq (size, 8) || known_eq (size, 16))
21975 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
21976 they are being treated as scalable AAPCS64 types. */
21977 gcc_assert (!aarch64_sve_mode_p (mode)
21978 && !aarch64_advsimd_struct_mode_p (mode));
21979 return true;
21981 return false;
21984 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
21985 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
21986 array types. The C99 floating-point complex types are also considered
21987 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
21988 types, which are GCC extensions and out of the scope of AAPCS64, are
21989 treated as composite types here as well.
21991 Note that MODE itself is not sufficient in determining whether a type
21992 is such a composite type or not. This is because
21993 stor-layout.cc:compute_record_mode may have already changed the MODE
21994 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
21995 structure with only one field may have its MODE set to the mode of the
21996 field. Also an integer mode whose size matches the size of the
21997 RECORD_TYPE type may be used to substitute the original mode
21998 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
21999 solely relied on. */
22001 static bool
22002 aarch64_composite_type_p (const_tree type,
22003 machine_mode mode)
22005 if (aarch64_short_vector_p (type, mode))
22006 return false;
22008 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
22009 return true;
22011 if (type
22012 && TREE_CODE (type) == BITINT_TYPE
22013 && int_size_in_bytes (type) > 16)
22014 return true;
22016 if (mode == BLKmode
22017 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
22018 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
22019 return true;
22021 return false;
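/* Examples (illustrative): struct { int x; }, _Complex double and
   _BitInt(256) are composite types for AAPCS64 purposes; __int128 and the
   64-bit/128-bit Advanced SIMD vector types are not.  */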
22024 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22025 shall be passed or returned in simd/fp register(s) (providing these
22026 parameter passing registers are available).
22028 Upon successful return, *COUNT returns the number of needed registers,
22029 *BASE_MODE returns the mode of the individual register and when IS_HA
22030 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22031 floating-point aggregate or a homogeneous short-vector aggregate.
22033 SILENT_P is true if the function should refrain from reporting any
22034 diagnostics. This should only be used if the caller is certain that
22035 any ABI decisions would eventually come through this function with
22036 SILENT_P set to false. */
22038 static bool
22039 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
22040 const_tree type,
22041 machine_mode *base_mode,
22042 int *count,
22043 bool *is_ha,
22044 bool silent_p)
22046 if (is_ha != NULL) *is_ha = false;
22048 machine_mode new_mode = VOIDmode;
22049 bool composite_p = aarch64_composite_type_p (type, mode);
22051 if ((!composite_p
22052 && (GET_MODE_CLASS (mode) == MODE_FLOAT
22053 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
22054 || aarch64_short_vector_p (type, mode))
22056 *count = 1;
22057 new_mode = mode;
22059 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
22061 if (is_ha != NULL) *is_ha = true;
22062 *count = 2;
22063 new_mode = GET_MODE_INNER (mode);
22065 else if (type && composite_p)
22067 unsigned int warn_psabi_flags = 0;
22068 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
22069 &warn_psabi_flags);
22070 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
22072 static unsigned last_reported_type_uid;
22073 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
22074 int alt;
22075 if (!silent_p
22076 && warn_psabi
22077 && warn_psabi_flags
22078 && uid != last_reported_type_uid
22079 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
22080 != ag_count))
22082 const char *url10
22083 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
22084 const char *url12
22085 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
22086 gcc_assert (alt == -1);
22087 last_reported_type_uid = uid;
22088 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22089 qualification. */
22090 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
22091 inform (input_location, "parameter passing for argument of "
22092 "type %qT with %<[[no_unique_address]]%> members "
22093 "changed %{in GCC 10.1%}",
22094 TYPE_MAIN_VARIANT (type), url10);
22095 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
22096 inform (input_location, "parameter passing for argument of "
22097 "type %qT when C++17 is enabled changed to match "
22098 "C++14 %{in GCC 10.1%}",
22099 TYPE_MAIN_VARIANT (type), url10);
22100 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
22101 inform (input_location, "parameter passing for argument of "
22102 "type %qT changed %{in GCC 12.1%}",
22103 TYPE_MAIN_VARIANT (type), url12);
22106 if (is_ha != NULL) *is_ha = true;
22107 *count = ag_count;
22109 else
22110 return false;
22112 else
22113 return false;
22115 gcc_assert (!aarch64_sve_mode_p (new_mode));
22116 *base_mode = new_mode;
22117 return true;
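/* Example (illustrative): struct { double d[4]; } yields *BASE_MODE ==
   DFmode, *COUNT == 4 and *IS_HA set, i.e. an HFA that is passed in four
   consecutive D registers when enough of them remain unallocated.  */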
22120 /* Implement TARGET_STRUCT_VALUE_RTX. */
22122 static rtx
22123 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
22124 int incoming ATTRIBUTE_UNUSED)
22126 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
22129 /* Implements target hook vector_mode_supported_p. */
22130 static bool
22131 aarch64_vector_mode_supported_p (machine_mode mode)
22133 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22134 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22137 /* Implements target hook vector_mode_supported_any_target_p. */
22138 static bool
22139 aarch64_vector_mode_supported_any_target_p (machine_mode mode)
22141 unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
22142 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22145 /* Return the full-width SVE vector mode for element mode MODE, if one
22146 exists. */
22147 opt_machine_mode
22148 aarch64_full_sve_mode (scalar_mode mode)
22150 switch (mode)
22152 case E_DFmode:
22153 return VNx2DFmode;
22154 case E_SFmode:
22155 return VNx4SFmode;
22156 case E_HFmode:
22157 return VNx8HFmode;
22158 case E_BFmode:
22159 return VNx8BFmode;
22160 case E_DImode:
22161 return VNx2DImode;
22162 case E_SImode:
22163 return VNx4SImode;
22164 case E_HImode:
22165 return VNx8HImode;
22166 case E_QImode:
22167 return VNx16QImode;
22168 default:
22169 return opt_machine_mode ();
22173 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
22174 if it exists. */
22175 opt_machine_mode
22176 aarch64_vq_mode (scalar_mode mode)
22178 switch (mode)
22180 case E_DFmode:
22181 return V2DFmode;
22182 case E_SFmode:
22183 return V4SFmode;
22184 case E_HFmode:
22185 return V8HFmode;
22186 case E_BFmode:
22187 return V8BFmode;
22188 case E_SImode:
22189 return V4SImode;
22190 case E_HImode:
22191 return V8HImode;
22192 case E_QImode:
22193 return V16QImode;
22194 case E_DImode:
22195 return V2DImode;
22196 default:
22197 return opt_machine_mode ();
22201 /* Return the appropriate SIMD container mode
22202 for MODE within a vector of WIDTH bits. */
22203 static machine_mode
22204 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
22206 if (TARGET_SVE
22207 && maybe_ne (width, 128)
22208 && known_eq (width, BITS_PER_SVE_VECTOR))
22209 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22211 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
22212 if (TARGET_BASE_SIMD)
22214 if (known_eq (width, 128))
22215 return aarch64_vq_mode (mode).else_mode (word_mode);
22216 else
22217 switch (mode)
22219 case E_SFmode:
22220 return V2SFmode;
22221 case E_HFmode:
22222 return V4HFmode;
22223 case E_BFmode:
22224 return V4BFmode;
22225 case E_SImode:
22226 return V2SImode;
22227 case E_HImode:
22228 return V4HImode;
22229 case E_QImode:
22230 return V8QImode;
22231 default:
22232 break;
22235 return word_mode;
22238 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22239 and return whether the SVE mode should be preferred over the
22240 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22241 static bool
22242 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
22244 /* Take into account the aarch64-autovec-preference param if non-zero. */
22245 bool only_asimd_p = aarch64_autovec_preference == 1;
22246 bool only_sve_p = aarch64_autovec_preference == 2;
22248 if (only_asimd_p)
22249 return false;
22250 if (only_sve_p)
22251 return true;
22253 /* The preference in case of a tie in costs. */
22254 bool prefer_asimd = aarch64_autovec_preference == 3;
22255 bool prefer_sve = aarch64_autovec_preference == 4;
22257 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
22258 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
22259 /* If the CPU information does not have an SVE width registered use the
22260 generic poly_int comparison that prefers SVE. If a preference is
22261 explicitly requested avoid this path. */
22262 if (aarch64_tune_params.sve_width == SVE_SCALABLE
22263 && !prefer_asimd
22264 && !prefer_sve)
22265 return maybe_gt (nunits_sve, nunits_asimd);
22267 /* Otherwise estimate the runtime width of the modes involved. */
22268 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
22269 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
22271 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22272 is clearly wider. */
22273 if (prefer_sve)
22274 return est_sve >= est_asimd;
22275 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22276 is clearly wider. */
22277 if (prefer_asimd)
22278 return est_sve > est_asimd;
22280 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22281 return est_sve > est_asimd;
22284 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22285 static machine_mode
22286 aarch64_preferred_simd_mode (scalar_mode mode)
22288 /* Take into account explicit auto-vectorization ISA preferences through
22289 aarch64_cmp_autovec_modes. */
22290 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
22291 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22292 if (TARGET_SIMD)
22293 return aarch64_vq_mode (mode).else_mode (word_mode);
22294 return word_mode;
22297 /* Return a list of possible vector sizes for the vectorizer
22298 to iterate over. */
22299 static unsigned int
22300 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
22302 static const machine_mode sve_modes[] = {
22303 /* Try using full vectors for all element types. */
22304 VNx16QImode,
22306 /* Try using 16-bit containers for 8-bit elements and full vectors
22307 for wider elements. */
22308 VNx8QImode,
22310 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22311 full vectors for wider elements. */
22312 VNx4QImode,
22314 /* Try using 64-bit containers for all element types. */
22315 VNx2QImode
22318 static const machine_mode advsimd_modes[] = {
22319 /* Try using 128-bit vectors for all element types. */
22320 V16QImode,
22322 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22323 for wider elements. */
22324 V8QImode,
22326 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22327 for wider elements.
22329 TODO: We could support a limited form of V4QImode too, so that
22330 we use 32-bit vectors for 8-bit elements. */
22331 V4HImode,
22333 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22334 for 64-bit elements.
22336 TODO: We could similarly support limited forms of V2QImode and V2HImode
22337 for this case. */
22338 V2SImode
22341 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22342 This is because:
22344 - If we can't use N-byte Advanced SIMD vectors then the placement
22345 doesn't matter; we'll just continue as though the Advanced SIMD
22346 entry didn't exist.
22348 - If an SVE main loop with N bytes ends up being cheaper than an
22349 Advanced SIMD main loop with N bytes then by default we'll replace
22350 the Advanced SIMD version with the SVE one.
22352 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22353 than an SVE main loop with N bytes then by default we'll try to
22354 use the SVE loop to vectorize the epilogue instead. */
22356 bool only_asimd_p = aarch64_autovec_preference == 1;
22357 bool only_sve_p = aarch64_autovec_preference == 2;
22359 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
22360 unsigned int advsimd_i = 0;
22362 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
22364 if (sve_i < ARRAY_SIZE (sve_modes)
22365 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
22366 advsimd_modes[advsimd_i]))
22367 modes->safe_push (sve_modes[sve_i++]);
22368 else
22369 modes->safe_push (advsimd_modes[advsimd_i++]);
22371 while (sve_i < ARRAY_SIZE (sve_modes))
22372 modes->safe_push (sve_modes[sve_i++]);
22374 unsigned int flags = 0;
22375 if (aarch64_vect_compare_costs)
22376 flags |= VECT_COMPARE_COSTS;
22377 return flags;
22380 /* Implement TARGET_MANGLE_TYPE. */
22382 static const char *
22383 aarch64_mangle_type (const_tree type)
22385 /* The AArch64 ABI documents say that "__va_list" has to be
22386 mangled as if it is in the "std" namespace. */
22387 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
22388 return "St9__va_list";
22390 /* Half-precision floating point types. */
22391 if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
22393 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
22394 return NULL;
22395 if (TYPE_MODE (type) == BFmode)
22396 return "u6__bf16";
22397 else
22398 return "Dh";
22401 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22402 builtin types. */
22403 if (TYPE_NAME (type) != NULL)
22405 const char *res;
22406 if ((res = aarch64_general_mangle_builtin_type (type))
22407 || (res = aarch64_sve::mangle_builtin_type (type)))
22408 return res;
22411 /* Use the default mangling. */
22412 return NULL;
22415 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22417 static bool
22418 aarch64_verify_type_context (location_t loc, type_context_kind context,
22419 const_tree type, bool silent_p)
22421 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
22424 /* Find the first rtx_insn before insn that will generate an assembly
22425 instruction. */
22427 static rtx_insn *
22428 aarch64_prev_real_insn (rtx_insn *insn)
22430 if (!insn)
22431 return NULL;
22433 do
22435 insn = prev_real_insn (insn);
22437 while (insn && recog_memoized (insn) < 0);
22439 return insn;
22442 static bool
22443 is_madd_op (enum attr_type t1)
22445 unsigned int i;
22446 /* A number of these may be AArch32 only. */
22447 enum attr_type mlatypes[] = {
22448 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
22449 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
22450 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
22453 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
22455 if (t1 == mlatypes[i])
22456 return true;
22459 return false;
22462 /* Check if there is a register dependency between a load and the insn
22463 for which we hold recog_data. */
22465 static bool
22466 dep_between_memop_and_curr (rtx memop)
22468 rtx load_reg;
22469 int opno;
22471 gcc_assert (GET_CODE (memop) == SET);
22473 if (!REG_P (SET_DEST (memop)))
22474 return false;
22476 load_reg = SET_DEST (memop);
22477 for (opno = 1; opno < recog_data.n_operands; opno++)
22479 rtx operand = recog_data.operand[opno];
22480 if (REG_P (operand)
22481 && reg_overlap_mentioned_p (load_reg, operand))
22482 return true;
22485 return false;
22489 /* When working around the Cortex-A53 erratum 835769,
22490 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22491 instruction and has a preceding memory instruction such that a NOP
22492 should be inserted between them. */
22494 bool
22495 aarch64_madd_needs_nop (rtx_insn* insn)
22497 enum attr_type attr_type;
22498 rtx_insn *prev;
22499 rtx body;
22501 if (!TARGET_FIX_ERR_A53_835769)
22502 return false;
22504 if (!INSN_P (insn) || recog_memoized (insn) < 0)
22505 return false;
22507 attr_type = get_attr_type (insn);
22508 if (!is_madd_op (attr_type))
22509 return false;
22511 prev = aarch64_prev_real_insn (insn);
22512 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22513 Restore recog state to INSN to avoid state corruption. */
22514 extract_constrain_insn_cached (insn);
22516 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
22517 return false;
22519 body = single_set (prev);
22521 /* If the previous insn is a memory op and there is no dependency between
22522 it and the DImode madd, emit a NOP between them. If body is NULL then we
22523 have a complex memory operation, probably a load/store pair.
22524 Be conservative for now and emit a NOP. */
22525 if (GET_MODE (recog_data.operand[0]) == DImode
22526 && (!body || !dep_between_memop_and_curr (body)))
22527 return true;
22529 return false;
22534 /* Implement FINAL_PRESCAN_INSN. */
22536 void
22537 aarch64_final_prescan_insn (rtx_insn *insn)
22539 if (aarch64_madd_needs_nop (insn))
22540 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
22544 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22545 instruction. */
22547 bool
22548 aarch64_sve_index_immediate_p (rtx base_or_step)
22550 return (CONST_INT_P (base_or_step)
22551 && IN_RANGE (INTVAL (base_or_step), -16, 15));
22554 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22555 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22557 bool
22558 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
22560 rtx elt = unwrap_const_vec_duplicate (x);
22561 if (!CONST_INT_P (elt))
22562 return false;
22564 HOST_WIDE_INT val = INTVAL (elt);
22565 if (negate_p)
22566 val = -val;
22567 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
22569 if (val & 0xff)
22570 return IN_RANGE (val, 0, 0xff);
22571 return IN_RANGE (val, 0, 0xff00);
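/* Examples (illustrative): after masking to the element width, values
   0..255 and multiples of 256 up to 65280 are accepted, matching the
   "#imm8" and "#imm8, LSL #8" forms; 257 is rejected because it would
   need both a low and a high byte.  */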
22574 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22575 instructions when applied to mode MODE. Negate X first if NEGATE_P
22576 is true. */
22578 bool
22579 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
22581 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
22582 return false;
22584 /* After the optional negation, the immediate must be nonnegative.
22585 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22586 instead of SQADD Zn.B, Zn.B, #129. */
22587 rtx elt = unwrap_const_vec_duplicate (x);
22588 return negate_p == (INTVAL (elt) < 0);
22591 /* Return true if X is a valid immediate operand for an SVE logical
22592 instruction such as AND. */
22594 bool
22595 aarch64_sve_bitmask_immediate_p (rtx x)
22597 rtx elt;
22599 return (const_vec_duplicate_p (x, &elt)
22600 && CONST_INT_P (elt)
22601 && aarch64_bitmask_imm (INTVAL (elt),
22602 GET_MODE_INNER (GET_MODE (x))));
22605 /* Return true if X is a valid immediate for the SVE DUP and CPY
22606 instructions. */
22608 bool
22609 aarch64_sve_dup_immediate_p (rtx x)
22611 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
22612 if (!CONST_INT_P (x))
22613 return false;
22615 HOST_WIDE_INT val = INTVAL (x);
22616 if (val & 0xff)
22617 return IN_RANGE (val, -0x80, 0x7f);
22618 return IN_RANGE (val, -0x8000, 0x7f00);
22621 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22622 SIGNED_P says whether the operand is signed rather than unsigned. */
22624 bool
22625 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
22627 x = unwrap_const_vec_duplicate (x);
22628 return (CONST_INT_P (x)
22629 && (signed_p
22630 ? IN_RANGE (INTVAL (x), -16, 15)
22631 : IN_RANGE (INTVAL (x), 0, 127)));
22634 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22635 instruction. Negate X first if NEGATE_P is true. */
22637 bool
22638 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
22640 rtx elt;
22641 REAL_VALUE_TYPE r;
22643 if (!const_vec_duplicate_p (x, &elt)
22644 || !CONST_DOUBLE_P (elt))
22645 return false;
22647 r = *CONST_DOUBLE_REAL_VALUE (elt);
22649 if (negate_p)
22650 r = real_value_negate (&r);
22652 if (real_equal (&r, &dconst1))
22653 return true;
22654 if (real_equal (&r, &dconsthalf))
22655 return true;
22656 return false;
22659 /* Return true if X is a valid immediate operand for an SVE FMUL
22660 instruction. */
22662 bool
22663 aarch64_sve_float_mul_immediate_p (rtx x)
22665 rtx elt;
22667 return (const_vec_duplicate_p (x, &elt)
22668 && CONST_DOUBLE_P (elt)
22669 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
22670 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
22673 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22674 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22675 is nonnull, use it to describe valid immediates. */
22676 static bool
22677 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
22678 simd_immediate_info *info,
22679 enum simd_immediate_check which,
22680 simd_immediate_info::insn_type insn)
22682 /* Try a 4-byte immediate with LSL. */
22683 for (unsigned int shift = 0; shift < 32; shift += 8)
22684 if ((val32 & (0xff << shift)) == val32)
22686 if (info)
22687 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22688 simd_immediate_info::LSL, shift);
22689 return true;
22692 /* Try a 2-byte immediate with LSL. */
22693 unsigned int imm16 = val32 & 0xffff;
22694 if (imm16 == (val32 >> 16))
22695 for (unsigned int shift = 0; shift < 16; shift += 8)
22696 if ((imm16 & (0xff << shift)) == imm16)
22698 if (info)
22699 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
22700 simd_immediate_info::LSL, shift);
22701 return true;
22704 /* Try a 4-byte immediate with MSL, except for cases that MVN
22705 can handle. */
22706 if (which == AARCH64_CHECK_MOV)
22707 for (unsigned int shift = 8; shift < 24; shift += 8)
22709 unsigned int low = (1 << shift) - 1;
22710 if (((val32 & (0xff << shift)) | low) == val32)
22712 if (info)
22713 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22714 simd_immediate_info::MSL, shift);
22715 return true;
22719 return false;
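/* Examples (illustrative): 0x00ab0000 is matched by the 4-byte form as
   value 0xab with LSL #16; 0x0000abff is matched by the MSL form as
   value 0xab with MSL #8 (MOV checks only).  */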
22722 /* Return true if replicating VAL64 is a valid immediate for the
22723 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22724 use it to describe valid immediates. */
22725 static bool
22726 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
22727 simd_immediate_info *info,
22728 enum simd_immediate_check which)
22730 unsigned int val32 = val64 & 0xffffffff;
22731 unsigned int val16 = val64 & 0xffff;
22732 unsigned int val8 = val64 & 0xff;
22734 if (val32 == (val64 >> 32))
22736 if ((which & AARCH64_CHECK_ORR) != 0
22737 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
22738 simd_immediate_info::MOV))
22739 return true;
22741 if ((which & AARCH64_CHECK_BIC) != 0
22742 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
22743 simd_immediate_info::MVN))
22744 return true;
22746 /* Try using a replicated byte. */
22747 if (which == AARCH64_CHECK_MOV
22748 && val16 == (val32 >> 16)
22749 && val8 == (val16 >> 8))
22751 if (info)
22752 *info = simd_immediate_info (QImode, val8);
22753 return true;
22757 /* Try using a bit-to-bytemask. */
22758 if (which == AARCH64_CHECK_MOV)
22760 unsigned int i;
22761 for (i = 0; i < 64; i += 8)
22763 unsigned char byte = (val64 >> i) & 0xff;
22764 if (byte != 0 && byte != 0xff)
22765 break;
22767 if (i == 64)
22769 if (info)
22770 *info = simd_immediate_info (DImode, val64);
22771 return true;
22774 return false;
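/* Example (illustrative): 0xffffffff00000000 fails the 32-bit replication
   test, but every byte is either 0x00 or 0xff, so it is accepted by the
   bit-to-bytemask case when WHICH is AARCH64_CHECK_MOV.  */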
22777 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22778 instruction. If INFO is nonnull, use it to describe valid immediates. */
22780 static bool
22781 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
22782 simd_immediate_info *info)
22784 scalar_int_mode mode = DImode;
22785 unsigned int val32 = val64 & 0xffffffff;
22786 if (val32 == (val64 >> 32))
22788 mode = SImode;
22789 unsigned int val16 = val32 & 0xffff;
22790 if (val16 == (val32 >> 16))
22792 mode = HImode;
22793 unsigned int val8 = val16 & 0xff;
22794 if (val8 == (val16 >> 8))
22795 mode = QImode;
22798 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
22799 if (IN_RANGE (val, -0x80, 0x7f))
22801 /* DUP with no shift. */
22802 if (info)
22803 *info = simd_immediate_info (mode, val);
22804 return true;
22806 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
22808 /* DUP with LSL #8. */
22809 if (info)
22810 *info = simd_immediate_info (mode, val);
22811 return true;
22813 if (aarch64_bitmask_imm (val64, mode))
22815 /* DUPM. */
22816 if (info)
22817 *info = simd_immediate_info (mode, val);
22818 return true;
22820 return false;
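/* Examples (illustrative): a VNx8HI constant with every element equal to
   0x1f00 is matched as DUP with LSL #8, while one with every element
   equal to 0x00ff falls through to the DUPM (bitmask) case.  */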
22823 /* Return true if X is an UNSPEC_PTRUE constant of the form:
22825 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
22827 where PATTERN is the svpattern as a CONST_INT and where ZERO
22828 is a zero constant of the required PTRUE mode (which can have
22829 fewer elements than X's mode, if zero bits are significant).
22831 If so, and if INFO is nonnull, describe the immediate in INFO. */
22832 bool
22833 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
22835 if (GET_CODE (x) != CONST)
22836 return false;
22838 x = XEXP (x, 0);
22839 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
22840 return false;
22842 if (info)
22844 aarch64_svpattern pattern
22845 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
22846 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
22847 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
22848 *info = simd_immediate_info (int_mode, pattern);
22850 return true;
22853 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
22854 it to describe valid immediates. */
22856 static bool
22857 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
22859 if (aarch64_sve_ptrue_svpattern_p (x, info))
22860 return true;
22862 if (x == CONST0_RTX (GET_MODE (x)))
22864 if (info)
22865 *info = simd_immediate_info (DImode, 0);
22866 return true;
22869 /* Analyze the value as a VNx16BImode. This should be relatively
22870 efficient, since rtx_vector_builder has enough built-in capacity
22871 to store all VLA predicate constants without needing the heap. */
22872 rtx_vector_builder builder;
22873 if (!aarch64_get_sve_pred_bits (builder, x))
22874 return false;
22876 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
22877 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
22879 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
22880 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
22881 if (pattern != AARCH64_NUM_SVPATTERNS)
22883 if (info)
22885 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
22886 *info = simd_immediate_info (int_mode, pattern);
22888 return true;
22891 return false;
22894 /* Return true if OP is a valid SIMD immediate for the operation
22895 described by WHICH. If INFO is nonnull, use it to describe valid
22896 immediates. */
22897 bool
22898 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
22899 enum simd_immediate_check which)
22901 machine_mode mode = GET_MODE (op);
22902 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22903 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
22904 return false;
22906 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
22907 return false;
22909 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
22910 return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
22912 if (vec_flags & VEC_SVE_PRED)
22913 return aarch64_sve_pred_valid_immediate (op, info);
22915 scalar_mode elt_mode = GET_MODE_INNER (mode);
22916 rtx base, step;
22917 unsigned int n_elts;
22918 if (CONST_VECTOR_P (op)
22919 && CONST_VECTOR_DUPLICATE_P (op))
22920 n_elts = CONST_VECTOR_NPATTERNS (op);
22921 else if ((vec_flags & VEC_SVE_DATA)
22922 && const_vec_series_p (op, &base, &step))
22924 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
22925 if (!aarch64_sve_index_immediate_p (base)
22926 || !aarch64_sve_index_immediate_p (step))
22927 return false;
22929 if (info)
22931 /* Get the corresponding container mode. E.g. an INDEX on V2SI
22932 should yield two integer values per 128-bit block, meaning
22933 that we need to treat it in the same way as V2DI and then
22934 ignore the upper 32 bits of each element. */
22935 elt_mode = aarch64_sve_container_int_mode (mode);
22936 *info = simd_immediate_info (elt_mode, base, step);
22938 return true;
22940 else if (CONST_VECTOR_P (op)
22941 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
22942 /* N_ELTS set above. */;
22943 else
22944 return false;
22946 scalar_float_mode elt_float_mode;
22947 if (n_elts == 1
22948 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
22950 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
22951 if (aarch64_float_const_zero_rtx_p (elt)
22952 || aarch64_float_const_representable_p (elt))
22954 if (info)
22955 *info = simd_immediate_info (elt_float_mode, elt);
22956 return true;
22960 /* If all elements in an SVE vector have the same value, we have a free
22961 choice between using the element mode and using the container mode.
22962 Using the element mode means that unused parts of the vector are
22963 duplicates of the used elements, while using the container mode means
22964 that the unused parts are an extension of the used elements. Using the
22965 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
22966 for its container mode VNx4SI while 0x00000101 isn't.
22968 If not all elements in an SVE vector have the same value, we need the
22969 transition from one element to the next to occur at container boundaries.
22970 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
22971 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
22972 scalar_int_mode elt_int_mode;
22973 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
22974 elt_int_mode = aarch64_sve_container_int_mode (mode);
22975 else
22976 elt_int_mode = int_mode_for_mode (elt_mode).require ();
22978 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
22979 if (elt_size > 8)
22980 return false;
22982 /* Expand the vector constant out into a byte vector, with the least
22983 significant byte of the register first. */
22984 auto_vec<unsigned char, 16> bytes;
22985 bytes.reserve (n_elts * elt_size);
22986 for (unsigned int i = 0; i < n_elts; i++)
22988 /* The vector is provided in gcc endian-neutral fashion.
22989 For aarch64_be Advanced SIMD, it must be laid out in the vector
22990 register in reverse order. */
22991 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
22992 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
22994 if (elt_mode != elt_int_mode)
22995 elt = gen_lowpart (elt_int_mode, elt);
22997 if (!CONST_INT_P (elt))
22998 return false;
23000 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
23001 for (unsigned int byte = 0; byte < elt_size; byte++)
23003 bytes.quick_push (elt_val & 0xff);
23004 elt_val >>= BITS_PER_UNIT;
23008 /* The immediate must repeat every eight bytes. */
23009 unsigned int nbytes = bytes.length ();
23010 for (unsigned i = 8; i < nbytes; ++i)
23011 if (bytes[i] != bytes[i - 8])
23012 return false;
23014 /* Get the repeating 8-byte value as an integer. No endian correction
23015 is needed here because bytes is already in lsb-first order. */
23016 unsigned HOST_WIDE_INT val64 = 0;
23017 for (unsigned int i = 0; i < 8; i++)
23018 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
23019 << (i * BITS_PER_UNIT));
23021 if (vec_flags & VEC_SVE_DATA)
23022 return aarch64_sve_valid_immediate (val64, info);
23023 else
23024 return aarch64_advsimd_valid_immediate (val64, info, which);
23027 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
23028 has a step in the range of INDEX. Return the index expression if so,
23029 otherwise return null. */
23030 rtx
23031 aarch64_check_zero_based_sve_index_immediate (rtx x)
23033 rtx base, step;
23034 if (const_vec_series_p (x, &base, &step)
23035 && base == const0_rtx
23036 && aarch64_sve_index_immediate_p (step))
23037 return step;
23038 return NULL_RTX;
23041 /* Check if immediate shift constants are within range. */
23042 bool
23043 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
23045 x = unwrap_const_vec_duplicate (x);
23046 if (!CONST_INT_P (x))
23047 return false;
23048 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
23049 if (left)
23050 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
23051 else
23052 return IN_RANGE (INTVAL (x), 1, bit_width);
23055 /* Return the bitmask CONST_INT to select the bits required by a zero extract
23056 operation of width WIDTH at bit position POS. */
23058 rtx
23059 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
23061 gcc_assert (CONST_INT_P (width));
23062 gcc_assert (CONST_INT_P (pos));
23064 unsigned HOST_WIDE_INT mask
23065 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
23066 return GEN_INT (mask << UINTVAL (pos));
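/* Example (illustrative): WIDTH == 8 and POS == 16 give a mask of
   ((1 << 8) - 1) << 16 == 0xff0000.  */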
23069 bool
23070 aarch64_mov_operand_p (rtx x, machine_mode mode)
23072 if (GET_CODE (x) == HIGH
23073 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
23074 return true;
23076 if (CONST_INT_P (x))
23077 return true;
23079 if (VECTOR_MODE_P (GET_MODE (x)))
23081 /* Require predicate constants to be VNx16BI before RA, so that we
23082 force everything to have a canonical form. */
23083 if (!lra_in_progress
23084 && !reload_completed
23085 && aarch64_sve_pred_mode_p (GET_MODE (x))
23086 && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
23087 && GET_MODE (x) != VNx16BImode)
23088 return false;
23090 return aarch64_simd_valid_immediate (x, NULL);
23093 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
23094 x = strip_salt (x);
23096 /* GOT accesses are valid moves. */
23097 if (SYMBOL_REF_P (x)
23098 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
23099 return true;
23101 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
23102 return true;
23104 if (TARGET_SVE
23105 && (aarch64_sve_cnt_immediate_p (x)
23106 || aarch64_sve_rdvl_immediate_p (x)))
23107 return true;
23109 if (aarch64_rdsvl_immediate_p (x))
23110 return true;
23112 return aarch64_classify_symbolic_expression (x)
23113 == SYMBOL_TINY_ABSOLUTE;
23116 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
23117 caches instructions that set up such registers, so that they can be
23118 reused by future calls. */
23120 static rtx
23121 aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
23123 rtx_insn *insn = *cached_insn;
23124 if (insn && INSN_P (insn) && !insn->deleted ())
23126 rtx pat = PATTERN (insn);
23127 if (GET_CODE (pat) == SET)
23129 rtx dest = SET_DEST (pat);
23130 if (REG_P (dest)
23131 && !HARD_REGISTER_P (dest)
23132 && rtx_equal_p (SET_SRC (pat), value))
23133 return dest;
23136 rtx reg = gen_reg_rtx (GET_MODE (value));
23137 *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
23138 function_beg_insn);
23139 return reg;
23142 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
23143 the constant creation. */
23146 aarch64_gen_shareable_zero (machine_mode mode)
23148 rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
23149 CONST0_RTX (V4SImode));
23150 return lowpart_subreg (mode, reg, GET_MODE (reg));
23153 /* INSN is some form of extension or shift that can be split into a
23154 permutation involving a shared zero. Return true if we should
23155 perform such a split.
23157 ??? For now, make sure that the split instruction executes more
23158 frequently than the zero that feeds it. In future it would be good
23159 to split without that restriction and instead recombine shared zeros
23160 if they turn out not to be worthwhile. This would allow splits in
23161 single-block functions and would also cope more naturally with
23162 rematerialization. */
23164 bool
23165 aarch64_split_simd_shift_p (rtx_insn *insn)
23167 return (can_create_pseudo_p ()
23168 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
23169 && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
23170 < BLOCK_FOR_INSN (insn)->count));
23173 /* Return a const_int vector of VAL. */
23175 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
23177 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
23178 return gen_const_vec_duplicate (mode, c);
23181 /* Check OP is a legal scalar immediate for the MOVI instruction. */
23183 bool
23184 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
23186 machine_mode vmode;
23188 vmode = aarch64_simd_container_mode (mode, 64);
23189 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
23190 return aarch64_simd_valid_immediate (op_v, NULL);
23193 /* Construct and return a PARALLEL RTX vector with elements numbering the
23194 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
23195 the vector - from the perspective of the architecture. This does not
23196 line up with GCC's perspective on lane numbers, so we end up with
23197 different masks depending on our target endian-ness. The diagram
23198 below may help. We must draw the distinction when building masks
23199 which select one half of the vector. An instruction selecting
23200 architectural low-lanes for a big-endian target must be described using
23201 a mask selecting GCC high-lanes.
23203 Big-Endian Little-Endian
23205 GCC 0 1 2 3 3 2 1 0
23206 | x | x | x | x | | x | x | x | x |
23207 Architecture 3 2 1 0 3 2 1 0
23209 Low Mask: { 2, 3 } { 0, 1 }
23210 High Mask: { 0, 1 } { 2, 3 }
23212 MODE Is the mode of the vector and NUNITS is the number of units in it. */
23215 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
23217 rtvec v = rtvec_alloc (nunits / 2);
23218 int high_base = nunits / 2;
23219 int low_base = 0;
23220 int base;
23221 rtx t1;
23222 int i;
23224 if (BYTES_BIG_ENDIAN)
23225 base = high ? low_base : high_base;
23226 else
23227 base = high ? high_base : low_base;
23229 for (i = 0; i < nunits / 2; i++)
23230 RTVEC_ELT (v, i) = GEN_INT (base + i);
23232 t1 = gen_rtx_PARALLEL (mode, v);
23233 return t1;
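/* Extending the diagram above to an 8-element vector such as V8HImode:
   on little-endian, HIGH selects { 4, 5, 6, 7 } and !HIGH selects
   { 0, 1, 2, 3 }; on big-endian the two masks are swapped, because GCC
   lane numbers run in the opposite direction to the architectural ones.  */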
23236 /* Check OP for validity as a PARALLEL RTX vector with elements
23237 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
23238 from the perspective of the architecture. See the diagram above
23239 aarch64_simd_vect_par_cnst_half for more details. */
23241 bool
23242 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
23243 bool high)
23245 int nelts;
23246 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
23247 return false;
23249 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
23250 HOST_WIDE_INT count_op = XVECLEN (op, 0);
23251 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
23252 int i = 0;
23254 if (count_op != count_ideal)
23255 return false;
23257 for (i = 0; i < count_ideal; i++)
23259 rtx elt_op = XVECEXP (op, 0, i);
23260 rtx elt_ideal = XVECEXP (ideal, 0, i);
23262 if (!CONST_INT_P (elt_op)
23263 || INTVAL (elt_ideal) != INTVAL (elt_op))
23264 return false;
23266 return true;
23269 /* Return a PARALLEL containing NELTS elements, with element I equal
23270 to BASE + I * STEP. */
23273 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
23275 rtvec vec = rtvec_alloc (nelts);
23276 for (unsigned int i = 0; i < nelts; ++i)
23277 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
23278 return gen_rtx_PARALLEL (VOIDmode, vec);
23281 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23282 series with step STEP. */
23284 bool
23285 aarch64_stepped_int_parallel_p (rtx op, int step)
23287 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
23288 return false;
23290 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
23291 for (int i = 1; i < XVECLEN (op, 0); ++i)
23292 if (!CONST_INT_P (XVECEXP (op, 0, i))
23293 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
23294 return false;
23296 return true;
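/* For example, a PARALLEL of (const_int 3) (const_int 5) (const_int 7)
   satisfies aarch64_stepped_int_parallel_p with STEP == 2, since every
   element equals the first element plus its index times the step.  */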
23299 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23300 sequence of strided registers, with the stride being equal to STRIDE.
23301 The operands are already known to be FPRs. */
23302 bool
23303 aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
23304 unsigned int stride)
23306 for (unsigned int i = 1; i < num_operands; ++i)
23307 if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
23308 return false;
23309 return true;
23312 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23313 HIGH (exclusive). */
23314 void
23315 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
23316 const_tree exp)
23318 HOST_WIDE_INT lane;
23319 gcc_assert (CONST_INT_P (operand));
23320 lane = INTVAL (operand);
23322 if (lane < low || lane >= high)
23324 if (exp)
23325 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
23326 lane, low, high - 1);
23327 else
23328 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
23332 /* Perform endian correction on lane number N, which indexes a vector
23333 of mode MODE, and return the result as an SImode rtx. */
23336 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
23338 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
23341 /* Return TRUE if OP is a valid vector addressing mode. */
23343 bool
23344 aarch64_simd_mem_operand_p (rtx op)
23346 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
23347 || REG_P (XEXP (op, 0)));
23350 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23352 bool
23353 aarch64_sve_ld1r_operand_p (rtx op)
23355 struct aarch64_address_info addr;
23356 scalar_mode mode;
23358 return (MEM_P (op)
23359 && is_a <scalar_mode> (GET_MODE (op), &mode)
23360 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
23361 && addr.type == ADDRESS_REG_IMM
23362 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
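/* In other words, the address must be a base register plus an immediate
   that is a multiple of the element size, with the multiple in [0, 63];
   e.g. for 4-byte elements the offsets 0, 4, ..., 252 are accepted
   (an illustrative reading of offset_6bit_unsigned_scaled_p).  */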
23365 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23366 where the size of the read data is specified by `mode` and the size of the
23367 vector elements is specified by `elem_mode`.
23368 bool
23369 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
23370 scalar_mode elem_mode)
23372 struct aarch64_address_info addr;
23373 if (!MEM_P (op)
23374 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
23375 return false;
23377 if (addr.type == ADDRESS_REG_IMM)
23378 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
23380 if (addr.type == ADDRESS_REG_REG)
23381 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
23383 return false;
23386 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23387 bool
23388 aarch64_sve_ld1rq_operand_p (rtx op)
23390 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
23391 GET_MODE_INNER (GET_MODE (op)));
23394 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23395 accessing a vector where the element size is specified by `elem_mode`. */
23396 bool
23397 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
23399 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
23402 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23403 bool
23404 aarch64_sve_ldff1_operand_p (rtx op)
23406 if (!MEM_P (op))
23407 return false;
23409 struct aarch64_address_info addr;
23410 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
23411 return false;
23413 if (addr.type == ADDRESS_REG_IMM)
23414 return known_eq (addr.const_offset, 0);
23416 return addr.type == ADDRESS_REG_REG;
23419 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23420 bool
23421 aarch64_sve_ldnf1_operand_p (rtx op)
23423 struct aarch64_address_info addr;
23425 return (MEM_P (op)
23426 && aarch64_classify_address (&addr, XEXP (op, 0),
23427 GET_MODE (op), false)
23428 && addr.type == ADDRESS_REG_IMM);
23431 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23432 The conditions for STR are the same. */
23433 bool
23434 aarch64_sve_ldr_operand_p (rtx op)
23436 struct aarch64_address_info addr;
23438 return (MEM_P (op)
23439 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
23440 false, ADDR_QUERY_ANY)
23441 && addr.type == ADDRESS_REG_IMM);
23444 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23445 addressing memory of mode MODE. */
23446 bool
23447 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
23449 struct aarch64_address_info addr;
23450 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
23451 return false;
23453 if (addr.type == ADDRESS_REG_IMM)
23454 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
23456 return addr.type == ADDRESS_REG_REG;
23459 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23460 We need to be able to access the individual pieces, so the range
23461 is different from LD[234] and ST[234]. */
23462 bool
23463 aarch64_sve_struct_memory_operand_p (rtx op)
23465 if (!MEM_P (op))
23466 return false;
23468 machine_mode mode = GET_MODE (op);
23469 struct aarch64_address_info addr;
23470 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
23471 ADDR_QUERY_ANY)
23472 || addr.type != ADDRESS_REG_IMM)
23473 return false;
23475 poly_int64 first = addr.const_offset;
23476 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
23477 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
23478 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
23481 /* Return true if OFFSET is a constant integer and if VNUM is
23482 OFFSET * the number of bytes in an SVE vector. This is the requirement
23483 that exists in SME LDR and STR instructions, where the VL offset must
23484 equal the ZA slice offset. */
23485 bool
23486 aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
23488 if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
23489 return false;
23491 if (TARGET_STREAMING)
23493 poly_int64 const_vnum;
23494 return (poly_int_rtx_p (vnum, &const_vnum)
23495 && known_eq (const_vnum,
23496 INTVAL (offset) * BYTES_PER_SVE_VECTOR));
23498 else
23500 HOST_WIDE_INT factor;
23501 return (aarch64_sme_vq_unspec_p (vnum, &factor)
23502 && factor == INTVAL (offset) * 16);
23506 /* Emit a register copy from operand to operand, taking care not to
23507 early-clobber source registers in the process.
23509 COUNT is the number of components into which the copy needs to be
23510 decomposed. */
23511 void
23512 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
23513 unsigned int count)
23515 unsigned int i;
23516 int rdest = REGNO (operands[0]);
23517 int rsrc = REGNO (operands[1]);
23519 if (!reg_overlap_mentioned_p (operands[0], operands[1])
23520 || rdest < rsrc)
23521 for (i = 0; i < count; i++)
23522 emit_move_insn (gen_rtx_REG (mode, rdest + i),
23523 gen_rtx_REG (mode, rsrc + i));
23524 else
23525 for (i = 0; i < count; i++)
23526 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
23527 gen_rtx_REG (mode, rsrc + count - i - 1));
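/* For example, copying a two-register tuple from { V0, V1 } to { V1, V2 }
   overlaps with RDEST > RSRC, so the loop runs backwards: V2 is copied
   from V1 before V1 is overwritten with V0.  Copying forwards would
   clobber V1 before it had been read.  */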
23530 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23531 one of the VSTRUCT modes: OI, CI, or XI. */
23533 aarch64_simd_attr_length_rglist (machine_mode mode)
23535 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23536 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
23539 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23540 alignment of a vector to 128 bits. SVE predicates have an alignment of
23541 16 bits. */
23542 static HOST_WIDE_INT
23543 aarch64_simd_vector_alignment (const_tree type)
23545 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23546 be set for non-predicate vectors of booleans. Modes are the most
23547 direct way we have of identifying real SVE predicate types. */
23548 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
23549 return 16;
23550 widest_int min_size
23551 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
23552 return wi::umin (min_size, 128).to_uhwi ();
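/* So, for instance, a 256-bit generic vector type is given the AAPCS64
   maximum alignment of 128 bits, a 64-bit Advanced SIMD vector keeps its
   natural 64-bit alignment, and SVE predicate types are aligned to
   16 bits.  */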
23555 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23556 static poly_uint64
23557 aarch64_vectorize_preferred_vector_alignment (const_tree type)
23559 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
23561 /* If the length of the vector is a fixed power of 2, try to align
23562 to that length, otherwise don't try to align at all. */
23563 HOST_WIDE_INT result;
23564 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
23565 || !pow2p_hwi (result))
23566 result = TYPE_ALIGN (TREE_TYPE (type));
23567 return result;
23569 return TYPE_ALIGN (type);
23572 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23573 static bool
23574 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
23576 if (is_packed)
23577 return false;
23579 /* For fixed-length vectors, check that the vectorizer will aim for
23580 full-vector alignment. This isn't true for generic GCC vectors
23581 that are wider than the ABI maximum of 128 bits. */
23582 poly_uint64 preferred_alignment =
23583 aarch64_vectorize_preferred_vector_alignment (type);
23584 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23585 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
23586 preferred_alignment))
23587 return false;
23589 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23590 return true;
23593 /* Return true if the vector misalignment factor is supported by the
23594 target. */
23595 static bool
23596 aarch64_builtin_support_vector_misalignment (machine_mode mode,
23597 const_tree type, int misalignment,
23598 bool is_packed)
23600 if (TARGET_SIMD && STRICT_ALIGNMENT)
23602 /* Return if movmisalign pattern is not supported for this mode. */
23603 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
23604 return false;
23606 /* Misalignment factor is unknown at compile time. */
23607 if (misalignment == -1)
23608 return false;
23610 return default_builtin_support_vector_misalignment (mode, type, misalignment,
23611 is_packed);
23614 /* If VALS is a vector constant that can be loaded into a register
23615 using DUP, generate instructions to do so and return an RTX to
23616 assign to the register. Otherwise return NULL_RTX. */
23617 static rtx
23618 aarch64_simd_dup_constant (rtx vals)
23620 machine_mode mode = GET_MODE (vals);
23621 machine_mode inner_mode = GET_MODE_INNER (mode);
23622 rtx x;
23624 if (!const_vec_duplicate_p (vals, &x))
23625 return NULL_RTX;
23627 /* We can load this constant by using DUP and a constant in a
23628 single ARM register. This will be cheaper than a vector
23629 load. */
23630 x = force_reg (inner_mode, x);
23631 return gen_vec_duplicate (mode, x);
23635 /* Generate code to load VALS, which is a PARALLEL containing only
23636 constants (for vec_init) or CONST_VECTOR, efficiently into a
23637 register. Returns an RTX to copy into the register, or NULL_RTX
23638 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23639 static rtx
23640 aarch64_simd_make_constant (rtx vals)
23642 machine_mode mode = GET_MODE (vals);
23643 rtx const_dup;
23644 rtx const_vec = NULL_RTX;
23645 int n_const = 0;
23646 int i;
23648 if (CONST_VECTOR_P (vals))
23649 const_vec = vals;
23650 else if (GET_CODE (vals) == PARALLEL)
23652 /* A CONST_VECTOR must contain only CONST_INTs and
23653 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
23654 Only store valid constants in a CONST_VECTOR. */
23655 int n_elts = XVECLEN (vals, 0);
23656 for (i = 0; i < n_elts; ++i)
23658 rtx x = XVECEXP (vals, 0, i);
23659 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23660 n_const++;
23662 if (n_const == n_elts)
23663 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
23665 else
23666 gcc_unreachable ();
23668 if (const_vec != NULL_RTX
23669 && aarch64_simd_valid_immediate (const_vec, NULL))
23670 /* Load using MOVI/MVNI. */
23671 return const_vec;
23672 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
23673 /* Loaded using DUP. */
23674 return const_dup;
23675 else if (const_vec != NULL_RTX)
23676 /* Load from constant pool. We cannot take advantage of single-cycle
23677 LD1 because we need a PC-relative addressing mode. */
23678 return const_vec;
23679 else
23680 /* A PARALLEL containing something not valid inside CONST_VECTOR.
23681 We cannot construct an initializer. */
23682 return NULL_RTX;
23685 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23686 The caller has already tried a divide-and-conquer approach, so do
23687 not consider that case here. */
23689 void
23690 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
23692 machine_mode mode = GET_MODE (target);
23693 scalar_mode inner_mode = GET_MODE_INNER (mode);
23694 /* The number of vector elements. */
23695 int n_elts = XVECLEN (vals, 0);
23696 /* The number of vector elements which are not constant. */
23697 int n_var = 0;
23698 rtx any_const = NULL_RTX;
23699 /* The first element of vals. */
23700 rtx v0 = XVECEXP (vals, 0, 0);
23701 bool all_same = true;
23703 /* This is a special vec_init<M><N> where N is not an element mode but a
23704 vector mode with half the elements of M. We expect to find two entries
23705 of mode N in VALS and we must put their concatenation into TARGET. */
23706 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
23708 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
23709 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
23710 && known_eq (GET_MODE_SIZE (mode),
23711 2 * GET_MODE_SIZE (narrow_mode)));
23712 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
23713 XVECEXP (vals, 0, 0),
23714 XVECEXP (vals, 0, 1)));
23715 return;
23718 /* Count the number of variable elements to initialise. */
23719 for (int i = 0; i < n_elts; ++i)
23721 rtx x = XVECEXP (vals, 0, i);
23722 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
23723 ++n_var;
23724 else
23725 any_const = x;
23727 all_same &= rtx_equal_p (x, v0);
23730 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23731 how best to handle this. */
23732 if (n_var == 0)
23734 rtx constant = aarch64_simd_make_constant (vals);
23735 if (constant != NULL_RTX)
23737 emit_move_insn (target, constant);
23738 return;
23742 /* Splat a single non-constant element if we can. */
23743 if (all_same)
23745 rtx x = force_reg (inner_mode, v0);
23746 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23747 return;
23750 enum insn_code icode = optab_handler (vec_set_optab, mode);
23751 gcc_assert (icode != CODE_FOR_nothing);
23753 /* If there are only variable elements, try to optimize
23754 the insertion using dup for the most common element
23755 followed by insertions. */
23757 /* The algorithm will fill matches[*][0] with the earliest matching element,
23758 and matches[X][1] with the count of duplicate elements (if X is the
23759 earliest element which has duplicates). */
23761 if (n_var >= n_elts - 1 && n_elts <= 16)
23763 int matches[16][2] = {0};
23764 for (int i = 0; i < n_elts; i++)
23766 for (int j = 0; j <= i; j++)
23768 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
23770 matches[i][0] = j;
23771 matches[j][1]++;
23772 break;
23776 int maxelement = 0;
23777 int maxv = 0;
23778 rtx const_elem = NULL_RTX;
23779 int const_elem_pos = 0;
23781 for (int i = 0; i < n_elts; i++)
23783 if (matches[i][1] > maxv)
23785 maxelement = i;
23786 maxv = matches[i][1];
23788 if (CONST_INT_P (XVECEXP (vals, 0, i))
23789 || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
23791 const_elem_pos = i;
23792 const_elem = XVECEXP (vals, 0, i);
23796 /* Create a duplicate of the most common element, unless all elements
23797 are equally useless to us, in which case just immediately set the
23798 vector register using the first element. */
23800 if (maxv == 1)
23802 /* For vectors of two 64-bit elements, we can do even better. */
23803 if (n_elts == 2
23804 && (inner_mode == E_DImode
23805 || inner_mode == E_DFmode))
23808 rtx x0 = XVECEXP (vals, 0, 0);
23809 rtx x1 = XVECEXP (vals, 0, 1);
23810 /* Combine can pick up this case, but handling it directly
23811 here leaves clearer RTL.
23813 This is load_pair_lanes<mode>, and also gives us a clean-up
23814 for store_pair_lanes<mode>. */
23815 if (memory_operand (x0, inner_mode)
23816 && memory_operand (x1, inner_mode)
23817 && aarch64_mergeable_load_pair_p (mode, x0, x1))
23819 rtx t;
23820 if (inner_mode == DFmode)
23821 t = gen_load_pair_lanesdf (target, x0, x1);
23822 else
23823 t = gen_load_pair_lanesdi (target, x0, x1);
23824 emit_insn (t);
23825 return;
23828 /* The subreg-move sequence below will move into lane zero of the
23829 vector register. For big-endian we want that position to hold
23830 the last element of VALS. */
23831 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
23833 /* If we have a single constant element, use that for duplicating
23834 instead. */
23835 if (const_elem)
23837 maxelement = const_elem_pos;
23838 aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
23840 else
23842 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23843 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
23846 else
23848 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23849 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23852 /* Insert the rest. */
23853 for (int i = 0; i < n_elts; i++)
23855 rtx x = XVECEXP (vals, 0, i);
23856 if (matches[i][0] == maxelement)
23857 continue;
23858 x = force_reg (inner_mode, x);
23859 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23861 return;
23864 /* Initialise a vector which is part-variable. We want to first try
23865 to build those lanes which are constant in the most efficient way we
23866 can. */
23867 if (n_var != n_elts)
23869 rtx copy = copy_rtx (vals);
23871 /* Load constant part of vector. We really don't care what goes into the
23872 parts we will overwrite, but we're more likely to be able to load the
23873 constant efficiently if it has fewer, larger, repeating parts
23874 (see aarch64_simd_valid_immediate). */
23875 for (int i = 0; i < n_elts; i++)
23877 rtx x = XVECEXP (vals, 0, i);
23878 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23879 continue;
23880 rtx subst = any_const;
23881 for (int bit = n_elts / 2; bit > 0; bit /= 2)
23883 /* Look in the copied vector, as more elements are const. */
23884 rtx test = XVECEXP (copy, 0, i ^ bit);
23885 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
23887 subst = test;
23888 break;
23891 XVECEXP (copy, 0, i) = subst;
23893 aarch64_expand_vector_init_fallback (target, copy);
23896 /* Insert the variable lanes directly. */
23897 for (int i = 0; i < n_elts; i++)
23899 rtx x = XVECEXP (vals, 0, i);
23900 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23901 continue;
23902 x = force_reg (inner_mode, x);
23903 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
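/* A short example of the duplicate-and-insert path above (illustrative):
   for a V4SImode initializer { x, y, x, x } with X and Y in registers,
   the matches array records three copies of the element at index 0, so
   the code emits a DUP of X into the vector register followed by a
   single vec_set insertion of Y into lane 1.  */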
23907 /* Return the even or odd half of VALS, depending on EVEN_P. */
23909 static rtx
23910 aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
23912 int n = XVECLEN (vals, 0);
23913 machine_mode new_mode
23914 = aarch64_simd_container_mode (GET_MODE_INNER (mode),
23915 GET_MODE_BITSIZE (mode).to_constant () / 2);
23916 rtvec vec = rtvec_alloc (n / 2);
23917 for (int i = 0; i < n / 2; i++)
23918 RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
23919 : XVECEXP (vals, 0, 2 * i + 1);
23920 return gen_rtx_PARALLEL (new_mode, vec);
23923 /* Return true if SET is a scalar move. */
23925 static bool
23926 scalar_move_insn_p (rtx set)
23928 rtx src = SET_SRC (set);
23929 rtx dest = SET_DEST (set);
23930 return (is_a<scalar_mode> (GET_MODE (dest))
23931 && aarch64_mov_operand (src, GET_MODE (dest)));
23934 /* Similar to seq_cost, but ignore cost for scalar moves. */
23936 static unsigned
23937 seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed)
23939 unsigned cost = 0;
23941 for (; seq; seq = NEXT_INSN (seq))
23942 if (NONDEBUG_INSN_P (seq))
23944 if (rtx set = single_set (seq))
23946 if (!scalar_move_insn_p (set))
23947 cost += set_rtx_cost (set, speed);
23949 else
23951 int this_cost = insn_cost (CONST_CAST_RTX_INSN (seq), speed);
23952 if (this_cost > 0)
23953 cost += this_cost;
23954 else
23955 cost++;
23959 return cost;
23962 /* Expand a vector initialization sequence, such that TARGET is
23963 initialized to contain VALS. */
23965 void
23966 aarch64_expand_vector_init (rtx target, rtx vals)
23968 /* Try decomposing the initializer into even and odd halves and
23969 then ZIP them together. Use the resulting sequence if it is
23970 strictly cheaper than loading VALS directly.
23972 Prefer the fallback sequence in the event of a tie, since it
23973 will tend to use fewer registers. */
23975 machine_mode mode = GET_MODE (target);
23976 int n_elts = XVECLEN (vals, 0);
23978 if (n_elts < 4
23979 || maybe_ne (GET_MODE_BITSIZE (mode), 128))
23981 aarch64_expand_vector_init_fallback (target, vals);
23982 return;
23985 start_sequence ();
23986 rtx halves[2];
23987 unsigned costs[2];
23988 for (int i = 0; i < 2; i++)
23990 start_sequence ();
23991 rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
23992 rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
23993 aarch64_expand_vector_init (tmp_reg, new_vals);
23994 halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
23995 rtx_insn *rec_seq = get_insns ();
23996 end_sequence ();
23997 costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size);
23998 emit_insn (rec_seq);
24001 rtvec v = gen_rtvec (2, halves[0], halves[1]);
24002 rtx_insn *zip1_insn
24003 = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24004 unsigned seq_total_cost
24005 = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
24006 seq_total_cost += insn_cost (zip1_insn, !optimize_size);
24008 rtx_insn *seq = get_insns ();
24009 end_sequence ();
24011 start_sequence ();
24012 aarch64_expand_vector_init_fallback (target, vals);
24013 rtx_insn *fallback_seq = get_insns ();
24014 unsigned fallback_seq_cost
24015 = seq_cost_ignoring_scalar_moves (fallback_seq, !optimize_size);
24016 end_sequence ();
24018 emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
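/* As an illustration, a V4SImode initializer { a, 1, b, 2 } is split
   into an even half { a, b } and an odd half { 1, 2 }.  Each half is
   built as a V2SImode value, viewed as V4SImode through a subreg, and
   the two halves are interleaved with ZIP1.  That sequence is used only
   if its cost (ignoring scalar moves) is strictly lower than the cost
   of the fallback expansion.  */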
24021 /* Emit RTL corresponding to:
24022 insr TARGET, ELEM. */
24024 static void
24025 emit_insr (rtx target, rtx elem)
24027 machine_mode mode = GET_MODE (target);
24028 scalar_mode elem_mode = GET_MODE_INNER (mode);
24029 elem = force_reg (elem_mode, elem);
24031 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
24032 gcc_assert (icode != CODE_FOR_nothing);
24033 emit_insn (GEN_FCN (icode) (target, target, elem));
24036 /* Subroutine of aarch64_sve_expand_vector_init for handling
24037 trailing constants.
24038 This function works as follows:
24039 (a) Create a new vector consisting of trailing constants.
24040 (b) Initialize TARGET with the constant vector using emit_move_insn.
24041 (c) Insert remaining elements in TARGET using insr.
24042 NELTS is the total number of elements in the original vector, while
24043 NELTS_REQD is the number of elements that are actually
24044 significant.
24046 ??? The heuristic used is to do the above only if the number of constants
24047 is at least half the total number of elements. May need fine tuning. */
24049 static bool
24050 aarch64_sve_expand_vector_init_handle_trailing_constants
24051 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
24053 machine_mode mode = GET_MODE (target);
24054 scalar_mode elem_mode = GET_MODE_INNER (mode);
24055 int n_trailing_constants = 0;
24057 for (int i = nelts_reqd - 1;
24058 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
24059 i--)
24060 n_trailing_constants++;
24062 if (n_trailing_constants >= nelts_reqd / 2)
24064 /* Try to use the natural pattern of BUILDER to extend the trailing
24065 constant elements to a full vector. Replace any variables in the
24066 extra elements with zeros.
24068 ??? It would be better if the builders supported "don't care"
24069 elements, with the builder filling in whichever elements
24070 give the most compact encoding. */
24071 rtx_vector_builder v (mode, nelts, 1);
24072 for (int i = 0; i < nelts; i++)
24074 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
24075 if (!valid_for_const_vector_p (elem_mode, x))
24076 x = CONST0_RTX (elem_mode);
24077 v.quick_push (x);
24079 rtx const_vec = v.build ();
24080 emit_move_insn (target, const_vec);
24082 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
24083 emit_insr (target, builder.elt (i));
24085 return true;
24088 return false;
24091 /* Subroutine of aarch64_sve_expand_vector_init.
24092 Works as follows:
24093 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24094 (b) Skip trailing elements from BUILDER, which are the same as
24095 element NELTS_REQD - 1.
24096 (c) Insert earlier elements in reverse order in TARGET using insr. */
24098 static void
24099 aarch64_sve_expand_vector_init_insert_elems (rtx target,
24100 const rtx_vector_builder &builder,
24101 int nelts_reqd)
24103 machine_mode mode = GET_MODE (target);
24104 scalar_mode elem_mode = GET_MODE_INNER (mode);
24106 struct expand_operand ops[2];
24107 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
24108 gcc_assert (icode != CODE_FOR_nothing);
24110 create_output_operand (&ops[0], target, mode);
24111 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
24112 expand_insn (icode, 2, ops);
24114 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24115 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
24116 emit_insr (target, builder.elt (i));
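/* For example, with BUILDER = { a, b, c, c } and NELTS_REQD == 4, the
   last element C is broadcast with a DUP, which also covers the
   duplicate at index 2, and the remaining elements are then inserted in
   reverse order: INSR of B followed by INSR of A, leaving
   { a, b, c, c, ... } in TARGET.  */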
24119 /* Subroutine of aarch64_sve_expand_vector_init to handle case
24120 when all trailing elements of builder are same.
24121 This works as follows:
24122 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24123 (b) Insert remaining elements in TARGET using insr.
24125 ??? The heuristic used is to do the above if the number of identical trailing
24126 elements is at least 3/4 of the total number of elements, loosely based on the
24127 heuristic from mostly_zeros_p. May need fine-tuning. */
24129 static bool
24130 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24131 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
24133 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24134 if (ndups >= (3 * nelts_reqd) / 4)
24136 aarch64_sve_expand_vector_init_insert_elems (target, builder,
24137 nelts_reqd - ndups + 1);
24138 return true;
24141 return false;
24144 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24145 of elements in BUILDER.
24147 The function tries to initialize TARGET from BUILDER if it fits one
24148 of the special cases outlined below.
24150 Failing that, the function divides BUILDER into two sub-vectors:
24151 v_even = even elements of BUILDER;
24152 v_odd = odd elements of BUILDER;
24154 and recursively calls itself with v_even and v_odd.
24156 if (recursive call succeeded for v_even or v_odd)
24157 TARGET = zip (v_even, v_odd)
24159 The function returns true if it managed to build TARGET from BUILDER
24160 with one of the special cases, false otherwise.
24162 Example: {a, 1, b, 2, c, 3, d, 4}
24164 The vector gets divided into:
24165 v_even = {a, b, c, d}
24166 v_odd = {1, 2, 3, 4}
24168 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24169 initializes tmp2 from the constant vector v_odd using emit_move_insn.
24171 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24172 4 elements, so we construct tmp1 from v_even using insr:
24173 tmp1 = dup(d)
24174 insr tmp1, c
24175 insr tmp1, b
24176 insr tmp1, a
24178 And finally:
24179 TARGET = zip (tmp1, tmp2)
24180 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24182 static bool
24183 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
24184 int nelts, int nelts_reqd)
24186 machine_mode mode = GET_MODE (target);
24188 /* Case 1: Vector contains trailing constants. */
24190 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24191 (target, builder, nelts, nelts_reqd))
24192 return true;
24194 /* Case 2: Vector contains leading constants. */
24196 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
24197 for (int i = 0; i < nelts_reqd; i++)
24198 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
24199 rev_builder.finalize ();
24201 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24202 (target, rev_builder, nelts, nelts_reqd))
24204 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24205 return true;
24208 /* Case 3: Vector contains trailing same element. */
24210 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24211 (target, builder, nelts_reqd))
24212 return true;
24214 /* Case 4: Vector contains leading same element. */
24216 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24217 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
24219 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24220 return true;
24223 /* Avoid recursing below 4-elements.
24224 ??? The threshold 4 may need fine-tuning. */
24226 if (nelts_reqd <= 4)
24227 return false;
24229 rtx_vector_builder v_even (mode, nelts, 1);
24230 rtx_vector_builder v_odd (mode, nelts, 1);
24232 for (int i = 0; i < nelts * 2; i += 2)
24234 v_even.quick_push (builder.elt (i));
24235 v_odd.quick_push (builder.elt (i + 1));
24238 v_even.finalize ();
24239 v_odd.finalize ();
24241 rtx tmp1 = gen_reg_rtx (mode);
24242 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
24243 nelts, nelts_reqd / 2);
24245 rtx tmp2 = gen_reg_rtx (mode);
24246 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
24247 nelts, nelts_reqd / 2);
24249 if (!did_even_p && !did_odd_p)
24250 return false;
24252 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24253 special cases and zip v_even, v_odd. */
24255 if (!did_even_p)
24256 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
24258 if (!did_odd_p)
24259 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
24261 rtvec v = gen_rtvec (2, tmp1, tmp2);
24262 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24263 return true;
24266 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24268 void
24269 aarch64_sve_expand_vector_init (rtx target, rtx vals)
24271 machine_mode mode = GET_MODE (target);
24272 int nelts = XVECLEN (vals, 0);
24274 rtx_vector_builder v (mode, nelts, 1);
24275 for (int i = 0; i < nelts; i++)
24276 v.quick_push (XVECEXP (vals, 0, i));
24277 v.finalize ();
24279 /* If neither sub-vector of v could be initialized specially,
24280 then use INSR to insert all elements from v into TARGET.
24281 ??? This might not be optimal for vectors with large
24282 initializers like 16-element or above.
24283 For nelts < 4, it probably isn't useful to handle specially. */
24285 if (nelts < 4
24286 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
24287 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
24290 /* Check whether VALUE is a vector constant in which every element
24291 is either a power of 2 or a negated power of 2. If so, return
24292 a constant vector of log2s, and flip CODE between PLUS and MINUS
24293 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24295 static rtx
24296 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
24298 if (!CONST_VECTOR_P (value))
24299 return NULL_RTX;
24301 rtx_vector_builder builder;
24302 if (!builder.new_unary_operation (GET_MODE (value), value, false))
24303 return NULL_RTX;
24305 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
24306 /* 1 if the result of the multiplication must be negated,
24307 0 if it mustn't, or -1 if we don't yet care. */
24308 int negate = -1;
24309 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
24310 for (unsigned int i = 0; i < encoded_nelts; ++i)
24312 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
24313 if (!CONST_SCALAR_INT_P (elt))
24314 return NULL_RTX;
24315 rtx_mode_t val (elt, int_mode);
24316 wide_int pow2 = wi::neg (val);
24317 if (val != pow2)
24319 /* It matters whether we negate or not. Make that choice,
24320 and make sure that it's consistent with previous elements. */
24321 if (negate == !wi::neg_p (val))
24322 return NULL_RTX;
24323 negate = wi::neg_p (val);
24324 if (!negate)
24325 pow2 = val;
24327 /* POW2 is now the value that we want to be a power of 2. */
24328 int shift = wi::exact_log2 (pow2);
24329 if (shift < 0)
24330 return NULL_RTX;
24331 builder.quick_push (gen_int_mode (shift, int_mode));
24333 if (negate == -1)
24334 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24335 code = PLUS;
24336 else if (negate == 1)
24337 code = code == PLUS ? MINUS : PLUS;
24338 return builder.build ();
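/* Two illustrative cases: a multiplier vector of { 8, 8, 8, 8 } becomes
   the shift vector { 3, 3, 3, 3 } with CODE left unchanged, while
   { -4, -4 } becomes { 2, 2 } and CODE is flipped between PLUS and
   MINUS so that the negation folds into the surrounding add or
   subtract.  */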
24341 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24342 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24343 operands array, in the same order as for fma_optab. Return true if
24344 the function emitted all the necessary instructions, false if the caller
24345 should generate the pattern normally with the new OPERANDS array. */
24347 bool
24348 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
24350 machine_mode mode = GET_MODE (operands[0]);
24351 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
24353 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
24354 NULL_RTX, true, OPTAB_DIRECT);
24355 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
24356 operands[3], product, operands[0], true,
24357 OPTAB_DIRECT);
24358 return true;
24360 operands[2] = force_reg (mode, operands[2]);
24361 return false;
24364 /* Likewise, but for a conditional pattern. */
24366 bool
24367 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
24369 machine_mode mode = GET_MODE (operands[0]);
24370 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
24372 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
24373 NULL_RTX, true, OPTAB_DIRECT);
24374 emit_insn (gen_cond (code, mode, operands[0], operands[1],
24375 operands[4], product, operands[5]));
24376 return true;
24378 operands[3] = force_reg (mode, operands[3]);
24379 return false;
24382 static unsigned HOST_WIDE_INT
24383 aarch64_shift_truncation_mask (machine_mode mode)
24385 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
24386 return 0;
24387 return GET_MODE_UNIT_BITSIZE (mode) - 1;
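/* For example, scalar SImode yields a mask of 31 and DImode a mask of
   63 when SHIFT_COUNT_TRUNCATED is in effect, whereas vector data modes
   always yield 0, meaning that no truncation of the shift amount is
   guaranteed for them.  */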
24390 /* Select a format to encode pointers in exception handling data. */
24392 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
24394 int type;
24395 switch (aarch64_cmodel)
24397 case AARCH64_CMODEL_TINY:
24398 case AARCH64_CMODEL_TINY_PIC:
24399 case AARCH64_CMODEL_SMALL:
24400 case AARCH64_CMODEL_SMALL_PIC:
24401 case AARCH64_CMODEL_SMALL_SPIC:
24402 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24403 for everything. */
24404 type = DW_EH_PE_sdata4;
24405 break;
24406 default:
24407 /* No assumptions here. 8-byte relocs required. */
24408 type = DW_EH_PE_sdata8;
24409 break;
24411 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
24414 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24416 static void
24417 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
24419 if (TREE_CODE (decl) == FUNCTION_DECL)
24421 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
24422 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
24424 fprintf (stream, "\t.variant_pcs\t");
24425 assemble_name (stream, name);
24426 fprintf (stream, "\n");
24431 /* The last .arch and .tune assembly strings that we printed. */
24432 static std::string aarch64_last_printed_arch_string;
24433 static std::string aarch64_last_printed_tune_string;
24435 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24436 by the function fndecl. */
24438 void
24439 aarch64_declare_function_name (FILE *stream, const char* name,
24440 tree fndecl)
24442 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
24444 struct cl_target_option *targ_options;
24445 if (target_parts)
24446 targ_options = TREE_TARGET_OPTION (target_parts);
24447 else
24448 targ_options = TREE_TARGET_OPTION (target_option_current_node);
24449 gcc_assert (targ_options);
24451 const struct processor *this_arch
24452 = aarch64_get_arch (targ_options->x_selected_arch);
24454 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
24455 std::string extension
24456 = aarch64_get_extension_string_for_isa_flags (isa_flags,
24457 this_arch->flags);
24458 /* Only update the assembler .arch string if it is distinct from the last
24459 such string we printed. */
24460 std::string to_print = this_arch->name + extension;
24461 if (to_print != aarch64_last_printed_arch_string)
24463 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
24464 aarch64_last_printed_arch_string = to_print;
24467 /* Print the cpu name we're tuning for in the comments; it might be
24468 useful to readers of the generated asm. Do it only when it changes
24469 from function to function and verbose assembly is requested. */
24470 const struct processor *this_tune
24471 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
24473 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
24475 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
24476 this_tune->name);
24477 aarch64_last_printed_tune_string = this_tune->name;
24480 aarch64_asm_output_variant_pcs (stream, fndecl, name);
24482 /* Don't forget the type directive for ELF. */
24483 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
24484 ASM_OUTPUT_FUNCTION_LABEL (stream, name, fndecl);
24486 cfun->machine->label_is_assembled = true;
24489 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24491 void
24492 aarch64_print_patchable_function_entry (FILE *file,
24493 unsigned HOST_WIDE_INT patch_area_size,
24494 bool record_p)
24496 if (!cfun->machine->label_is_assembled)
24498 /* Emit the patching area before the entry label, if any. */
24499 default_print_patchable_function_entry (file, patch_area_size,
24500 record_p);
24501 return;
24504 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
24505 GEN_INT (record_p));
24506 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
24508 if (!aarch_bti_enabled ()
24509 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
24511 /* Emit the patchable_area at the beginning of the function. */
24512 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
24513 INSN_ADDRESSES_NEW (insn, -1);
24514 return;
24517 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
24518 if (!insn
24519 || !INSN_P (insn)
24520 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
24521 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
24523 /* Emit a BTI_C. */
24524 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
24527 /* Emit the patchable_area after BTI_C. */
24528 insn = emit_insn_after (pa, insn);
24529 INSN_ADDRESSES_NEW (insn, -1);
24532 /* Output a patchable area. */
24534 void
24535 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
24537 default_print_patchable_function_entry (asm_out_file, patch_area_size,
24538 record_p);
24541 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24543 void
24544 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
24546 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
24547 const char *value = IDENTIFIER_POINTER (target);
24548 aarch64_asm_output_variant_pcs (stream, decl, name);
24549 ASM_OUTPUT_DEF (stream, name, value);
24552 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24553 function symbol references. */
24555 void
24556 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
24558 default_elf_asm_output_external (stream, decl, name);
24559 aarch64_asm_output_variant_pcs (stream, decl, name);
24562 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24563 Used to output the .cfi_b_key_frame directive when signing the current
24564 function with the B key. */
24566 void
24567 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
24569 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
24570 && aarch64_ra_sign_key == AARCH64_KEY_B)
24571 asm_fprintf (f, "\t.cfi_b_key_frame\n");
24574 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24576 static void
24577 aarch64_start_file (void)
24579 struct cl_target_option *default_options
24580 = TREE_TARGET_OPTION (target_option_default_node);
24582 const struct processor *default_arch
24583 = aarch64_get_arch (default_options->x_selected_arch);
24584 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
24585 std::string extension
24586 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
24587 default_arch->flags);
24589 aarch64_last_printed_arch_string = default_arch->name + extension;
24590 aarch64_last_printed_tune_string = "";
24591 asm_fprintf (asm_out_file, "\t.arch %s\n",
24592 aarch64_last_printed_arch_string.c_str ());
24594 default_file_start ();
24597 /* Emit load exclusive. */
24599 static void
24600 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
24601 rtx mem, rtx model_rtx)
24603 if (mode == TImode)
24604 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
24605 gen_highpart (DImode, rval),
24606 mem, model_rtx));
24607 else
24608 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
24611 /* Emit store exclusive. */
24613 static void
24614 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
24615 rtx mem, rtx rval, rtx model_rtx)
24617 if (mode == TImode)
24618 emit_insn (gen_aarch64_store_exclusive_pair
24619 (bval, mem, operand_subword (rval, 0, 0, TImode),
24620 operand_subword (rval, 1, 0, TImode), model_rtx));
24621 else
24622 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
24625 /* Mark the previous jump instruction as unlikely. */
24627 static void
24628 aarch64_emit_unlikely_jump (rtx insn)
24630 rtx_insn *jump = emit_jump_insn (insn);
24631 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
24634 /* We store the names of the various atomic helpers in a 5x5 array.
24635 Return the libcall function given MODE, MODEL and NAMES. */
24638 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
24639 const atomic_ool_names *names)
24641 memmodel model = memmodel_from_int (INTVAL (model_rtx));
24642 int mode_idx, model_idx;
24644 switch (mode)
24646 case E_QImode:
24647 mode_idx = 0;
24648 break;
24649 case E_HImode:
24650 mode_idx = 1;
24651 break;
24652 case E_SImode:
24653 mode_idx = 2;
24654 break;
24655 case E_DImode:
24656 mode_idx = 3;
24657 break;
24658 case E_TImode:
24659 mode_idx = 4;
24660 break;
24661 default:
24662 gcc_unreachable ();
24665 switch (model)
24667 case MEMMODEL_RELAXED:
24668 model_idx = 0;
24669 break;
24670 case MEMMODEL_CONSUME:
24671 case MEMMODEL_ACQUIRE:
24672 model_idx = 1;
24673 break;
24674 case MEMMODEL_RELEASE:
24675 model_idx = 2;
24676 break;
24677 case MEMMODEL_ACQ_REL:
24678 case MEMMODEL_SEQ_CST:
24679 model_idx = 3;
24680 break;
24681 case MEMMODEL_SYNC_ACQUIRE:
24682 case MEMMODEL_SYNC_RELEASE:
24683 case MEMMODEL_SYNC_SEQ_CST:
24684 model_idx = 4;
24685 break;
24686 default:
24687 gcc_unreachable ();
24690 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
24691 VISIBILITY_HIDDEN);
24694 #define DEF0(B, N) \
24695 { "__aarch64_" #B #N "_relax", \
24696 "__aarch64_" #B #N "_acq", \
24697 "__aarch64_" #B #N "_rel", \
24698 "__aarch64_" #B #N "_acq_rel", \
24699 "__aarch64_" #B #N "_sync" }
24701 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24702 { NULL, NULL, NULL, NULL }
24703 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24705 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
24706 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
24707 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
24708 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
24709 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
24710 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
24712 #undef DEF0
24713 #undef DEF4
24714 #undef DEF5
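/* Putting the two tables together: an SImode compare-and-swap with an
   ACQUIRE memory model maps to mode_idx 2 and model_idx 1, so
   aarch64_atomic_ool_func returns the libfunc for "__aarch64_cas4_acq".
   Only CAS uses DEF5; the 16-byte row of the DEF4 tables is left as
   NULLs.  */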
24716 /* Expand a compare and swap pattern. */
24718 void
24719 aarch64_expand_compare_and_swap (rtx operands[])
24721 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
24722 machine_mode mode, r_mode;
24724 bval = operands[0];
24725 rval = operands[1];
24726 mem = operands[2];
24727 oldval = operands[3];
24728 newval = operands[4];
24729 is_weak = operands[5];
24730 mod_s = operands[6];
24731 mod_f = operands[7];
24732 mode = GET_MODE (mem);
24734 /* Normally the succ memory model must be stronger than fail, but in the
24735 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24736 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24737 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
24738 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
24739 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
24741 r_mode = mode;
24742 if (mode == QImode || mode == HImode)
24744 r_mode = SImode;
24745 rval = gen_reg_rtx (r_mode);
24748 if (TARGET_LSE)
24750 /* The CAS insn requires oldval and rval overlap, but we need to
24751 have a copy of oldval saved across the operation to tell if
24752 the operation is successful. */
24753 if (reg_overlap_mentioned_p (rval, oldval))
24754 rval = copy_to_mode_reg (r_mode, oldval);
24755 else
24756 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
24757 if (mode == TImode)
24758 newval = force_reg (mode, newval);
24760 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
24761 newval, mod_s));
24762 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24764 else if (TARGET_OUTLINE_ATOMICS)
24766 /* Oldval must satisfy compare afterward. */
24767 if (!aarch64_plus_operand (oldval, mode))
24768 oldval = force_reg (mode, oldval);
24769 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
24770 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
24771 oldval, mode, newval, mode,
24772 XEXP (mem, 0), Pmode);
24773 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24775 else
24777 /* The oldval predicate varies by mode. Test it and force to reg. */
24778 insn_code code = code_for_aarch64_compare_and_swap (mode);
24779 if (!insn_data[code].operand[2].predicate (oldval, mode))
24780 oldval = force_reg (mode, oldval);
24782 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
24783 is_weak, mod_s, mod_f));
24784 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
24787 if (r_mode != mode)
24788 rval = gen_lowpart (mode, rval);
24789 emit_move_insn (operands[1], rval);
24791 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
24792 emit_insn (gen_rtx_SET (bval, x));
24795 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
24796 sequence implementing an atomic operation. */
24798 static void
24799 aarch64_emit_post_barrier (enum memmodel model)
24801 const enum memmodel base_model = memmodel_base (model);
24803 if (is_mm_sync (model)
24804 && (base_model == MEMMODEL_ACQUIRE
24805 || base_model == MEMMODEL_ACQ_REL
24806 || base_model == MEMMODEL_SEQ_CST))
24808 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
24812 /* Split a compare and swap pattern. */
24814 void
24815 aarch64_split_compare_and_swap (rtx operands[])
24817 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24818 gcc_assert (epilogue_completed);
24820 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
24821 machine_mode mode;
24822 bool is_weak;
24823 rtx_code_label *label1, *label2;
24824 enum memmodel model;
24826 rval = operands[0];
24827 mem = operands[1];
24828 oldval = operands[2];
24829 newval = operands[3];
24830 model_rtx = operands[5];
24831 scratch = operands[7];
24832 mode = GET_MODE (mem);
24833 model = memmodel_from_int (INTVAL (model_rtx));
24834 is_weak = operands[4] != const0_rtx && mode != TImode;
24836 /* When OLDVAL is zero and we want the strong version we can emit a tighter
24837 loop:
24838 .label1:
24839 LD[A]XR rval, [mem]
24840 CBNZ rval, .label2
24841 ST[L]XR scratch, newval, [mem]
24842 CBNZ scratch, .label1
24843 .label2:
24844 CMP rval, 0. */
24845 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
24846 oldval == const0_rtx && mode != TImode);
24848 label1 = NULL;
24849 if (!is_weak)
24851 label1 = gen_label_rtx ();
24852 emit_label (label1);
24854 label2 = gen_label_rtx ();
24856 /* The initial load can be relaxed for a __sync operation since a final
24857 barrier will be emitted to stop code hoisting. */
24858 if (is_mm_sync (model))
24859 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
24860 else
24861 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
24863 if (strong_zero_p)
24864 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
24865 else
24867 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24868 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
24870 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24871 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
24872 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24874 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
24876 if (!is_weak)
24878 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
24879 aarch64_emit_unlikely_jump (x);
24881 else
24882 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24884 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
24885 store the returned value and loop if the STLXP fails. */
24886 if (mode == TImode)
24888 rtx_code_label *label3 = gen_label_rtx ();
24889 emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
24890 emit_barrier ();
24892 emit_label (label2);
24893 aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
24895 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
24896 aarch64_emit_unlikely_jump (x);
24898 label2 = label3;
24901 emit_label (label2);
24903 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
24904 to set the condition flags. If this is not used, it will be removed by
24905 later passes. */
24906 if (strong_zero_p)
24907 aarch64_gen_compare_reg (NE, rval, const0_rtx);
24909 /* Emit any final barrier needed for a __sync operation. */
24910 if (is_mm_sync (model))
24911 aarch64_emit_post_barrier (model);
24914 /* Split an atomic operation. */
24916 void
24917 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
24918 rtx value, rtx model_rtx, rtx cond)
24920 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24921 gcc_assert (epilogue_completed);
24923 machine_mode mode = GET_MODE (mem);
24924 machine_mode wmode = (mode == DImode ? DImode : SImode);
24925 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
24926 const bool is_sync = is_mm_sync (model);
24927 rtx_code_label *label;
24928 rtx x;
24930 /* Split the atomic operation into a sequence. */
24931 label = gen_label_rtx ();
24932 emit_label (label);
24934 if (new_out)
24935 new_out = gen_lowpart (wmode, new_out);
24936 if (old_out)
24937 old_out = gen_lowpart (wmode, old_out);
24938 else
24939 old_out = new_out;
24940 value = simplify_gen_subreg (wmode, value, mode, 0);
24942 /* The initial load can be relaxed for a __sync operation since a final
24943 barrier will be emitted to stop code hoisting. */
24944 if (is_sync)
24945 aarch64_emit_load_exclusive (mode, old_out, mem,
24946 GEN_INT (MEMMODEL_RELAXED));
24947 else
24948 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
24950 switch (code)
24952 case SET:
24953 new_out = value;
24954 break;
24956 case NOT:
24957 x = gen_rtx_AND (wmode, old_out, value);
24958 emit_insn (gen_rtx_SET (new_out, x));
24959 x = gen_rtx_NOT (wmode, new_out);
24960 emit_insn (gen_rtx_SET (new_out, x));
24961 break;
24963 case MINUS:
24964 if (CONST_INT_P (value))
24966 value = GEN_INT (-UINTVAL (value));
24967 code = PLUS;
24969 /* Fall through. */
24971 default:
24972 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
24973 emit_insn (gen_rtx_SET (new_out, x));
24974 break;
24977 aarch64_emit_store_exclusive (mode, cond, mem,
24978 gen_lowpart (mode, new_out), model_rtx);
24980 x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
24981 aarch64_emit_unlikely_jump (x);
24983 /* Emit any final barrier needed for a __sync operation. */
24984 if (is_sync)
24985 aarch64_emit_post_barrier (model);
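/* Editorial sketch, not part of the port: the MINUS case above rewrites an
   atomic subtraction of a constant as an addition of its negation, so only
   the add form has to be handled by the exclusive-store loop.  The identity
   it relies on, shown on plain unsigned integers (wrap-around arithmetic):  */
static unsigned long
sub_as_add_sketch (unsigned long old_value, unsigned long imm)
{
  /* old_value - imm == old_value + (-imm) modulo 2^N, which is what
     GEN_INT (-UINTVAL (value)) followed by code = PLUS expresses.  */
  return old_value + (0ul - imm);
}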
24988 static void
24989 aarch64_init_libfuncs (void)
24991 /* Half-precision float operations. The compiler handles all operations
24992 with NULL libfuncs by converting to SFmode. */
24994 /* Conversions. */
24995 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
24996 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
24998 /* Arithmetic. */
24999 set_optab_libfunc (add_optab, HFmode, NULL);
25000 set_optab_libfunc (sdiv_optab, HFmode, NULL);
25001 set_optab_libfunc (smul_optab, HFmode, NULL);
25002 set_optab_libfunc (neg_optab, HFmode, NULL);
25003 set_optab_libfunc (sub_optab, HFmode, NULL);
25005 /* Comparisons. */
25006 set_optab_libfunc (eq_optab, HFmode, NULL);
25007 set_optab_libfunc (ne_optab, HFmode, NULL);
25008 set_optab_libfunc (lt_optab, HFmode, NULL);
25009 set_optab_libfunc (le_optab, HFmode, NULL);
25010 set_optab_libfunc (ge_optab, HFmode, NULL);
25011 set_optab_libfunc (gt_optab, HFmode, NULL);
25012 set_optab_libfunc (unord_optab, HFmode, NULL);
25015 /* Target hook for c_mode_for_suffix. */
25016 static machine_mode
25017 aarch64_c_mode_for_suffix (char suffix)
25019 if (suffix == 'q')
25020 return TFmode;
25022 return VOIDmode;
25025 /* We can only represent floating point constants which will fit in
25026 "quarter-precision" values. These values are characterised by
25027 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
25030 (-1)^s * (n/16) * 2^r
25032 Where:
25033 's' is the sign bit.
25034 'n' is an integer in the range 16 <= n <= 31.
25035 'r' is an integer in the range -3 <= r <= 4. */
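/* Editorial sketch, not part of the port: two worked instances of the
   encoding above are 3.0 = (+1) * (24/16) * 2^1 and -0.125 =
   (-1) * (16/16) * 2^-3.  A hypothetical decoder, assuming <cmath> is
   available:  */
static double
quarter_precision_value_sketch (bool s, unsigned int n, int r)
{
  /* Only meaningful for 16 <= n <= 31 and -3 <= r <= 4.  */
  return (s ? -1.0 : 1.0) * ((double) n / 16.0) * std::ldexp (1.0, r);
}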
25037 /* Return true iff X can be represented by a quarter-precision
25038 floating point immediate operand. Note, we cannot represent 0.0. */
25039 bool
25040 aarch64_float_const_representable_p (rtx x)
25042 /* This represents our current view of how many bits
25043 make up the mantissa. */
25044 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
25045 int exponent;
25046 unsigned HOST_WIDE_INT mantissa, mask;
25047 REAL_VALUE_TYPE r, m;
25048 bool fail;
25050 x = unwrap_const_vec_duplicate (x);
25051 if (!CONST_DOUBLE_P (x))
25052 return false;
25054 if (GET_MODE (x) == VOIDmode
25055 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
25056 return false;
25058 r = *CONST_DOUBLE_REAL_VALUE (x);
25060 /* We cannot represent infinities, NaNs or +/-zero. We won't
25061 know if we have +zero until we analyse the mantissa, but we
25062 can reject the other invalid values. */
25063 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
25064 || REAL_VALUE_MINUS_ZERO (r))
25065 return false;
25067 /* For BFmode, only handle 0.0. */
25068 if (GET_MODE (x) == BFmode)
25069 return real_iszero (&r, false);
25071 /* Extract exponent. */
25072 r = real_value_abs (&r);
25073 exponent = REAL_EXP (&r);
25075 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
25076 highest (sign) bit, with a fixed binary point at bit point_pos.
25077 The low element of W holds the low part of the mantissa, the high element the high part.
25078 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
25079 bits for the mantissa, this can fail (low bits will be lost). */
25080 real_ldexp (&m, &r, point_pos - exponent);
25081 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
25083 /* If the low part of the mantissa has bits set we cannot represent
25084 the value. */
25085 if (w.ulow () != 0)
25086 return false;
25087 /* We have rejected the lower HOST_WIDE_INT, so update our
25088 understanding of how many bits lie in the mantissa and
25089 look only at the high HOST_WIDE_INT. */
25090 mantissa = w.elt (1);
25091 point_pos -= HOST_BITS_PER_WIDE_INT;
25093 /* We can only represent values with a mantissa of the form 1.xxxx. */
25094 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
25095 if ((mantissa & mask) != 0)
25096 return false;
25098 /* Having filtered unrepresentable values, we may now remove all
25099 but the highest 5 bits. */
25100 mantissa >>= point_pos - 5;
25102 /* We cannot represent the value 0.0, so reject it. This is handled
25103 elsewhere. */
25104 if (mantissa == 0)
25105 return false;
25107 /* Then, as bit 4 is always set, we can mask it off, leaving
25108 the mantissa in the range [0, 15]. */
25109 mantissa &= ~(1 << 4);
25110 gcc_assert (mantissa <= 15);
25112 /* GCC internally does not use IEEE754-like encoding (where normalized
25113 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
25114 Our mantissa values are shifted 4 places to the left relative to
25115 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
25116 by 5 places to correct for GCC's representation. */
25117 exponent = 5 - exponent;
25119 return (exponent >= 0 && exponent <= 7);
25122 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
25123 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
25124 output MOVI/MVNI, ORR or BIC immediate. */
25125 char*
25126 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
25127 enum simd_immediate_check which)
25129 bool is_valid;
25130 static char templ[40];
25131 const char *mnemonic;
25132 const char *shift_op;
25133 unsigned int lane_count = 0;
25134 char element_char;
25136 struct simd_immediate_info info;
25138 /* This will return true to show const_vector is legal for use as either
25139 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
25140 It will also update INFO to show how the immediate should be generated.
25141 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
25142 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
25143 gcc_assert (is_valid);
25145 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25146 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
25148 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25150 gcc_assert (info.insn == simd_immediate_info::MOV
25151 && info.u.mov.shift == 0);
25152 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25153 move immediate path. */
25154 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25155 info.u.mov.value = GEN_INT (0);
25156 else
25158 const unsigned int buf_size = 20;
25159 char float_buf[buf_size] = {'\0'};
25160 real_to_decimal_for_mode (float_buf,
25161 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25162 buf_size, buf_size, 1, info.elt_mode);
25164 if (lane_count == 1)
25165 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
25166 else
25167 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
25168 lane_count, element_char, float_buf);
25169 return templ;
25173 gcc_assert (CONST_INT_P (info.u.mov.value));
25175 if (which == AARCH64_CHECK_MOV)
25177 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
25178 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
25179 ? "msl" : "lsl");
25180 if (lane_count == 1)
25181 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
25182 mnemonic, UINTVAL (info.u.mov.value));
25183 else if (info.u.mov.shift)
25184 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25185 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
25186 element_char, UINTVAL (info.u.mov.value), shift_op,
25187 info.u.mov.shift);
25188 else
25189 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25190 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
25191 element_char, UINTVAL (info.u.mov.value));
25193 else
25195 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25196 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
25197 if (info.u.mov.shift)
25198 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25199 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
25200 element_char, UINTVAL (info.u.mov.value), "lsl",
25201 info.u.mov.shift);
25202 else
25203 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25204 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
25205 element_char, UINTVAL (info.u.mov.value));
25207 return templ;
25210 char*
25211 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
25214 /* If a floating-point number was passed and we want to use it in an
25215 integer mode, do the conversion to integer. */
25216 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
25218 unsigned HOST_WIDE_INT ival;
25219 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
25220 gcc_unreachable ();
25221 immediate = gen_int_mode (ival, mode);
25224 machine_mode vmode;
25225 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
25226 use a 128-bit vector mode. */
25227 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
25229 vmode = aarch64_simd_container_mode (mode, width);
25230 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
25231 return aarch64_output_simd_mov_immediate (v_op, width);
25234 /* Return the output string to use for moving immediate CONST_VECTOR
25235 into an SVE register. */
25237 char *
25238 aarch64_output_sve_mov_immediate (rtx const_vector)
25240 static char templ[40];
25241 struct simd_immediate_info info;
25242 char element_char;
25244 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
25245 gcc_assert (is_valid);
25247 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25249 machine_mode vec_mode = GET_MODE (const_vector);
25250 if (aarch64_sve_pred_mode_p (vec_mode))
25252 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
25253 if (info.insn == simd_immediate_info::MOV)
25255 gcc_assert (info.u.mov.value == const0_rtx);
25256 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
25258 else
25260 gcc_assert (info.insn == simd_immediate_info::PTRUE);
25261 unsigned int total_bytes;
25262 if (info.u.pattern == AARCH64_SV_ALL
25263 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
25264 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
25265 total_bytes / GET_MODE_SIZE (info.elt_mode));
25266 else
25267 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
25268 svpattern_token (info.u.pattern));
25270 return buf;
25273 if (info.insn == simd_immediate_info::INDEX)
25275 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
25276 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25277 element_char, INTVAL (info.u.index.base),
25278 INTVAL (info.u.index.step));
25279 return templ;
25282 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25284 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25285 info.u.mov.value = GEN_INT (0);
25286 else
25288 const int buf_size = 20;
25289 char float_buf[buf_size] = {};
25290 real_to_decimal_for_mode (float_buf,
25291 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25292 buf_size, buf_size, 1, info.elt_mode);
25294 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
25295 element_char, float_buf);
25296 return templ;
25300 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25301 element_char, INTVAL (info.u.mov.value));
25302 return templ;
25305 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25306 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25307 pattern. */
25309 char *
25310 aarch64_output_sve_ptrues (rtx const_unspec)
25312 static char templ[40];
25314 struct simd_immediate_info info;
25315 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
25316 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
25318 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25319 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
25320 svpattern_token (info.u.pattern));
25321 return templ;
25324 /* Split operands into moves from op[1] + op[2] into op[0]. */
25326 void
25327 aarch64_split_combinev16qi (rtx operands[3])
25329 machine_mode halfmode = GET_MODE (operands[1]);
25331 gcc_assert (halfmode == V16QImode);
25333 rtx destlo = simplify_gen_subreg (halfmode, operands[0],
25334 GET_MODE (operands[0]), 0);
25335 rtx desthi = simplify_gen_subreg (halfmode, operands[0],
25336 GET_MODE (operands[0]),
25337 GET_MODE_SIZE (halfmode));
25339 bool skiplo = rtx_equal_p (destlo, operands[1]);
25340 bool skiphi = rtx_equal_p (desthi, operands[2]);
25342 if (skiplo && skiphi)
25344 /* No-op move. Can't split to nothing; emit something. */
25345 emit_note (NOTE_INSN_DELETED);
25346 return;
25349 /* Special case of reversed high/low parts. */
25350 if (reg_overlap_mentioned_p (operands[2], destlo)
25351 && reg_overlap_mentioned_p (operands[1], desthi))
25353 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25354 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
25355 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25357 else if (!reg_overlap_mentioned_p (operands[2], destlo))
25359 /* Try to avoid unnecessary moves if part of the result
25360 is in the right place already. */
25361 if (!skiplo)
25362 emit_move_insn (destlo, operands[1]);
25363 if (!skiphi)
25364 emit_move_insn (desthi, operands[2]);
25366 else
25368 if (!skiphi)
25369 emit_move_insn (desthi, operands[2]);
25370 if (!skiplo)
25371 emit_move_insn (destlo, operands[1]);
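/* Editorial sketch, not part of the port: the reversed high/low case above
   swaps the two source registers without a scratch by using the classic
   three-XOR exchange, which is what the three gen_xorv16qi3 calls emit at
   the vector level.  On plain integers:  */
static void
xor_swap_sketch (unsigned int &a, unsigned int &b)
{
  a ^= b;	/* a = a0 ^ b0 */
  b ^= a;	/* b = a0 */
  a ^= b;	/* a = b0 */
}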
25375 /* vec_perm support. */
25377 struct expand_vec_perm_d
25379 rtx target, op0, op1;
25380 vec_perm_indices perm;
25381 machine_mode vmode;
25382 machine_mode op_mode;
25383 unsigned int vec_flags;
25384 unsigned int op_vec_flags;
25385 bool one_vector_p;
25386 bool testing_p;
25389 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25391 /* Generate a variable permutation. */
25393 static void
25394 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
25396 machine_mode vmode = GET_MODE (target);
25397 bool one_vector_p = rtx_equal_p (op0, op1);
25399 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
25400 gcc_checking_assert (GET_MODE (op0) == vmode);
25401 gcc_checking_assert (GET_MODE (op1) == vmode);
25402 gcc_checking_assert (GET_MODE (sel) == vmode);
25403 gcc_checking_assert (TARGET_SIMD);
25405 if (one_vector_p)
25407 if (vmode == V8QImode)
25409 /* Expand the argument to a V16QI mode by duplicating it. */
25410 rtx pair = gen_reg_rtx (V16QImode);
25411 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
25412 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25414 else
25416 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
25419 else
25421 rtx pair;
25423 if (vmode == V8QImode)
25425 pair = gen_reg_rtx (V16QImode);
25426 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
25427 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25429 else
25431 pair = gen_reg_rtx (V2x16QImode);
25432 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
25433 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
25438 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25439 NELT is the number of elements in the vector. */
25441 void
25442 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
25443 unsigned int nelt)
25445 machine_mode vmode = GET_MODE (target);
25446 bool one_vector_p = rtx_equal_p (op0, op1);
25447 rtx mask;
25449 /* The TBL instruction does not use a modulo index, so we must take care
25450 of that ourselves. */
25451 mask = aarch64_simd_gen_const_vector_dup (vmode,
25452 one_vector_p ? nelt - 1 : 2 * nelt - 1);
25453 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
25455 /* For big-endian, we also need to reverse the index within the vector
25456 (but not which vector). */
25457 if (BYTES_BIG_ENDIAN)
25459 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25460 if (!one_vector_p)
25461 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
25462 sel = expand_simple_binop (vmode, XOR, sel, mask,
25463 NULL, 0, OPTAB_LIB_WIDEN);
25465 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
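/* Editorial sketch, not part of the port: a scalar model of the index
   massaging above.  vec_perm indices are defined modulo the number of input
   lanes, whereas TBL returns zero for out-of-range indices, so the selector
   is masked first; on big-endian the lane order within each register is
   reversed by XORing with NELT - 1, without disturbing the bit that selects
   which register.  */
static unsigned int
tbl_index_sketch (unsigned int sel, unsigned int nelt, bool one_vector_p,
		  bool big_endian_p)
{
  sel &= one_vector_p ? nelt - 1 : 2 * nelt - 1;
  if (big_endian_p)
    sel ^= nelt - 1;
  return sel;
}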
25468 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
25470 static void
25471 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
25473 emit_insn (gen_rtx_SET (target,
25474 gen_rtx_UNSPEC (GET_MODE (target),
25475 gen_rtvec (2, op0, op1), code)));
25478 /* Expand an SVE vec_perm with the given operands. */
25480 void
25481 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
25483 machine_mode data_mode = GET_MODE (target);
25484 machine_mode sel_mode = GET_MODE (sel);
25485 /* Enforced by the pattern condition. */
25486 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
25488 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25489 size of the two value vectors, i.e. the upper bits of the indices
25490 are effectively ignored. SVE TBL instead produces 0 for any
25491 out-of-range indices, so we need to modulo all the vec_perm indices
25492 to ensure they are all in range. */
25493 rtx sel_reg = force_reg (sel_mode, sel);
25495 /* Check if the sel only references the first values vector. */
25496 if (CONST_VECTOR_P (sel)
25497 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
25499 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
25500 return;
25503 /* Check if the two values vectors are the same. */
25504 if (rtx_equal_p (op0, op1))
25506 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
25507 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25508 NULL, 0, OPTAB_DIRECT);
25509 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
25510 return;
25513 /* Run TBL on each value vector and combine the results. */
25515 rtx res0 = gen_reg_rtx (data_mode);
25516 rtx res1 = gen_reg_rtx (data_mode);
25517 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
25518 if (!CONST_VECTOR_P (sel)
25519 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
25521 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
25522 2 * nunits - 1);
25523 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25524 NULL, 0, OPTAB_DIRECT);
25526 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
25527 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
25528 NULL, 0, OPTAB_DIRECT);
25529 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
25530 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
25531 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
25532 else
25533 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
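/* Editorial sketch, not part of the port: a scalar model of the two-vector
   case above.  SVE TBL yields zero for out-of-range indices, so selecting
   from two vectors can be done as TBL (op0, sel) ORred with
   TBL (op1, sel - nunits): exactly one of the two lookups is in range for
   any masked selector value.  */
static unsigned int
sve_two_vector_tbl_sketch (const unsigned int *op0, const unsigned int *op1,
			   unsigned int nunits, unsigned int sel)
{
  sel &= 2 * nunits - 1;
  unsigned int from_op0 = sel < nunits ? op0[sel] : 0;
  unsigned int shifted = sel - nunits;	/* Wraps for sel < nunits.  */
  unsigned int from_op1 = shifted < nunits ? op1[shifted] : 0;
  return from_op0 | from_op1;
}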
25536 /* Recognize patterns suitable for the TRN instructions. */
25537 static bool
25538 aarch64_evpc_trn (struct expand_vec_perm_d *d)
25540 HOST_WIDE_INT odd;
25541 poly_uint64 nelt = d->perm.length ();
25542 rtx out, in0, in1;
25543 machine_mode vmode = d->vmode;
25545 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25546 return false;
25548 /* Note that these are little-endian tests.
25549 We correct for big-endian later. */
25550 if (!d->perm[0].is_constant (&odd)
25551 || (odd != 0 && odd != 1)
25552 || !d->perm.series_p (0, 2, odd, 2)
25553 || !d->perm.series_p (1, 2, nelt + odd, 2))
25554 return false;
25556 /* Success! */
25557 if (d->testing_p)
25558 return true;
25560 in0 = d->op0;
25561 in1 = d->op1;
25562 /* We don't need a big-endian lane correction for SVE; see the comment
25563 at the head of aarch64-sve.md for details. */
25564 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25566 std::swap (in0, in1);
25567 odd = !odd;
25569 out = d->target;
25571 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25572 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
25573 return true;
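/* Editorial sketch, not part of the port: the index patterns matched above,
   written out for two V4SI inputs a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}.
   TRN1 interleaves the even lanes, TRN2 the odd lanes.  */
static void
trn_perm_sketch (unsigned int nelt, bool odd, unsigned int perm[])
{
  /* nelt == 4, odd == false gives { 0, 4, 2, 6 } -> {a0,b0,a2,b2} (TRN1);
     nelt == 4, odd == true  gives { 1, 5, 3, 7 } -> {a1,b1,a3,b3} (TRN2).  */
  for (unsigned int i = 0; i < nelt; i += 2)
    {
      perm[i] = i + odd;
      perm[i + 1] = nelt + i + odd;
    }
}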
25576 /* Try to re-encode the PERM constant so it combines odd and even elements.
25577 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25578 We retry with this new constant with the full suite of patterns. */
25579 static bool
25580 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
25582 expand_vec_perm_d newd;
25584 if (d->vec_flags != VEC_ADVSIMD)
25585 return false;
25587 /* Get the new mode. Always twice the size of the inner
25588 and half the elements. */
25589 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
25590 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
25591 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
25592 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
25594 if (new_mode == word_mode)
25595 return false;
25597 vec_perm_indices newpermindices;
25599 if (!newpermindices.new_shrunk_vector (d->perm, 2))
25600 return false;
25602 newd.vmode = new_mode;
25603 newd.vec_flags = VEC_ADVSIMD;
25604 newd.op_mode = newd.vmode;
25605 newd.op_vec_flags = newd.vec_flags;
25606 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
25607 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
25608 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
25609 newd.testing_p = d->testing_p;
25610 newd.one_vector_p = d->one_vector_p;
25612 newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
25613 newpermindices.nelts_per_input ());
25614 return aarch64_expand_vec_perm_const_1 (&newd);
25617 /* Recognize patterns suitable for the UZP instructions. */
25618 static bool
25619 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
25621 HOST_WIDE_INT odd;
25622 rtx out, in0, in1;
25623 machine_mode vmode = d->vmode;
25625 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25626 return false;
25628 /* Note that these are little-endian tests.
25629 We correct for big-endian later. */
25630 if (!d->perm[0].is_constant (&odd)
25631 || (odd != 0 && odd != 1)
25632 || !d->perm.series_p (0, 1, odd, 2))
25633 return false;
25635 /* Success! */
25636 if (d->testing_p)
25637 return true;
25639 in0 = d->op0;
25640 in1 = d->op1;
25641 /* We don't need a big-endian lane correction for SVE; see the comment
25642 at the head of aarch64-sve.md for details. */
25643 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25645 std::swap (in0, in1);
25646 odd = !odd;
25648 out = d->target;
25650 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25651 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
25652 return true;
25655 /* Recognize patterns suitable for the ZIP instructions. */
25656 static bool
25657 aarch64_evpc_zip (struct expand_vec_perm_d *d)
25659 unsigned int high;
25660 poly_uint64 nelt = d->perm.length ();
25661 rtx out, in0, in1;
25662 machine_mode vmode = d->vmode;
25664 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25665 return false;
25667 /* Note that these are little-endian tests.
25668 We correct for big-endian later. */
25669 poly_uint64 first = d->perm[0];
25670 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
25671 || !d->perm.series_p (0, 2, first, 1)
25672 || !d->perm.series_p (1, 2, first + nelt, 1))
25673 return false;
25674 high = maybe_ne (first, 0U);
25676 /* Success! */
25677 if (d->testing_p)
25678 return true;
25680 in0 = d->op0;
25681 in1 = d->op1;
25682 /* We don't need a big-endian lane correction for SVE; see the comment
25683 at the head of aarch64-sve.md for details. */
25684 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25686 std::swap (in0, in1);
25687 high = !high;
25689 out = d->target;
25691 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25692 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
25693 return true;
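/* Editorial sketch, not part of the port: the index patterns matched above,
   for two V4SI inputs a and b.  ZIP1 interleaves the low halves of the two
   vectors, ZIP2 the high halves.  */
static void
zip_perm_sketch (unsigned int nelt, bool high, unsigned int perm[])
{
  /* nelt == 4, high == false gives { 0, 4, 1, 5 } -> {a0,b0,a1,b1} (ZIP1);
     nelt == 4, high == true  gives { 2, 6, 3, 7 } -> {a2,b2,a3,b3} (ZIP2).  */
  unsigned int first = high ? nelt / 2 : 0;
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      perm[2 * i] = first + i;
      perm[2 * i + 1] = nelt + first + i;
    }
}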
25696 /* Recognize patterns for the EXT insn. */
25698 static bool
25699 aarch64_evpc_ext (struct expand_vec_perm_d *d)
25701 HOST_WIDE_INT location;
25702 rtx offset;
25704 /* The first element always refers to the first vector.
25705 Check if the extracted indices are increasing by one. */
25706 if ((d->vec_flags & VEC_SVE_PRED)
25707 || !d->perm[0].is_constant (&location)
25708 || !d->perm.series_p (0, 1, location, 1))
25709 return false;
25711 /* Success! */
25712 if (d->testing_p)
25713 return true;
25715 /* The case where (location == 0) is a no-op for both big- and little-endian,
25716 and is removed by the mid-end at optimization levels -O1 and higher.
25718 We don't need a big-endian lane correction for SVE; see the comment
25719 at the head of aarch64-sve.md for details. */
25720 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
25722 /* After setup, we want the high elements of the first vector (stored
25723 at the LSB end of the register), and the low elements of the second
25724 vector (stored at the MSB end of the register). So swap. */
25725 std::swap (d->op0, d->op1);
25726 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25727 to_constant () is safe since this is restricted to Advanced SIMD
25728 vectors. */
25729 location = d->perm.length ().to_constant () - location;
25732 offset = GEN_INT (location);
25733 emit_set_insn (d->target,
25734 gen_rtx_UNSPEC (d->vmode,
25735 gen_rtvec (3, d->op0, d->op1, offset),
25736 UNSPEC_EXT));
25737 return true;
25740 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25741 within each 64-bit, 32-bit or 16-bit granule. */
25743 static bool
25744 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
25746 HOST_WIDE_INT diff;
25747 unsigned int i, size, unspec;
25748 machine_mode pred_mode;
25750 if ((d->vec_flags & VEC_SVE_PRED)
25751 || !d->one_vector_p
25752 || !d->perm[0].is_constant (&diff)
25753 || !diff)
25754 return false;
25756 if (d->vec_flags & VEC_SVE_DATA)
25757 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
25758 else
25759 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
25760 if (size == 64)
25762 unspec = UNSPEC_REV64;
25763 pred_mode = VNx2BImode;
25765 else if (size == 32)
25767 unspec = UNSPEC_REV32;
25768 pred_mode = VNx4BImode;
25770 else if (size == 16)
25772 unspec = UNSPEC_REV16;
25773 pred_mode = VNx8BImode;
25775 else
25776 return false;
25778 unsigned int step = diff + 1;
25779 for (i = 0; i < step; ++i)
25780 if (!d->perm.series_p (i, step, diff - i, step))
25781 return false;
25783 /* Success! */
25784 if (d->testing_p)
25785 return true;
25787 if (d->vec_flags & VEC_SVE_DATA)
25789 rtx pred = aarch64_ptrue_reg (pred_mode);
25790 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
25791 d->target, pred, d->op0));
25792 return true;
25794 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
25795 emit_set_insn (d->target, src);
25796 return true;
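/* Editorial sketch, not part of the port: the permutations matched above
   mirror each element within its own granule of STEP = diff + 1 elements.
   For example, STEP == 2 on an 8-element vector of 16-bit lanes gives
   { 1, 0, 3, 2, 5, 4, 7, 6 }, i.e. REV32 on V8HI.  */
static void
rev_local_perm_sketch (unsigned int nelt, unsigned int step,
		       unsigned int perm[])
{
  for (unsigned int i = 0; i < nelt; i++)
    perm[i] = (i / step) * step + (step - 1 - i % step);
}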
25799 /* Recognize patterns for the REV insn, which reverses elements within
25800 a full vector. */
25802 static bool
25803 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
25805 poly_uint64 nelt = d->perm.length ();
25807 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
25808 return false;
25810 if (!d->perm.series_p (0, 1, nelt - 1, -1))
25811 return false;
25813 /* Success! */
25814 if (d->testing_p)
25815 return true;
25817 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
25818 emit_set_insn (d->target, src);
25819 return true;
25822 static bool
25823 aarch64_evpc_dup (struct expand_vec_perm_d *d)
25825 rtx out = d->target;
25826 rtx in0;
25827 HOST_WIDE_INT elt;
25828 machine_mode vmode = d->vmode;
25829 rtx lane;
25831 if ((d->vec_flags & VEC_SVE_PRED)
25832 || d->perm.encoding ().encoded_nelts () != 1
25833 || !d->perm[0].is_constant (&elt))
25834 return false;
25836 if ((d->vec_flags & VEC_SVE_DATA)
25837 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
25838 return false;
25840 /* Success! */
25841 if (d->testing_p)
25842 return true;
25844 /* The generic preparation in aarch64_expand_vec_perm_const_1
25845 swaps the operand order and the permute indices if it finds
25846 d->perm[0] to be in the second operand. Thus, we can always
25847 use d->op0 and need not do any extra arithmetic to get the
25848 correct lane number. */
25849 in0 = d->op0;
25850 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
25852 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
25853 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
25854 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
25855 return true;
25858 static bool
25859 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
25861 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
25862 machine_mode vmode = d->vmode;
25864 /* Make sure that the indices are constant. */
25865 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
25866 for (unsigned int i = 0; i < encoded_nelts; ++i)
25867 if (!d->perm[i].is_constant ())
25868 return false;
25870 if (d->testing_p)
25871 return true;
25873 /* Generic code will try constant permutation twice. Once with the
25874 original mode and again with the elements lowered to QImode.
25875 So wait and don't do the selector expansion ourselves. */
25876 if (vmode != V8QImode && vmode != V16QImode)
25877 return false;
25879 /* to_constant is safe since this routine is specific to Advanced SIMD
25880 vectors. */
25881 unsigned int nelt = d->perm.length ().to_constant ();
25882 for (unsigned int i = 0; i < nelt; ++i)
25883 /* If big-endian and two vectors we end up with a weird mixed-endian
25884 mode on NEON. Reverse the index within each word but not the word
25885 itself. to_constant is safe because we checked is_constant above. */
25886 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
25887 ? d->perm[i].to_constant () ^ (nelt - 1)
25888 : d->perm[i].to_constant ());
25890 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
25891 sel = force_reg (vmode, sel);
25893 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
25894 return true;
25897 /* Try to implement D using an SVE TBL instruction. */
25899 static bool
25900 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
25902 unsigned HOST_WIDE_INT nelt;
25904 /* Permuting two variable-length vectors could overflow the
25905 index range. */
25906 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
25907 return false;
25909 if (d->testing_p)
25910 return true;
25912 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
25913 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
25914 if (d->one_vector_p)
25915 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
25916 else
25917 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
25918 return true;
25921 /* Try to implement D using SVE dup instruction. */
25923 static bool
25924 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
25926 if (BYTES_BIG_ENDIAN
25927 || !d->one_vector_p
25928 || d->vec_flags != VEC_SVE_DATA
25929 || d->op_vec_flags != VEC_ADVSIMD
25930 || d->perm.encoding ().nelts_per_pattern () != 1
25931 || !known_eq (d->perm.encoding ().npatterns (),
25932 GET_MODE_NUNITS (d->op_mode))
25933 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
25934 return false;
25936 int npatterns = d->perm.encoding ().npatterns ();
25937 for (int i = 0; i < npatterns; i++)
25938 if (!known_eq (d->perm[i], i))
25939 return false;
25941 if (d->testing_p)
25942 return true;
25944 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
25945 return true;
25948 /* Try to implement D using SVE SEL instruction. */
25950 static bool
25951 aarch64_evpc_sel (struct expand_vec_perm_d *d)
25953 machine_mode vmode = d->vmode;
25954 int unit_size = GET_MODE_UNIT_SIZE (vmode);
25956 if (d->vec_flags != VEC_SVE_DATA
25957 || unit_size > 8)
25958 return false;
25960 int n_patterns = d->perm.encoding ().npatterns ();
25961 poly_int64 vec_len = d->perm.length ();
25963 for (int i = 0; i < n_patterns; ++i)
25964 if (!known_eq (d->perm[i], i)
25965 && !known_eq (d->perm[i], vec_len + i))
25966 return false;
25968 for (int i = n_patterns; i < n_patterns * 2; i++)
25969 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
25970 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
25971 return false;
25973 if (d->testing_p)
25974 return true;
25976 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
25978 /* Build a predicate that is true when op0 elements should be used. */
25979 rtx_vector_builder builder (pred_mode, n_patterns, 2);
25980 for (int i = 0; i < n_patterns * 2; i++)
25982 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
25983 : CONST0_RTX (BImode);
25984 builder.quick_push (elem);
25987 rtx const_vec = builder.build ();
25988 rtx pred = force_reg (pred_mode, const_vec);
25989 /* TARGET = PRED ? OP0 : OP1. */
25990 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
25991 return true;
25994 /* Recognize patterns suitable for the INS instructions. */
25995 static bool
25996 aarch64_evpc_ins (struct expand_vec_perm_d *d)
25998 machine_mode mode = d->vmode;
25999 unsigned HOST_WIDE_INT nelt;
26001 if (d->vec_flags != VEC_ADVSIMD)
26002 return false;
26004 /* to_constant is safe since this routine is specific to Advanced SIMD
26005 vectors. */
26006 nelt = d->perm.length ().to_constant ();
26007 rtx insv = d->op0;
26009 HOST_WIDE_INT idx = -1;
26011 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26013 HOST_WIDE_INT elt;
26014 if (!d->perm[i].is_constant (&elt))
26015 return false;
26016 if (elt == (HOST_WIDE_INT) i)
26017 continue;
26018 if (idx != -1)
26020 idx = -1;
26021 break;
26023 idx = i;
26026 if (idx == -1)
26028 insv = d->op1;
26029 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26031 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
26032 continue;
26033 if (idx != -1)
26034 return false;
26035 idx = i;
26038 if (idx == -1)
26039 return false;
26042 if (d->testing_p)
26043 return true;
26045 gcc_assert (idx != -1);
26047 unsigned extractindex = d->perm[idx].to_constant ();
26048 rtx extractv = d->op0;
26049 if (extractindex >= nelt)
26051 extractv = d->op1;
26052 extractindex -= nelt;
26054 gcc_assert (extractindex < nelt);
26056 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
26057 expand_operand ops[5];
26058 create_output_operand (&ops[0], d->target, mode);
26059 create_input_operand (&ops[1], insv, mode);
26060 create_integer_operand (&ops[2], 1 << idx);
26061 create_input_operand (&ops[3], extractv, mode);
26062 create_integer_operand (&ops[4], extractindex);
26063 expand_insn (icode, 5, ops);
26065 return true;
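/* Editorial sketch, not part of the port: a scalar model of the single-lane
   insertion matched above.  For example, on two V4SI inputs the selector
   { 0, 5, 2, 3 } copies op0 and replaces lane 1 with lane 1 of op1, which
   amounts to a single INS of that lane.  */
static void
ins_perm_sketch (const unsigned int *insv, const unsigned int *extractv,
		 unsigned int nelt, unsigned int idx,
		 unsigned int extractindex, unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = i == idx ? extractv[extractindex] : insv[i];
}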
26068 static bool
26069 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
26071 gcc_assert (d->op_mode != E_VOIDmode);
26073 /* The pattern matching functions above are written to look for a small
26074 number to begin the sequence (0, 1, N/2). If we begin with an index
26075 from the second operand, we can swap the operands. */
26076 poly_int64 nelt = d->perm.length ();
26077 if (known_ge (d->perm[0], nelt))
26079 d->perm.rotate_inputs (1);
26080 std::swap (d->op0, d->op1);
26083 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
26084 || d->vec_flags == VEC_SVE_DATA
26085 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
26086 || d->vec_flags == VEC_SVE_PRED)
26087 && known_gt (nelt, 1))
26089 if (d->vmode == d->op_mode)
26091 if (aarch64_evpc_rev_local (d))
26092 return true;
26093 else if (aarch64_evpc_rev_global (d))
26094 return true;
26095 else if (aarch64_evpc_ext (d))
26096 return true;
26097 else if (aarch64_evpc_dup (d))
26098 return true;
26099 else if (aarch64_evpc_zip (d))
26100 return true;
26101 else if (aarch64_evpc_uzp (d))
26102 return true;
26103 else if (aarch64_evpc_trn (d))
26104 return true;
26105 else if (aarch64_evpc_sel (d))
26106 return true;
26107 else if (aarch64_evpc_ins (d))
26108 return true;
26109 else if (aarch64_evpc_reencode (d))
26110 return true;
26112 if (d->vec_flags == VEC_SVE_DATA)
26113 return aarch64_evpc_sve_tbl (d);
26114 else if (d->vec_flags == VEC_ADVSIMD)
26115 return aarch64_evpc_tbl (d);
26117 else
26119 if (aarch64_evpc_sve_dup (d))
26120 return true;
26123 return false;
26126 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26128 static bool
26129 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
26130 rtx target, rtx op0, rtx op1,
26131 const vec_perm_indices &sel)
26133 struct expand_vec_perm_d d;
26135 /* Check whether the mask can be applied to a single vector. */
26136 if (sel.ninputs () == 1
26137 || (op0 && rtx_equal_p (op0, op1)))
26138 d.one_vector_p = true;
26139 else if (sel.all_from_input_p (0))
26141 d.one_vector_p = true;
26142 op1 = op0;
26144 else if (sel.all_from_input_p (1))
26146 d.one_vector_p = true;
26147 op0 = op1;
26149 else
26150 d.one_vector_p = false;
26152 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
26153 sel.nelts_per_input ());
26154 d.vmode = vmode;
26155 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
26156 d.op_mode = op_mode;
26157 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
26158 d.target = target;
26159 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
26160 if (op0 == op1)
26161 d.op1 = d.op0;
26162 else
26163 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
26164 d.testing_p = !target;
26166 if (!d.testing_p)
26167 return aarch64_expand_vec_perm_const_1 (&d);
26169 rtx_insn *last = get_last_insn ();
26170 bool ret = aarch64_expand_vec_perm_const_1 (&d);
26171 gcc_assert (last == get_last_insn ());
26173 return ret;
26175 /* Generate a byte permute mask for a register of mode MODE,
26176 which has NUNITS units. */
26178 rtx
26179 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
26181 /* We have to reverse each vector because we don't have
26182 a permuted load that can reverse-load according to ABI rules. */
26183 rtx mask;
26184 rtvec v = rtvec_alloc (16);
26185 unsigned int i, j;
26186 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
26188 gcc_assert (BYTES_BIG_ENDIAN);
26189 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
26191 for (i = 0; i < nunits; i++)
26192 for (j = 0; j < usize; j++)
26193 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
26194 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
26195 return force_reg (V16QImode, mask);
26198 /* Expand an SVE integer comparison using the SVE equivalent of:
26200 (set TARGET (CODE OP0 OP1)). */
26202 void
26203 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
26205 machine_mode pred_mode = GET_MODE (target);
26206 machine_mode data_mode = GET_MODE (op0);
26207 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
26208 op0, op1);
26209 if (!rtx_equal_p (target, res))
26210 emit_move_insn (target, res);
26213 /* Return the UNSPEC_COND_* code for comparison CODE. */
26215 static unsigned int
26216 aarch64_unspec_cond_code (rtx_code code)
26218 switch (code)
26220 case NE:
26221 return UNSPEC_COND_FCMNE;
26222 case EQ:
26223 return UNSPEC_COND_FCMEQ;
26224 case LT:
26225 return UNSPEC_COND_FCMLT;
26226 case GT:
26227 return UNSPEC_COND_FCMGT;
26228 case LE:
26229 return UNSPEC_COND_FCMLE;
26230 case GE:
26231 return UNSPEC_COND_FCMGE;
26232 case UNORDERED:
26233 return UNSPEC_COND_FCMUO;
26234 default:
26235 gcc_unreachable ();
26239 /* Emit:
26241 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26243 where <X> is the operation associated with comparison CODE.
26244 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26246 static void
26247 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
26248 bool known_ptrue_p, rtx op0, rtx op1)
26250 rtx flag = gen_int_mode (known_ptrue_p, SImode);
26251 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
26252 gen_rtvec (4, pred, flag, op0, op1),
26253 aarch64_unspec_cond_code (code));
26254 emit_set_insn (target, unspec);
26257 /* Emit the SVE equivalent of:
26259 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26260 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26261 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26263 where <Xi> is the operation associated with comparison CODEi.
26264 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26266 static void
26267 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
26268 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
26270 machine_mode pred_mode = GET_MODE (pred);
26271 rtx tmp1 = gen_reg_rtx (pred_mode);
26272 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
26273 rtx tmp2 = gen_reg_rtx (pred_mode);
26274 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
26275 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
26278 /* Emit the SVE equivalent of:
26280 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26281 (set TARGET (not TMP))
26283 where <X> is the operation associated with comparison CODE.
26284 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26286 static void
26287 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
26288 bool known_ptrue_p, rtx op0, rtx op1)
26290 machine_mode pred_mode = GET_MODE (pred);
26291 rtx tmp = gen_reg_rtx (pred_mode);
26292 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
26293 aarch64_emit_unop (target, one_cmpl_optab, tmp);
26296 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26298 (set TARGET (CODE OP0 OP1))
26300 If CAN_INVERT_P is true, the caller can also handle inverted results;
26301 return true if the result is in fact inverted. */
26303 bool
26304 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
26305 rtx op0, rtx op1, bool can_invert_p)
26307 machine_mode pred_mode = GET_MODE (target);
26308 machine_mode data_mode = GET_MODE (op0);
26310 rtx ptrue = aarch64_ptrue_reg (pred_mode);
26311 switch (code)
26313 case UNORDERED:
26314 /* UNORDERED has no immediate form. */
26315 op1 = force_reg (data_mode, op1);
26316 /* fall through */
26317 case LT:
26318 case LE:
26319 case GT:
26320 case GE:
26321 case EQ:
26322 case NE:
26324 /* There is native support for the comparison. */
26325 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26326 return false;
26329 case LTGT:
26330 /* This is a trapping operation (LT or GT). */
26331 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
26332 return false;
26334 case UNEQ:
26335 if (!flag_trapping_math)
26337 /* This would trap for signaling NaNs. */
26338 op1 = force_reg (data_mode, op1);
26339 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
26340 ptrue, true, op0, op1);
26341 return false;
26343 /* fall through */
26344 case UNLT:
26345 case UNLE:
26346 case UNGT:
26347 case UNGE:
26348 if (flag_trapping_math)
26350 /* Work out which elements are ordered. */
26351 rtx ordered = gen_reg_rtx (pred_mode);
26352 op1 = force_reg (data_mode, op1);
26353 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
26354 ptrue, true, op0, op1);
26356 /* Test the opposite condition for the ordered elements,
26357 then invert the result. */
26358 if (code == UNEQ)
26359 code = NE;
26360 else
26361 code = reverse_condition_maybe_unordered (code);
26362 if (can_invert_p)
26364 aarch64_emit_sve_fp_cond (target, code,
26365 ordered, false, op0, op1);
26366 return true;
26368 aarch64_emit_sve_invert_fp_cond (target, code,
26369 ordered, false, op0, op1);
26370 return false;
26372 break;
26374 case ORDERED:
26375 /* ORDERED has no immediate form. */
26376 op1 = force_reg (data_mode, op1);
26377 break;
26379 default:
26380 gcc_unreachable ();
26383 /* There is native support for the inverse comparison. */
26384 code = reverse_condition_maybe_unordered (code);
26385 if (can_invert_p)
26387 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26388 return true;
26390 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
26391 return false;
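/* Editorial sketch, not part of the port: a scalar model of the
   flag_trapping_math path above for UNGE.  Only the lanes known to be
   ordered are compared with the reversed condition (LT), and the result is
   then inverted, giving "unordered or GE" without evaluating a comparison
   that could trap on a NaN lane.  */
static bool
unge_lane_sketch (double x, double y)
{
  bool ordered = !(x != x || y != y);		/* inverted UNORDERED */
  bool lt_if_ordered = ordered && x < y;	/* FCMLT under that predicate */
  return !lt_if_ordered;			/* invert -> UNGE */
}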
26394 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26395 of the data being selected and CMP_MODE is the mode of the values being
26396 compared. */
26398 void
26399 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
26400 rtx *ops)
26402 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
26403 rtx pred = gen_reg_rtx (pred_mode);
26404 if (FLOAT_MODE_P (cmp_mode))
26406 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
26407 ops[4], ops[5], true))
26408 std::swap (ops[1], ops[2]);
26410 else
26411 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
26413 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
26414 ops[1] = force_reg (data_mode, ops[1]);
26415 /* The "false" value can only be zero if the "true" value is a constant. */
26416 if (register_operand (ops[1], data_mode)
26417 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
26418 ops[2] = force_reg (data_mode, ops[2]);
26420 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
26421 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
26424 /* Return true if:
26426 (a) MODE1 and MODE2 use the same layout for bytes that are common
26427 to both modes;
26429 (b) subregs involving the two modes behave as the target-independent
26430 subreg rules require; and
26432 (c) there is at least one register that can hold both modes.
26434 Return false otherwise. */
26436 static bool
26437 aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
26439 unsigned int flags1 = aarch64_classify_vector_mode (mode1);
26440 unsigned int flags2 = aarch64_classify_vector_mode (mode2);
26442 bool sve1_p = (flags1 & VEC_ANY_SVE);
26443 bool sve2_p = (flags2 & VEC_ANY_SVE);
26445 bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
26446 bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
26448 bool pred1_p = (flags1 & VEC_SVE_PRED);
26449 bool pred2_p = (flags2 & VEC_SVE_PRED);
26451 bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
26452 | VEC_PARTIAL));
26453 bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
26454 | VEC_PARTIAL));
26456 /* Don't allow changes between predicate modes and other modes.
26457 Only predicate registers can hold predicate modes and only
26458 non-predicate registers can hold non-predicate modes, so any
26459 attempt to mix them would require a round trip through memory. */
26460 if (pred1_p != pred2_p)
26461 return false;
26463 /* The contents of partial SVE modes are distributed evenly across
26464 the register, whereas GCC expects them to be clustered together.
26465 We therefore need to be careful about mode changes involving them. */
26466 if (partial_sve1_p && partial_sve2_p)
26468 /* Reject changes between partial SVE modes that have different
26469 patterns of significant and insignificant bits. */
26470 if ((aarch64_sve_container_bits (mode1)
26471 != aarch64_sve_container_bits (mode2))
26472 || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26473 return false;
26475 else if (partial_sve1_p)
26477 /* The first lane of MODE1 is where GCC expects it, but anything
26478 bigger than that is not. */
26479 if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
26480 return false;
26482 else if (partial_sve2_p)
26484 /* Similarly in reverse. */
26485 if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
26486 return false;
26489 /* Don't allow changes between partial Advanced SIMD structure modes
26490 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26491 are the same size, but the former occupies one Q register while the
26492 latter occupies two D registers. */
26493 if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
26494 && maybe_gt (GET_MODE_SIZE (mode1), 8)
26495 && maybe_gt (GET_MODE_SIZE (mode2), 8))
26496 return false;
26498 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26500 /* Don't allow changes between SVE modes and other modes that might
26501 be bigger than 128 bits. In particular, OImode, CImode and XImode
26502 divide into 128-bit quantities while SVE modes divide into
26503 BITS_PER_SVE_VECTOR quantities. */
26504 if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
26505 return false;
26506 if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
26507 return false;
26510 if (BYTES_BIG_ENDIAN)
26512 /* Don't allow changes between SVE data modes and non-SVE modes.
26513 See the comment at the head of aarch64-sve.md for details. */
26514 if (sve1_p != sve2_p)
26515 return false;
26517 /* Don't allow changes in element size: lane 0 of the new vector
26518 would not then be lane 0 of the old vector. See the comment
26519 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26520 description.
26522 In the worst case, this forces a register to be spilled in
26523 one mode and reloaded in the other, which handles the
26524 endianness correctly. */
26525 if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26526 return false;
26528 return true;
26531 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
26532 to aarch64_modes_compatible_p. However due to issues with register
26533 allocation it is preferable to avoid tying integer scalar and FP
26534 scalar modes. Executing integer operations in general registers is
26535 better than treating them as scalar vector operations. This reduces
26536 latency and avoids redundant int<->FP moves. So tie modes if they
26537 are either the same class, or one of them is a vector mode. */
26539 static bool
26540 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
26542 if (aarch64_modes_compatible_p (mode1, mode2))
26544 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
26545 return true;
26546 if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
26547 return true;
26549 return false;
26552 /* Return a new RTX holding the result of moving POINTER forward by
26553 AMOUNT bytes. */
26555 static rtx
26556 aarch64_move_pointer (rtx pointer, poly_int64 amount)
26558 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
26560 return adjust_automodify_address (pointer, GET_MODE (pointer),
26561 next, amount);
26564 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
26565 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
26566 rather than memcpy. Return true iff we succeeded. */
26567 bool
26568 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
26570 if (!TARGET_MOPS)
26571 return false;
26573 /* All three registers are changed by the instruction, so each one
26574 must be a fresh pseudo. */
26575 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26576 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
26577 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26578 rtx src_mem = replace_equiv_address (operands[1], src_addr);
26579 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
26580 if (is_memmove)
26581 emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
26582 else
26583 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
26584 return true;
26587 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26588 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26589 if this is a memmove rather than memcpy. Return true if we succeed,
26590 otherwise return false, indicating that a libcall should be emitted. */
26591 bool
26592 aarch64_expand_cpymem (rtx *operands, bool is_memmove)
26594 int mode_bytes;
26595 rtx dst = operands[0];
26596 rtx src = operands[1];
26597 unsigned align = UINTVAL (operands[3]);
26598 rtx base;
26599 machine_mode mode = BLKmode, next_mode;
26601 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26602 if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
26603 return aarch64_expand_cpymem_mops (operands, is_memmove);
26605 unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
26607 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26608 unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
26609 unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
26610 : aarch64_mops_memcpy_size_threshold;
26612 /* Reduce the maximum size with -Os. */
26613 if (optimize_function_for_size_p (cfun))
26614 max_copy_size /= 4;
26616 /* Large copies use MOPS when available or a library call. */
26617 if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
26618 return aarch64_expand_cpymem_mops (operands, is_memmove);
26620 /* Default to 32-byte LDP/STP on large copies; small copies or targets
26621 without SIMD support fall back to 16-byte chunks.
26622 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26623 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26624 whether that would improve performance. */
26625 bool use_qregs = size > 24 && TARGET_SIMD;
26627 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26628 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26630 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
26631 src = adjust_automodify_address (src, VOIDmode, base, 0);
26633 auto_vec<std::pair<rtx, rtx>, 16> ops;
26634 int offset = 0;
26636 while (size > 0)
26638 /* Find the largest mode in which to do the copy without over-reading
26639 or over-writing. */
26640 opt_scalar_int_mode mode_iter;
26641 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26642 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
26643 mode = mode_iter.require ();
26645 gcc_assert (mode != BLKmode);
26647 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
26649 /* Prefer Q-register accesses. */
26650 if (mode_bytes == 16 && use_qregs)
26651 mode = V4SImode;
26653 rtx reg = gen_reg_rtx (mode);
26654 rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
26655 rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
26656 ops.safe_push ({ load, store });
26657 size -= mode_bytes;
26658 offset += mode_bytes;
26660 /* Emit trailing copies using overlapping unaligned accesses
26661 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26662 if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
26664 next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
26665 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26666 gcc_assert (n_bytes <= mode_bytes);
26667 offset -= n_bytes - size;
26668 size = n_bytes;
26672 /* Memcpy interleaves loads with stores; memmove emits all loads first. */
26673 int nops = ops.length();
26674 int inc = is_memmove || nops <= 8 ? nops : 6;
26676 for (int i = 0; i < nops; i += inc)
26678 int m = MIN (nops, i + inc);
26679 /* Emit loads. */
26680 for (int j = i; j < m; j++)
26681 emit_insn (ops[j].first);
26682 /* Emit stores. */
26683 for (int j = i; j < m; j++)
26684 emit_insn (ops[j].second);
26686 return true;
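/* Editorial sketch, not part of the port: the chunking scheme above on plain
   integers, for the !STRICT_ALIGNMENT case.  Accesses are capped at 16 bytes
   and the final access is slid backwards so that it overlaps the previous
   one instead of running past the end; for example, a 27-byte copy becomes a
   16-byte access at offset 0 followed by a 16-byte access at offset 11.  The
   caller-provided arrays are assumed to be large enough (a copy of at most
   max_copy_size bytes needs only a handful of chunks).  */
static unsigned int
copy_chunks_sketch (unsigned int size, unsigned int chunk_bytes[],
		    unsigned int chunk_offset[])
{
  unsigned int n = 0, offset = 0;
  while (size > 0)
    {
      /* Largest power-of-two access no bigger than min (size, 16).  */
      unsigned int bytes = 1;
      while (bytes * 2 <= (size < 16 ? size : 16))
	bytes *= 2;
      chunk_bytes[n] = bytes;
      chunk_offset[n] = offset;
      n++;
      size -= bytes;
      offset += bytes;
      /* Turn a short tail into one overlapping access.  */
      if (size > 0 && size < 16)
	{
	  unsigned int tail = 1;
	  while (tail < size)
	    tail *= 2;
	  offset -= tail - size;
	  size = tail;
	}
    }
  return n;
}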
26689 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
26690 as for the setmem pattern. Return true iff we succeed. */
26691 static bool
26692 aarch64_expand_setmem_mops (rtx *operands)
26694 if (!TARGET_MOPS)
26695 return false;
26697 /* The first two registers are changed by the instruction, so both
26698 of them must be a fresh pseudo. */
26699 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26700 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26701 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
26702 rtx val = operands[2];
26703 if (val != CONST0_RTX (QImode))
26704 val = force_reg (QImode, val);
26705 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
26706 return true;
26709 /* Expand setmem, as if from a __builtin_memset. Return true if
26710 we succeed, otherwise return false. */
26712 bool
26713 aarch64_expand_setmem (rtx *operands)
26715 int mode_bytes;
26716 unsigned HOST_WIDE_INT len;
26717 rtx dst = operands[0];
26718 rtx val = operands[2], src;
26719 unsigned align = UINTVAL (operands[3]);
26720 rtx base;
26721 machine_mode mode = BLKmode, next_mode;
26723 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26724 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
26725 || (STRICT_ALIGNMENT && align < 16))
26726 return aarch64_expand_setmem_mops (operands);
26728 /* Set inline limits for memset. MOPS has a separate threshold. */
26729 unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
26730 unsigned mops_threshold = aarch64_mops_memset_size_threshold;
26732 len = UINTVAL (operands[1]);
26734 /* Large memset uses MOPS when available or a library call. */
26735 if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
26736 return aarch64_expand_setmem_mops (operands);
26738 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26739 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26741 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26742 val = expand_vector_broadcast (V16QImode, val);
26743 val = force_reg (V16QImode, val);
26745 int offset = 0;
26746 while (len > 0)
26748 /* Find the largest mode in which to do the copy without
26749 over-writing. */
26750 opt_scalar_int_mode mode_iter;
26751 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26752 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
26753 mode = mode_iter.require ();
26755 gcc_assert (mode != BLKmode);
26757 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
26759 src = val;
26761 /* Prefer Q-register accesses. */
26762 if (mode_bytes == 16)
26763 mode = V16QImode;
26764 else
26765 src = lowpart_subreg (mode, src, GET_MODE (val));
26767 emit_move_insn (adjust_address (dst, mode, offset), src);
26768 len -= mode_bytes;
26769 offset += mode_bytes;
26771 /* Emit trailing writes using overlapping unaligned accesses
26772 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26773 if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
26775 next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
26776 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26777 gcc_assert (n_bytes <= mode_bytes);
26778 offset -= n_bytes - len;
26779 len = n_bytes;
26783 return true;
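/* Editor's note (illustrative sketch, not part of the original source):
   for a constant 28-byte memset of a non-zero byte held in w2, with
   !STRICT_ALIGNMENT, the loop above would emit roughly

     dup  v0.16b, w2
     str  q0, [dst]
     str  q0, [dst, 12]

   where the second store deliberately overlaps the first by 4 bytes
   rather than writing past the end of the region.  */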
26787 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
26788 SImode stores. Handle the case when the constant has identical
26789 bottom and top halves. This is beneficial when the two stores can be
26790 merged into an STP and we avoid synthesising potentially expensive
26791 immediates twice. Return true if such a split is possible. */
26793 bool
26794 aarch64_split_dimode_const_store (rtx dst, rtx src)
26796 rtx lo = gen_lowpart (SImode, src);
26797 rtx hi = gen_highpart_mode (SImode, DImode, src);
26799 if (!rtx_equal_p (lo, hi))
26800 return false;
26802 unsigned int orig_cost
26803 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
26804 unsigned int lo_cost
26805 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
26807 /* We want to transform:
26808 MOV x1, 49370
26809 MOVK x1, 0x140, lsl 16
26810 MOVK x1, 0xc0da, lsl 32
26811 MOVK x1, 0x140, lsl 48
26812 STR x1, [x0]
26813 into:
26814 MOV w1, 49370
26815 MOVK w1, 0x140, lsl 16
26816 STP w1, w1, [x0]
26817 So we want to perform this when we save at least one instruction. */
26818 if (orig_cost <= lo_cost)
26819 return false;
26821 rtx mem_lo = adjust_address (dst, SImode, 0);
26822 if (!aarch64_mem_pair_operand (mem_lo, SImode))
26823 return false;
26825 rtx tmp_reg = gen_reg_rtx (SImode);
26826 aarch64_expand_mov_immediate (tmp_reg, lo);
26827 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
26828 /* Don't emit an explicit store pair as this may not always be profitable.
26829 Let the sched-fusion logic decide whether to merge them. */
26830 emit_move_insn (mem_lo, tmp_reg);
26831 emit_move_insn (mem_hi, tmp_reg);
26833 return true;
26836 /* Generate RTL for a conditional branch with rtx comparison CODE in
26837 mode CC_MODE. The destination of the unlikely conditional branch
26838 is LABEL_REF. */
26840 void
26841 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
26842 rtx label_ref)
26844 rtx x;
26845 x = gen_rtx_fmt_ee (code, VOIDmode,
26846 gen_rtx_REG (cc_mode, CC_REGNUM),
26847 const0_rtx);
26849 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
26850 gen_rtx_LABEL_REF (VOIDmode, label_ref),
26851 pc_rtx);
26852 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
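/* Editor's note (clarification, not part of the original source): the
   jump emitted above has the usual conditional-branch RTL shape
     (set (pc) (if_then_else (CODE (reg CC) 0) (label_ref L) (pc)))
   and aarch64_emit_unlikely_jump presumably attaches a very low branch
   probability so that later passes lay the branch out as not taken.  */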
26855 /* Generate DImode scratch registers for 128-bit (TImode) addition.
26857 OP1 represents the TImode source operand 1
26858 OP2 represents the TImode source operand 2
26859 LOW_DEST represents the low half (DImode) of TImode operand 0
26860 LOW_IN1 represents the low half (DImode) of TImode operand 1
26861 LOW_IN2 represents the low half (DImode) of TImode operand 2
26862 HIGH_DEST represents the high half (DImode) of TImode operand 0
26863 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26864 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26866 void
26867 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26868 rtx *low_in1, rtx *low_in2,
26869 rtx *high_dest, rtx *high_in1,
26870 rtx *high_in2)
26872 *low_dest = gen_reg_rtx (DImode);
26873 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
26874 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
26875 *high_dest = gen_reg_rtx (DImode);
26876 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
26877 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
26880 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
26882 OP1 represents the TImode source operand 1
26883 OP2 represents the TImode source operand 2
26884 LOW_DEST represents the low half (DImode) of TImode operand 0
26885 LOW_IN1 represents the low half (DImode) of TImode operand 1
26886 LOW_IN2 represents the low half (DImode) of TImode operand 2
26887 HIGH_DEST represents the high half (DImode) of TImode operand 0
26888 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26889 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26892 void
26893 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26894 rtx *low_in1, rtx *low_in2,
26895 rtx *high_dest, rtx *high_in1,
26896 rtx *high_in2)
26898 *low_dest = gen_reg_rtx (DImode);
26899 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
26900 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
26901 *high_dest = gen_reg_rtx (DImode);
26903 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
26904 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
26907 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
26909 OP0 represents the TImode destination operand 0
26910 LOW_DEST represents the low half (DImode) of TImode operand 0
26911 LOW_IN1 represents the low half (DImode) of TImode operand 1
26912 LOW_IN2 represents the low half (DImode) of TImode operand 2
26913 HIGH_DEST represents the high half (DImode) of TImode operand 0
26914 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26915 HIGH_IN2 represents the high half (DImode) of TImode operand 2
26916 UNSIGNED_P is true if the operation is being performed on unsigned
26917 values. */
26918 void
26919 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
26920 rtx low_in2, rtx high_dest, rtx high_in1,
26921 rtx high_in2, bool unsigned_p)
26923 if (low_in2 == const0_rtx)
26925 low_dest = low_in1;
26926 high_in2 = force_reg (DImode, high_in2);
26927 if (unsigned_p)
26928 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
26929 else
26930 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
26932 else
26934 if (aarch64_plus_immediate (low_in2, DImode))
26935 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
26936 GEN_INT (-UINTVAL (low_in2))));
26937 else
26939 low_in2 = force_reg (DImode, low_in2);
26940 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
26942 high_in2 = force_reg (DImode, high_in2);
26944 if (unsigned_p)
26945 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
26946 else
26947 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
26950 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
26951 emit_move_insn (gen_highpart (DImode, op0), high_dest);
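/* Editor's note (illustrative sketch, not part of the original source):
   for the general case above, the emitted sequence corresponds roughly to

     subs x_lo, x1_lo, x2_lo    // low halves, setting the carry/borrow
     sbcs x_hi, x1_hi, x2_hi    // high halves, consuming the borrow

   with the caller testing the resulting C (unsigned) or V (signed) flag
   to detect overflow.  */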
26955 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
26957 static unsigned HOST_WIDE_INT
26958 aarch64_asan_shadow_offset (void)
26960 if (TARGET_ILP32)
26961 return (HOST_WIDE_INT_1 << 29);
26962 else
26963 return (HOST_WIDE_INT_1 << 36);
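/* Editor's note (background, not part of the original source): ASan
   computes a shadow address roughly as (addr >> 3) + offset, so the
   values above place the shadow region at 1<<29 for ILP32 and 1<<36 for
   LP64; e.g. for an LP64 address A the shadow byte would live at
   (A >> 3) + 0x1000000000.  */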
26966 static rtx
26967 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
26968 rtx_code code, tree treeop0, tree treeop1)
26970 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
26971 rtx op0, op1;
26972 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
26973 insn_code icode;
26974 struct expand_operand ops[4];
26976 start_sequence ();
26977 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
26979 op_mode = GET_MODE (op0);
26980 if (op_mode == VOIDmode)
26981 op_mode = GET_MODE (op1);
26983 switch (op_mode)
26985 case E_QImode:
26986 case E_HImode:
26987 case E_SImode:
26988 cmp_mode = SImode;
26989 icode = CODE_FOR_cmpsi;
26990 break;
26992 case E_DImode:
26993 cmp_mode = DImode;
26994 icode = CODE_FOR_cmpdi;
26995 break;
26997 case E_SFmode:
26998 cmp_mode = SFmode;
26999 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
27000 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
27001 break;
27003 case E_DFmode:
27004 cmp_mode = DFmode;
27005 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
27006 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
27007 break;
27009 default:
27010 end_sequence ();
27011 return NULL_RTX;
27014 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
27015 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
27016 if (!op0 || !op1)
27018 end_sequence ();
27019 return NULL_RTX;
27021 *prep_seq = get_insns ();
27022 end_sequence ();
27024 create_fixed_operand (&ops[0], op0);
27025 create_fixed_operand (&ops[1], op1);
27027 start_sequence ();
27028 if (!maybe_expand_insn (icode, 2, ops))
27030 end_sequence ();
27031 return NULL_RTX;
27033 *gen_seq = get_insns ();
27034 end_sequence ();
27036 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
27037 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
27040 static rtx
27041 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27042 rtx_code cmp_code, tree treeop0, tree treeop1,
27043 rtx_code bit_code)
27045 rtx op0, op1, target;
27046 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27047 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27048 insn_code icode;
27049 struct expand_operand ops[6];
27050 int aarch64_cond;
27052 push_to_sequence (*prep_seq);
27053 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27055 op_mode = GET_MODE (op0);
27056 if (op_mode == VOIDmode)
27057 op_mode = GET_MODE (op1);
27059 switch (op_mode)
27061 case E_QImode:
27062 case E_HImode:
27063 case E_SImode:
27064 cmp_mode = SImode;
27065 break;
27067 case E_DImode:
27068 cmp_mode = DImode;
27069 break;
27071 case E_SFmode:
27072 cmp_mode = SFmode;
27073 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27074 break;
27076 case E_DFmode:
27077 cmp_mode = DFmode;
27078 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27079 break;
27081 default:
27082 end_sequence ();
27083 return NULL_RTX;
27086 icode = code_for_ccmp (cc_mode, cmp_mode);
27088 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27089 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27090 if (!op0 || !op1)
27092 end_sequence ();
27093 return NULL_RTX;
27095 *prep_seq = get_insns ();
27096 end_sequence ();
27098 target = gen_rtx_REG (cc_mode, CC_REGNUM);
27099 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
27101 if (bit_code != AND)
27103 /* Treat the ccmp patterns as canonical and use them where possible,
27104 but fall back to ccmp_rev patterns if there's no other option. */
27105 rtx_code prev_code = GET_CODE (prev);
27106 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
27107 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
27108 && !(prev_code == EQ
27109 || prev_code == NE
27110 || prev_code == ORDERED
27111 || prev_code == UNORDERED))
27112 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
27113 else
27115 rtx_code code = reverse_condition (prev_code);
27116 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
27118 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
27121 create_fixed_operand (&ops[0], XEXP (prev, 0));
27122 create_fixed_operand (&ops[1], target);
27123 create_fixed_operand (&ops[2], op0);
27124 create_fixed_operand (&ops[3], op1);
27125 create_fixed_operand (&ops[4], prev);
27126 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
27128 push_to_sequence (*gen_seq);
27129 if (!maybe_expand_insn (icode, 6, ops))
27131 end_sequence ();
27132 return NULL_RTX;
27135 *gen_seq = get_insns ();
27136 end_sequence ();
27138 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
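/* Editor's note (illustrative sketch, not part of the original source):
   together the two hooks above let a condition such as
     if (a == b && c < d)
   be expanded as a CMP followed by a CCMP, roughly

     cmp   w0, w1
     ccmp  w2, w3, #<nzcv>, eq
     b.lt  .Ltaken

   where the #<nzcv> immediate supplies the flag values to use when the
   first comparison already decides the result.  */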
27141 #undef TARGET_GEN_CCMP_FIRST
27142 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27144 #undef TARGET_GEN_CCMP_NEXT
27145 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27147 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27148 instruction fusion of some sort. */
27150 static bool
27151 aarch64_macro_fusion_p (void)
27153 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
27157 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27158 should be kept together during scheduling. */
27160 static bool
27161 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
27163 rtx set_dest;
27164 rtx prev_set = single_set (prev);
27165 rtx curr_set = single_set (curr);
27166 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27167 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
27169 if (!aarch64_macro_fusion_p ())
27170 return false;
27172 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
27174 /* We are trying to match:
27175 prev (mov) == (set (reg r0) (const_int imm16))
27176 curr (movk) == (set (zero_extract (reg r0)
27177 (const_int 16)
27178 (const_int 16))
27179 (const_int imm16_1)) */
27181 set_dest = SET_DEST (curr_set);
27183 if (GET_CODE (set_dest) == ZERO_EXTRACT
27184 && CONST_INT_P (SET_SRC (curr_set))
27185 && CONST_INT_P (SET_SRC (prev_set))
27186 && CONST_INT_P (XEXP (set_dest, 2))
27187 && INTVAL (XEXP (set_dest, 2)) == 16
27188 && REG_P (XEXP (set_dest, 0))
27189 && REG_P (SET_DEST (prev_set))
27190 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
27192 return true;
27196 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
27199 /* We're trying to match:
27200 prev (adrp) == (set (reg r1)
27201 (high (symbol_ref ("SYM"))))
27202 curr (add) == (set (reg r0)
27203 (lo_sum (reg r1)
27204 (symbol_ref ("SYM"))))
27205 Note that r0 need not necessarily be the same as r1, especially
27206 during pre-regalloc scheduling. */
27208 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27209 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27211 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
27212 && REG_P (XEXP (SET_SRC (curr_set), 0))
27213 && REGNO (XEXP (SET_SRC (curr_set), 0))
27214 == REGNO (SET_DEST (prev_set))
27215 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
27216 XEXP (SET_SRC (curr_set), 1)))
27217 return true;
27221 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
27224 /* We're trying to match:
27225 prev (movk) == (set (zero_extract (reg r0)
27226 (const_int 16)
27227 (const_int 32))
27228 (const_int imm16_1))
27229 curr (movk) == (set (zero_extract (reg r0)
27230 (const_int 16)
27231 (const_int 48))
27232 (const_int imm16_2)) */
27234 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
27235 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
27236 && REG_P (XEXP (SET_DEST (prev_set), 0))
27237 && REG_P (XEXP (SET_DEST (curr_set), 0))
27238 && REGNO (XEXP (SET_DEST (prev_set), 0))
27239 == REGNO (XEXP (SET_DEST (curr_set), 0))
27240 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
27241 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
27242 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
27243 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
27244 && CONST_INT_P (SET_SRC (prev_set))
27245 && CONST_INT_P (SET_SRC (curr_set)))
27246 return true;
27249 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
27251 /* We're trying to match:
27252 prev (adrp) == (set (reg r0)
27253 (high (symbol_ref ("SYM"))))
27254 curr (ldr) == (set (reg r1)
27255 (mem (lo_sum (reg r0)
27256 (symbol_ref ("SYM")))))
27258 curr (ldr) == (set (reg r1)
27259 (zero_extend (mem
27260 (lo_sum (reg r0)
27261 (symbol_ref ("SYM")))))) */
27262 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27263 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27265 rtx curr_src = SET_SRC (curr_set);
27267 if (GET_CODE (curr_src) == ZERO_EXTEND)
27268 curr_src = XEXP (curr_src, 0);
27270 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
27271 && REG_P (XEXP (XEXP (curr_src, 0), 0))
27272 && REGNO (XEXP (XEXP (curr_src, 0), 0))
27273 == REGNO (SET_DEST (prev_set))
27274 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
27275 XEXP (SET_SRC (prev_set), 0)))
27276 return true;
27280 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27281 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
27282 && prev_set && curr_set && any_condjump_p (curr)
27283 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27284 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27285 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27286 return true;
27288 /* Fuse flag-setting ALU instructions and conditional branch. */
27289 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
27290 && any_condjump_p (curr))
27292 unsigned int condreg1, condreg2;
27293 rtx cc_reg_1;
27294 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
27295 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
27297 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
27298 && prev
27299 && modified_in_p (cc_reg_1, prev))
27301 enum attr_type prev_type = get_attr_type (prev);
27303 /* FIXME: this misses some instructions which are considered simple
27304 arithmetic for ThunderX. Simple shifts are missed here. */
27305 if (prev_type == TYPE_ALUS_SREG
27306 || prev_type == TYPE_ALUS_IMM
27307 || prev_type == TYPE_LOGICS_REG
27308 || prev_type == TYPE_LOGICS_IMM)
27309 return true;
27313 /* Fuse ALU instructions and CBZ/CBNZ. */
27314 if (prev_set
27315 && curr_set
27316 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
27317 && any_condjump_p (curr))
27319 /* We're trying to match:
27320 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27321 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27322 (const_int 0))
27323 (label_ref ("SYM"))
27324 (pc)) */
27325 if (SET_DEST (curr_set) == (pc_rtx)
27326 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27327 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
27328 && REG_P (SET_DEST (prev_set))
27329 && REGNO (SET_DEST (prev_set))
27330 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
27332 /* Fuse ALU operations followed by conditional branch instruction. */
27333 switch (get_attr_type (prev))
27335 case TYPE_ALU_IMM:
27336 case TYPE_ALU_SREG:
27337 case TYPE_ADC_REG:
27338 case TYPE_ADC_IMM:
27339 case TYPE_ADCS_REG:
27340 case TYPE_ADCS_IMM:
27341 case TYPE_LOGIC_REG:
27342 case TYPE_LOGIC_IMM:
27343 case TYPE_CSEL:
27344 case TYPE_ADR:
27345 case TYPE_MOV_IMM:
27346 case TYPE_SHIFT_REG:
27347 case TYPE_SHIFT_IMM:
27348 case TYPE_BFM:
27349 case TYPE_RBIT:
27350 case TYPE_REV:
27351 case TYPE_EXTEND:
27352 return true;
27354 default:;
27359 /* Fuse A+B+1 and A-B-1 */
27360 if (simple_sets_p
27361 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
27363 /* We're trying to match:
27364 prev == (set (r0) (plus (r0) (r1)))
27365 curr == (set (r0) (plus (r0) (const_int 1)))
27367 prev == (set (r0) (minus (r0) (r1)))
27368 curr == (set (r0) (plus (r0) (const_int -1))) */
27370 rtx prev_src = SET_SRC (prev_set);
27371 rtx curr_src = SET_SRC (curr_set);
27373 int polarity = 1;
27374 if (GET_CODE (prev_src) == MINUS)
27375 polarity = -1;
27377 if (GET_CODE (curr_src) == PLUS
27378 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
27379 && CONST_INT_P (XEXP (curr_src, 1))
27380 && INTVAL (XEXP (curr_src, 1)) == polarity
27381 && REG_P (XEXP (curr_src, 0))
27382 && REG_P (SET_DEST (prev_set))
27383 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
27384 return true;
27387 return false;
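/* Editor's note (illustrative, not part of the original source): typical
   pairs accepted above include, roughly,

     mov  x0, 0x1234         then  movk x0, 0x5678, lsl 16   (MOV+MOVK)
     adrp x1, sym            then  add  x0, x1, :lo12:sym    (ADRP+ADD)
     subs w0, w0, w1         then  b.ne .L1                  (CMP+BRANCH)

   assuming the corresponding AARCH64_FUSE_* bit is set in the current
   tuning's fusible_ops.  */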
27390 /* Return true iff the instruction fusion described by OP is enabled. */
27392 bool
27393 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
27395 return (aarch64_tune_params.fusible_ops & op) != 0;
27398 /* If MEM is in the form of [base+offset], extract the two parts
27399 of the address into BASE and OFFSET; otherwise return false
27400 after clearing BASE and OFFSET. */
27402 bool
27403 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
27405 rtx addr;
27407 gcc_assert (MEM_P (mem));
27409 addr = XEXP (mem, 0);
27411 if (REG_P (addr))
27413 *base = addr;
27414 *offset = const0_rtx;
27415 return true;
27418 if (GET_CODE (addr) == PLUS
27419 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
27421 *base = XEXP (addr, 0);
27422 *offset = XEXP (addr, 1);
27423 return true;
27426 *base = NULL_RTX;
27427 *offset = NULL_RTX;
27429 return false;
27432 /* Types for scheduling fusion. */
27433 enum sched_fusion_type
27435 SCHED_FUSION_NONE = 0,
27436 SCHED_FUSION_LD_SIGN_EXTEND,
27437 SCHED_FUSION_LD_ZERO_EXTEND,
27438 SCHED_FUSION_LD,
27439 SCHED_FUSION_ST,
27440 SCHED_FUSION_NUM
27443 /* If INSN is a load or store whose address is in the form [base+offset],
27444 extract the two parts into BASE and OFFSET. Return the scheduling
27445 fusion type of this INSN. */
27447 static enum sched_fusion_type
27448 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
27450 rtx x, dest, src;
27451 enum sched_fusion_type fusion = SCHED_FUSION_LD;
27453 gcc_assert (INSN_P (insn));
27454 x = PATTERN (insn);
27455 if (GET_CODE (x) != SET)
27456 return SCHED_FUSION_NONE;
27458 src = SET_SRC (x);
27459 dest = SET_DEST (x);
27461 machine_mode dest_mode = GET_MODE (dest);
27463 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
27464 return SCHED_FUSION_NONE;
27466 if (GET_CODE (src) == SIGN_EXTEND)
27468 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
27469 src = XEXP (src, 0);
27470 if (!MEM_P (src) || GET_MODE (src) != SImode)
27471 return SCHED_FUSION_NONE;
27473 else if (GET_CODE (src) == ZERO_EXTEND)
27475 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
27476 src = XEXP (src, 0);
27477 if (!MEM_P (src) || GET_MODE (src) != SImode)
27478 return SCHED_FUSION_NONE;
27481 if (MEM_P (src) && REG_P (dest))
27482 extract_base_offset_in_addr (src, base, offset);
27483 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
27485 fusion = SCHED_FUSION_ST;
27486 extract_base_offset_in_addr (dest, base, offset);
27488 else
27489 return SCHED_FUSION_NONE;
27491 if (*base == NULL_RTX || *offset == NULL_RTX)
27492 fusion = SCHED_FUSION_NONE;
27494 return fusion;
27497 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
27499 Currently we only support fusing ldr and str instructions, so FUSION_PRI
27500 and PRI are only calculated for these instructions. For other instructions,
27501 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
27502 types of instruction fusion can be added by returning different priorities.
27504 It's important that irrelevant instructions get the largest FUSION_PRI. */
27506 static void
27507 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
27508 int *fusion_pri, int *pri)
27510 int tmp, off_val;
27511 rtx base, offset;
27512 enum sched_fusion_type fusion;
27514 gcc_assert (INSN_P (insn));
27516 tmp = max_pri - 1;
27517 fusion = fusion_load_store (insn, &base, &offset);
27518 if (fusion == SCHED_FUSION_NONE)
27520 *pri = tmp;
27521 *fusion_pri = tmp;
27522 return;
27525 /* Set FUSION_PRI according to fusion type and base register. */
27526 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
27528 /* Calculate PRI. */
27529 tmp /= 2;
27531 /* INSN with smaller offset goes first. */
27532 off_val = (int)(INTVAL (offset));
27533 if (off_val >= 0)
27534 tmp -= (off_val & 0xfffff);
27535 else
27536 tmp += ((- off_val) & 0xfffff);
27538 *pri = tmp;
27539 return;
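/* Editor's note (worked example, not part of the original source): two
   loads such as "ldr w1, [x2, 4]" and "ldr w3, [x2, 8]" receive the same
   FUSION_PRI (same fusion type and same base register x2), while the one
   with the smaller offset gets the larger PRI and is therefore scheduled
   first, leaving the pair adjacent for the ldp peepholes.  */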
27542 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27543 Adjust priority of sha1h instructions so they are scheduled before
27544 other SHA1 instructions. */
27546 static int
27547 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
27549 rtx x = PATTERN (insn);
27551 if (GET_CODE (x) == SET)
27553 x = SET_SRC (x);
27555 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
27556 return priority + 10;
27559 return priority;
27562 /* If REVERSED is null, return true if memory reference *MEM2 comes
27563 immediately after memory reference *MEM1. Do not change the references
27564 in this case.
27566 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27567 if they are, try to make them use constant offsets from the same base
27568 register. Return true on success. When returning true, set *REVERSED
27569 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27570 static bool
27571 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
27573 if (reversed)
27574 *reversed = false;
27576 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
27577 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
27578 return false;
27580 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
27581 return false;
27583 auto size1 = MEM_SIZE (*mem1);
27584 auto size2 = MEM_SIZE (*mem2);
27586 rtx base1, base2, offset1, offset2;
27587 extract_base_offset_in_addr (*mem1, &base1, &offset1);
27588 extract_base_offset_in_addr (*mem2, &base2, &offset2);
27590 /* Make sure at least one memory is in base+offset form. */
27591 if (!(base1 && offset1) && !(base2 && offset2))
27592 return false;
27594 /* If both mems already use the same base register, just check the
27595 offsets. */
27596 if (base1 && base2 && rtx_equal_p (base1, base2))
27598 if (!offset1 || !offset2)
27599 return false;
27601 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
27602 return true;
27604 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
27606 *reversed = true;
27607 return true;
27610 return false;
27613 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27614 guarantee that the values are consecutive. */
27615 if (MEM_EXPR (*mem1)
27616 && MEM_EXPR (*mem2)
27617 && MEM_OFFSET_KNOWN_P (*mem1)
27618 && MEM_OFFSET_KNOWN_P (*mem2))
27620 poly_int64 expr_offset1;
27621 poly_int64 expr_offset2;
27622 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
27623 &expr_offset1);
27624 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
27625 &expr_offset2);
27626 if (!expr_base1
27627 || !expr_base2
27628 || !DECL_P (expr_base1)
27629 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
27630 return false;
27632 expr_offset1 += MEM_OFFSET (*mem1);
27633 expr_offset2 += MEM_OFFSET (*mem2);
27635 if (known_eq (expr_offset1 + size1, expr_offset2))
27637 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
27638 *reversed = true;
27639 else
27640 return false;
27642 if (reversed)
27644 if (base2)
27646 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
27647 expr_offset1 - expr_offset2);
27648 *mem1 = replace_equiv_address_nv (*mem1, addr1);
27650 else
27652 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
27653 expr_offset2 - expr_offset1);
27654 *mem2 = replace_equiv_address_nv (*mem2, addr2);
27657 return true;
27660 return false;
27663 /* Test if MODE is suitable for a single transfer register in an ldp or stp
27664 instruction. */
27666 bool
27667 aarch64_ldpstp_operand_mode_p (machine_mode mode)
27669 if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
27670 || hard_regno_nregs (V0_REGNUM, mode) > 1)
27671 return false;
27673 const auto size = GET_MODE_SIZE (mode);
27674 return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
27677 /* Return true if MEM1 and MEM2 can be combined into a single access
27678 of mode MODE, with the combined access having the same address as MEM1. */
27680 bool
27681 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
27683 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
27684 return false;
27685 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
27688 /* Return true if MEM agrees with the ldp-stp policy model.
27689 Otherwise, false. */
27691 bool
27692 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
27694 auto policy = (load
27695 ? aarch64_tune_params.ldp_policy_model
27696 : aarch64_tune_params.stp_policy_model);
27698 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
27699 if (policy == AARCH64_LDP_STP_POLICY_NEVER)
27700 return false;
27702 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
27703 do not emit the load pair unless the alignment is checked to be
27704 at least double the alignment of the type. */
27705 if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
27706 && !optimize_function_for_size_p (cfun)
27707 && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
27708 return false;
27710 return true;
27713 /* Given OPERANDS of consecutive load/store, check if we can merge
27714 them into ldp/stp. LOAD is true if they are load instructions. */
27716 bool
27717 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load)
27719 enum reg_class rclass_1, rclass_2;
27720 rtx mem_1, mem_2, reg_1, reg_2;
27722 if (load)
27724 mem_1 = operands[1];
27725 mem_2 = operands[3];
27726 reg_1 = operands[0];
27727 reg_2 = operands[2];
27728 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
27729 if (REGNO (reg_1) == REGNO (reg_2))
27730 return false;
27731 if (reg_overlap_mentioned_p (reg_1, mem_2))
27732 return false;
27734 else
27736 mem_1 = operands[0];
27737 mem_2 = operands[2];
27738 reg_1 = operands[1];
27739 reg_2 = operands[3];
27742 /* The mems cannot be volatile. */
27743 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
27744 return false;
27746 /* Check if the addresses are in the form of [base+offset]. */
27747 bool reversed = false;
27748 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
27749 return false;
27751 /* The operands must be of the same size. */
27752 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
27753 GET_MODE_SIZE (GET_MODE (mem_2))));
27755 /* The lower memory access must be a mem-pair operand. */
27756 rtx lower_mem = reversed ? mem_2 : mem_1;
27757 machine_mode lower_mem_mode = GET_MODE (lower_mem);
27758 if (!aarch64_mem_pair_operand (lower_mem, lower_mem_mode))
27759 return false;
27761 /* Check if lower_mem is ok with the ldp-stp policy model. */
27762 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem, load,
27763 lower_mem_mode))
27764 return false;
27766 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
27767 rclass_1 = FP_REGS;
27768 else
27769 rclass_1 = GENERAL_REGS;
27771 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
27772 rclass_2 = FP_REGS;
27773 else
27774 rclass_2 = GENERAL_REGS;
27776 /* Check if the registers are of the same class. */
27777 if (rclass_1 != rclass_2)
27778 return false;
27780 return true;
27783 /* Given OPERANDS of consecutive load/store that can be merged,
27784 swap them if they are not in ascending order. */
27785 void
27786 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
27788 int mem_op = load ? 1 : 0;
27789 bool reversed = false;
27790 if (!aarch64_check_consecutive_mems (operands + mem_op,
27791 operands + mem_op + 2, &reversed))
27792 gcc_unreachable ();
27794 if (reversed)
27796 /* Irrespective of whether this is a load or a store,
27797 we do the same swap. */
27798 std::swap (operands[0], operands[2]);
27799 std::swap (operands[1], operands[3]);
27803 /* Helper function used for generation of load/store pair instructions, called
27804 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
27805 operands as matched by the peepholes in that file. LOAD_P is true if we're
27806 generating a load pair, otherwise we're generating a store pair. CODE is
27807 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
27808 standard load/store pair. */
27810 void
27811 aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
27813 aarch64_swap_ldrstr_operands (operands, load_p);
27815 if (load_p)
27816 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
27817 operands[1], code));
27818 else
27820 gcc_assert (code == UNKNOWN);
27821 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
27822 operands[3]));
27826 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
27827 comparison between the two. */
27829 aarch64_host_wide_int_compare (const void *x, const void *y)
27831 return wi::cmps (* ((const HOST_WIDE_INT *) x),
27832 * ((const HOST_WIDE_INT *) y));
27835 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
27836 other pointing to a REG rtx containing an offset, compare the offsets
27837 of the two pairs.
27839 Return:
27841 1 iff offset (X) > offset (Y)
27842 0 iff offset (X) == offset (Y)
27843 -1 iff offset (X) < offset (Y) */
27845 aarch64_ldrstr_offset_compare (const void *x, const void *y)
27847 const rtx * operands_1 = (const rtx *) x;
27848 const rtx * operands_2 = (const rtx *) y;
27849 rtx mem_1, mem_2, base, offset_1, offset_2;
27851 if (MEM_P (operands_1[0]))
27852 mem_1 = operands_1[0];
27853 else
27854 mem_1 = operands_1[1];
27856 if (MEM_P (operands_2[0]))
27857 mem_2 = operands_2[0];
27858 else
27859 mem_2 = operands_2[1];
27861 /* Extract the offsets. */
27862 extract_base_offset_in_addr (mem_1, &base, &offset_1);
27863 extract_base_offset_in_addr (mem_2, &base, &offset_2);
27865 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
27867 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
27870 /* Given OPERANDS of consecutive load/store, check if we can merge
27871 them into ldp/stp by adjusting the offset. LOAD is true if they
27872 are load instructions. MODE is the mode of memory operands.
27874 Given below consecutive stores:
27876 str w1, [xb, 0x100]
27877 str w1, [xb, 0x104]
27878 str w1, [xb, 0x108]
27879 str w1, [xb, 0x10c]
27881 Though the offsets are out of the range supported by stp, we can
27882 still pair them after adjusting the offset, like:
27884 add scratch, xb, 0x100
27885 stp w1, w1, [scratch]
27886 stp w1, w1, [scratch, 0x8]
27888 The peephole patterns detecting this opportunity should guarantee
27889 the scratch register is available. */
27891 bool
27892 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
27893 machine_mode mode)
27895 const int num_insns = 4;
27896 enum reg_class rclass;
27897 HOST_WIDE_INT offvals[num_insns], msize;
27898 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
27900 if (load)
27902 for (int i = 0; i < num_insns; i++)
27904 reg[i] = operands[2 * i];
27905 mem[i] = operands[2 * i + 1];
27907 gcc_assert (REG_P (reg[i]));
27910 /* Do not attempt to merge the loads if the loads clobber each other. */
27911 for (int i = 0; i < 8; i += 2)
27912 for (int j = i + 2; j < 8; j += 2)
27913 if (reg_overlap_mentioned_p (operands[i], operands[j]))
27914 return false;
27916 else
27917 for (int i = 0; i < num_insns; i++)
27919 mem[i] = operands[2 * i];
27920 reg[i] = operands[2 * i + 1];
27923 /* Skip if memory operand is by itself valid for ldp/stp. */
27924 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
27925 return false;
27927 for (int i = 0; i < num_insns; i++)
27929 /* The mems cannot be volatile. */
27930 if (MEM_VOLATILE_P (mem[i]))
27931 return false;
27933 /* Check if the addresses are in the form of [base+offset]. */
27934 extract_base_offset_in_addr (mem[i], base + i, offset + i);
27935 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
27936 return false;
27939 /* Check if the registers are of the same class. */
27940 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
27941 ? FP_REGS : GENERAL_REGS;
27943 for (int i = 1; i < num_insns; i++)
27944 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
27946 if (rclass != FP_REGS)
27947 return false;
27949 else
27951 if (rclass != GENERAL_REGS)
27952 return false;
27955 /* Only the last register in the order in which they occur
27956 may be clobbered by the load. */
27957 if (rclass == GENERAL_REGS && load)
27958 for (int i = 0; i < num_insns - 1; i++)
27959 if (reg_mentioned_p (reg[i], mem[i]))
27960 return false;
27962 /* Check if the bases are the same. */
27963 for (int i = 0; i < num_insns - 1; i++)
27964 if (!rtx_equal_p (base[i], base[i + 1]))
27965 return false;
27967 for (int i = 0; i < num_insns; i++)
27968 offvals[i] = INTVAL (offset[i]);
27970 msize = GET_MODE_SIZE (mode).to_constant ();
27972 /* Check if the offsets can be put in the right order to do a ldp/stp. */
27973 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
27974 aarch64_host_wide_int_compare);
27976 if (!(offvals[1] == offvals[0] + msize
27977 && offvals[3] == offvals[2] + msize))
27978 return false;
27980 /* Check that offsets are within range of each other. The ldp/stp
27981 instructions have 7-bit immediate offsets, so use 0x80. */
27982 if (offvals[2] - offvals[0] >= msize * 0x80)
27983 return false;
27985 /* The offsets must be aligned with respect to each other. */
27986 if (offvals[0] % msize != offvals[2] % msize)
27987 return false;
27989 /* Check if mem[0] is ok with the ldp-stp policy model. */
27990 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem[0], load, mode))
27991 return false;
27993 return true;
27996 /* Given OPERANDS of consecutive load/store, this function pairs them
27997 into LDP/STP after adjusting the offset. It depends on the fact
27998 that the operands can be sorted so the offsets are correct for STP.
27999 MODE is the mode of memory operands. CODE is the rtl operator
28000 which should be applied to all memory operands; it is SIGN_EXTEND,
28001 ZERO_EXTEND or UNKNOWN. */
28003 bool
28004 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
28005 machine_mode mode, RTX_CODE code)
28007 rtx base, offset_1, offset_2;
28008 rtx mem_1, mem_2;
28009 rtx temp_operands[8];
28010 HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
28011 stp_off_upper_limit, stp_off_lower_limit, msize;
28013 /* We make changes on a copy as we may still bail out. */
28014 for (int i = 0; i < 8; i ++)
28015 temp_operands[i] = operands[i];
28017 /* Sort the operands. Note for cases as below:
28018 [base + 0x310] = A
28019 [base + 0x320] = B
28020 [base + 0x330] = C
28021 [base + 0x320] = D
28022 We need a stable sort, otherwise wrong data may be stored to offset 0x320.
28023 Also note the dead store in the above case should be optimized away, but
28024 there are no guarantees here. */
28025 gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
28026 aarch64_ldrstr_offset_compare);
28028 /* Copy the memory operands so that if we have to bail for some
28029 reason the original addresses are unchanged. */
28030 if (load)
28032 mem_1 = copy_rtx (temp_operands[1]);
28033 mem_2 = copy_rtx (temp_operands[5]);
28035 else
28037 mem_1 = copy_rtx (temp_operands[0]);
28038 mem_2 = copy_rtx (temp_operands[4]);
28039 gcc_assert (code == UNKNOWN);
28042 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28043 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28044 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
28045 && offset_2 != NULL_RTX);
28047 /* Adjust offset so it can fit in LDP/STP instruction. */
28048 msize = GET_MODE_SIZE (mode).to_constant ();
28049 stp_off_upper_limit = msize * (0x40 - 1);
28050 stp_off_lower_limit = - msize * 0x40;
28052 off_val_1 = INTVAL (offset_1);
28053 off_val_2 = INTVAL (offset_2);
28055 /* The base offset is optimally half way between the two STP/LDP offsets. */
28056 if (msize <= 4)
28057 base_off = (off_val_1 + off_val_2) / 2;
28058 else
28059 /* However, due to issues with negative LDP/STP offset generation for
28060 larger modes (DF, DD, DI and vector modes), we must not use negative
28061 addresses smaller than 9 signed unadjusted bits can store. This
28062 provides the most range in this case. */
28063 base_off = off_val_1;
28065 /* Adjust the base so that it is aligned with the addresses but still
28066 optimal. */
28067 if (base_off % msize != off_val_1 % msize)
28068 /* Fix the offset, bearing in mind we want to make it bigger not
28069 smaller. */
28070 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28071 else if (msize <= 4)
28072 /* The negative range of LDP/STP is one larger than the positive range. */
28073 base_off += msize;
28075 /* Check if base offset is too big or too small. We can attempt to resolve
28076 this issue by setting it to the maximum value and seeing if the offsets
28077 still fit. */
28078 if (base_off >= 0x1000)
28080 base_off = 0x1000 - 1;
28081 /* We must still make sure that the base offset is aligned with respect
28082 to the address. But it may not be made any bigger. */
28083 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28086 /* Likewise for the case where the base is too small. */
28087 if (base_off <= -0x1000)
28089 base_off = -0x1000 + 1;
28090 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28093 /* Offset of the first STP/LDP. */
28094 new_off_1 = off_val_1 - base_off;
28096 /* Offset of the second STP/LDP. */
28097 new_off_2 = off_val_2 - base_off;
28099 /* The offsets must be within the range of the LDP/STP instructions. */
28100 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
28101 || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
28102 return false;
28104 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
28105 new_off_1), true);
28106 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
28107 new_off_2), true);
28109 if (!aarch64_mem_pair_operand (mem_1, mode)
28110 || !aarch64_mem_pair_operand (mem_2, mode))
28111 return false;
28113 if (load)
28115 operands[0] = temp_operands[0];
28116 operands[1] = mem_1;
28117 operands[2] = temp_operands[2];
28118 operands[4] = temp_operands[4];
28119 operands[5] = mem_2;
28120 operands[6] = temp_operands[6];
28122 else
28124 operands[0] = mem_1;
28125 operands[1] = temp_operands[1];
28126 operands[3] = temp_operands[3];
28127 operands[4] = mem_2;
28128 operands[5] = temp_operands[5];
28129 operands[7] = temp_operands[7];
28132 /* Emit adjusting instruction. */
28133 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
28134 /* Emit ldp/stp instructions. */
28135 if (load)
28137 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28138 operands[1], code));
28139 emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
28140 operands[5], code));
28142 else
28144 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28145 operands[3]));
28146 emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
28147 operands[7]));
28149 return true;
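/* Editor's note (worked instance of the limits above, not part of the
   original source): for 4-byte accesses msize == 4, so the accepted
   LDP/STP immediate range is [-0x100, 0xfc]; offsets such as 0x100..0x10c
   are therefore rewritten relative to a scratch base as shown in the
   comment before aarch64_operands_adjust_ok_for_ldpstp.  */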
28152 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28153 it isn't worth branching around empty masked ops (including masked
28154 stores). */
28156 static bool
28157 aarch64_empty_mask_is_expensive (unsigned)
28159 return false;
28162 /* Return true if a pseudo register should be created and used to hold
28163 the GOT address for PIC code. */
28165 bool
28166 aarch64_use_pseudo_pic_reg (void)
28168 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
28171 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28173 static int
28174 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
28176 switch (XINT (x, 1))
28178 case UNSPEC_GOTSMALLPIC:
28179 case UNSPEC_GOTSMALLPIC28K:
28180 case UNSPEC_GOTTINYPIC:
28181 return 0;
28182 default:
28183 break;
28186 return default_unspec_may_trap_p (x, flags);
28190 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28191 return the log2 of that value. Otherwise return -1. */
28194 aarch64_fpconst_pow_of_2 (rtx x)
28196 const REAL_VALUE_TYPE *r;
28198 if (!CONST_DOUBLE_P (x))
28199 return -1;
28201 r = CONST_DOUBLE_REAL_VALUE (x);
28203 if (REAL_VALUE_NEGATIVE (*r)
28204 || REAL_VALUE_ISNAN (*r)
28205 || REAL_VALUE_ISINF (*r)
28206 || !real_isinteger (r, DFmode))
28207 return -1;
28209 return exact_log2 (real_to_integer (r));
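/* Editor's note (worked examples, not part of the original source):
   8.0 -> 3, 1.0 -> 0, 0.5 -> -1 (not an integer value) and -4.0 -> -1
   (negative), matching the checks above.  */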
28212 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28213 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x == 1/2^n
28214 return n. Otherwise return -1. */
28217 aarch64_fpconst_pow2_recip (rtx x)
28219 REAL_VALUE_TYPE r0;
28221 if (!CONST_DOUBLE_P (x))
28222 return -1;
28224 r0 = *CONST_DOUBLE_REAL_VALUE (x);
28225 if (exact_real_inverse (DFmode, &r0)
28226 && !REAL_VALUE_NEGATIVE (r0))
28228 int ret = exact_log2 (real_to_integer (&r0));
28229 if (ret >= 1 && ret <= 32)
28230 return ret;
28232 return -1;
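/* Editor's note (worked examples, not part of the original source):
   0.125 -> 3 (1/0.125 == 2^3), 0.25 -> 2, while 1.0 -> -1 because the
   log2 of its reciprocal is 0, outside the accepted 1..32 range.  */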
28235 /* If X is a vector of equal CONST_DOUBLE values and that value is
28236 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28239 aarch64_vec_fpconst_pow_of_2 (rtx x)
28241 int nelts;
28242 if (!CONST_VECTOR_P (x)
28243 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
28244 return -1;
28246 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
28247 return -1;
28249 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
28250 if (firstval <= 0)
28251 return -1;
28253 for (int i = 1; i < nelts; i++)
28254 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
28255 return -1;
28257 return firstval;
28260 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28261 to float.
28263 __fp16 always promotes through this hook.
28264 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28265 through the generic excess precision logic rather than here. */
28267 static tree
28268 aarch64_promoted_type (const_tree t)
28270 if (SCALAR_FLOAT_TYPE_P (t)
28271 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
28272 return float_type_node;
28274 return NULL_TREE;
28277 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28279 static bool
28280 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
28281 optimization_type opt_type)
28283 switch (op)
28285 case rsqrt_optab:
28286 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
28288 default:
28289 return true;
28293 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28295 static unsigned int
28296 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
28297 int *offset)
28299 /* Polynomial invariant 1 == (VG / 2) - 1. */
28300 gcc_assert (i == 1);
28301 *factor = 2;
28302 *offset = 1;
28303 return AARCH64_DWARF_VG;
28306 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28307 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28309 static bool
28310 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
28312 return ((mode == HFmode || mode == BFmode)
28313 ? true
28314 : default_libgcc_floating_mode_supported_p (mode));
28317 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28318 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28320 static bool
28321 aarch64_scalar_mode_supported_p (scalar_mode mode)
28323 if (DECIMAL_FLOAT_MODE_P (mode))
28324 return default_decimal_float_supported_p ();
28326 return ((mode == HFmode || mode == BFmode)
28327 ? true
28328 : default_scalar_mode_supported_p (mode));
28331 /* Set the value of FLT_EVAL_METHOD.
28332 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28334 0: evaluate all operations and constants, whose semantic type has at
28335 most the range and precision of type float, to the range and
28336 precision of float; evaluate all other operations and constants to
28337 the range and precision of the semantic type;
28339 N, where _FloatN is a supported interchange floating type
28340 evaluate all operations and constants, whose semantic type has at
28341 most the range and precision of _FloatN type, to the range and
28342 precision of the _FloatN type; evaluate all other operations and
28343 constants to the range and precision of the semantic type;
28345 If we have the ARMv8.2-A extensions then we support _Float16 in native
28346 precision, so we should set this to 16. Otherwise, we support the type,
28347 but want to evaluate expressions in float precision, so set this to
28348 0. */
28350 static enum flt_eval_method
28351 aarch64_excess_precision (enum excess_precision_type type)
28353 switch (type)
28355 case EXCESS_PRECISION_TYPE_FAST:
28356 case EXCESS_PRECISION_TYPE_STANDARD:
28357 /* We can calculate either in 16-bit range and precision or
28358 32-bit range and precision. Make that decision based on whether
28359 we have native support for the ARMv8.2-A 16-bit floating-point
28360 instructions or not. */
28361 return (TARGET_FP_F16INST
28362 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28363 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
28364 case EXCESS_PRECISION_TYPE_IMPLICIT:
28365 case EXCESS_PRECISION_TYPE_FLOAT16:
28366 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
28367 default:
28368 gcc_unreachable ();
28370 return FLT_EVAL_METHOD_UNPREDICTABLE;
28373 /* Implement TARGET_C_BITINT_TYPE_INFO.
28374 Return true if _BitInt(N) is supported and fill its details into *INFO. */
28375 bool
28376 aarch64_bitint_type_info (int n, struct bitint_info *info)
28378 if (TARGET_BIG_END)
28379 return false;
28381 if (n <= 8)
28382 info->limb_mode = QImode;
28383 else if (n <= 16)
28384 info->limb_mode = HImode;
28385 else if (n <= 32)
28386 info->limb_mode = SImode;
28387 else if (n <= 64)
28388 info->limb_mode = DImode;
28389 else if (n <= 128)
28390 info->limb_mode = TImode;
28391 else
28392 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28393 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28394 able to use libgcc's implementation to support large _BitInts, we need
28395 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28396 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28397 be TImode to ensure we are ABI compliant. */
28398 info->limb_mode = DImode;
28400 if (n > 128)
28401 info->abi_limb_mode = TImode;
28402 else
28403 info->abi_limb_mode = info->limb_mode;
28404 info->big_endian = TARGET_BIG_END;
28405 info->extended = false;
28406 return true;
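/* Editor's note (worked example, not part of the original source): for
   _BitInt(40) both limb_mode and abi_limb_mode are DImode; for
   _BitInt(200) limb_mode is DImode while abi_limb_mode is TImode, so the
   ABI sees an array of __int128 even though libgcc works on 64-bit
   limbs.  */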
28409 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28410 scheduled for speculative execution. Reject the long-running division
28411 and square-root instructions. */
28413 static bool
28414 aarch64_sched_can_speculate_insn (rtx_insn *insn)
28416 switch (get_attr_type (insn))
28418 case TYPE_SDIV:
28419 case TYPE_UDIV:
28420 case TYPE_FDIVS:
28421 case TYPE_FDIVD:
28422 case TYPE_FSQRTS:
28423 case TYPE_FSQRTD:
28424 case TYPE_NEON_FP_SQRT_S:
28425 case TYPE_NEON_FP_SQRT_D:
28426 case TYPE_NEON_FP_SQRT_S_Q:
28427 case TYPE_NEON_FP_SQRT_D_Q:
28428 case TYPE_NEON_FP_DIV_S:
28429 case TYPE_NEON_FP_DIV_D:
28430 case TYPE_NEON_FP_DIV_S_Q:
28431 case TYPE_NEON_FP_DIV_D_Q:
28432 return false;
28433 default:
28434 return true;
28438 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28440 static int
28441 aarch64_compute_pressure_classes (reg_class *classes)
28443 int i = 0;
28444 classes[i++] = GENERAL_REGS;
28445 classes[i++] = FP_REGS;
28446 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28447 registers need to go in PR_LO_REGS at some point during their
28448 lifetime. Splitting it into two halves has the effect of making
28449 all predicates count against PR_LO_REGS, so that we try whenever
28450 possible to restrict the number of live predicates to 8. This
28451 greatly reduces the amount of spilling in certain loops. */
28452 classes[i++] = PR_LO_REGS;
28453 classes[i++] = PR_HI_REGS;
28454 return i;
28457 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28459 static bool
28460 aarch64_can_change_mode_class (machine_mode from,
28461 machine_mode to, reg_class_t)
28463 return aarch64_modes_compatible_p (from, to);
28466 /* Implement TARGET_EARLY_REMAT_MODES. */
28468 static void
28469 aarch64_select_early_remat_modes (sbitmap modes)
28471 /* SVE values are not normally live across a call, so it should be
28472 worth doing early rematerialization even in VL-specific mode. */
28473 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
28474 if (aarch64_sve_mode_p ((machine_mode) i))
28475 bitmap_set_bit (modes, i);
28478 /* Override the default target speculation_safe_value. */
28479 static rtx
28480 aarch64_speculation_safe_value (machine_mode mode,
28481 rtx result, rtx val, rtx failval)
28483 /* Maybe we should warn if falling back to hard barriers. They are
28484 likely to be noticeably more expensive than the alternative below. */
28485 if (!aarch64_track_speculation)
28486 return default_speculation_safe_value (mode, result, val, failval);
28488 if (!REG_P (val))
28489 val = copy_to_mode_reg (mode, val);
28491 if (!aarch64_reg_or_zero (failval, mode))
28492 failval = copy_to_mode_reg (mode, failval);
28494 emit_insn (gen_despeculate_copy (mode, result, val, failval));
28495 return result;
28498 /* Implement TARGET_ESTIMATED_POLY_VALUE.
28499 Look into the tuning structure for an estimate.
28500 KIND specifies the type of requested estimate: min, max or likely.
28501 For cores with a known SVE width all three estimates are the same.
28502 For generic SVE tuning we want to distinguish the maximum estimate from
28503 the minimum and likely ones.
28504 The likely estimate is the same as the minimum in that case to give a
28505 conservative behavior of auto-vectorizing with SVE when it is a win
28506 even for 128-bit SVE.
28507 When SVE width information is available VAL.coeffs[1] is multiplied by
28508 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
28510 static HOST_WIDE_INT
28511 aarch64_estimated_poly_value (poly_int64 val,
28512 poly_value_estimate_kind kind
28513 = POLY_VALUE_LIKELY)
28515 unsigned int width_source = aarch64_tune_params.sve_width;
28517 /* If there is no core-specific information then the minimum and likely
28518 values are based on 128-bit vectors and the maximum is based on
28519 the architectural maximum of 2048 bits. */
28520 if (width_source == SVE_SCALABLE)
28521 switch (kind)
28523 case POLY_VALUE_MIN:
28524 case POLY_VALUE_LIKELY:
28525 return val.coeffs[0];
28526 case POLY_VALUE_MAX:
28527 return val.coeffs[0] + val.coeffs[1] * 15;
28530 /* Allow sve_width to be a bitmask of different VLs, treating the lowest
28531 as likely. This could be made more general if future -mtune options
28532 need it to be. */
28533 if (kind == POLY_VALUE_MAX)
28534 width_source = 1 << floor_log2 (width_source);
28535 else
28536 width_source = least_bit_hwi (width_source);
28538 /* If the core provides width information, use that. */
28539 HOST_WIDE_INT over_128 = width_source - 128;
28540 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
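/* Editor's note (worked example, not part of the original source): for a
   poly_int64 of 4 + 4x (e.g. the number of 32-bit elements in an SVE
   vector), generic SVE tuning gives min/likely == 4 and max == 4 + 4 * 15
   == 64, whereas a core with sve_width == 256 gives
   4 + 4 * (256 - 128) / 128 == 8.  */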
28544 /* Return true for types that could be supported as SIMD return or
28545 argument types. */
28547 static bool
28548 supported_simd_type (tree t)
28550 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
28552 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
28553 return s == 1 || s == 2 || s == 4 || s == 8;
28555 return false;
28558 /* Determine the lane size for the clone argument/return type. This follows
28559 the LS(P) rule in the VFABIA64. */
28561 static unsigned
28562 lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
28564 gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
28566 /* For non map-to-vector types that are pointers we use the type they
28567 point to. */
28568 if (POINTER_TYPE_P (type))
28569 switch (clone_arg_type)
28571 default:
28572 break;
28573 case SIMD_CLONE_ARG_TYPE_UNIFORM:
28574 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
28575 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
28576 type = TREE_TYPE (type);
28577 break;
28580 /* For types that are integers or floating point (or, for pointers that are
28581 not map-to-vector arguments, the integer or floating-point type they point
28582 to), we use their size if it is 1, 2, 4 or 8 bytes. */
28583 if (INTEGRAL_TYPE_P (type)
28584 || SCALAR_FLOAT_TYPE_P (type))
28585 switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
28587 default:
28588 break;
28589 case 1:
28590 case 2:
28591 case 4:
28592 case 8:
28593 return TYPE_PRECISION (type);
28595 /* For any other type we use the size of uintptr_t. For map-to-vector types
28596 that are pointers, using the size of uintptr_t is the same as using the
28597 size of their type, since all pointers are the same size as uintptr_t. */
28598 return POINTER_SIZE;
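/* Illustrative applications of the rule above, assuming an LP64 target:
   - a 'short' argument or return value has a lane size of 16 bits;
   - a uniform 'int *' argument uses the pointee type and so has 32 bits;
   - a vector (map-to-vector) 'int *' argument uses the pointer itself
     and so has 64 bits;
   - anything whose size is not 1, 2, 4 or 8 bytes falls back to
     POINTER_SIZE (64 bits).  */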
28602 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28604 static int
28605 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
28606 struct cgraph_simd_clone *clonei,
28607 tree base_type ATTRIBUTE_UNUSED,
28608 int num, bool explicit_p)
28610 tree t, ret_type;
28611 unsigned int nds_elt_bits;
28612 unsigned HOST_WIDE_INT const_simdlen;
28614 if (!TARGET_SIMD)
28615 return 0;
28617 /* For now, SVE simdclones won't produce an illegal simdlen, so we only
28618 check constant simdlens here. */
28619 if (maybe_ne (clonei->simdlen, 0U)
28620 && clonei->simdlen.is_constant (&const_simdlen)
28621 && (const_simdlen < 2
28622 || const_simdlen > 1024
28623 || (const_simdlen & (const_simdlen - 1)) != 0))
28625 if (explicit_p)
28626 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28627 "unsupported simdlen %wd", const_simdlen);
28628 return 0;
28631 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
28632 /* According to AArch64's Vector ABI the type that determines the simdlen is
28633 the narrowest of types, so we ignore base_type for AArch64. */
28634 if (TREE_CODE (ret_type) != VOID_TYPE
28635 && !supported_simd_type (ret_type))
28637 if (!explicit_p)
28639 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
28640 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28641 "GCC does not currently support return type %qT "
28642 "for simd", ret_type);
28643 else
28644 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28645 "unsupported return type %qT for simd",
28646 ret_type);
28647 return 0;
28650 auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
28652 /* We are looking for the NDS type here according to the VFABIA64. */
28653 if (TREE_CODE (ret_type) != VOID_TYPE)
28655 nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
28656 vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
28658 else
28659 nds_elt_bits = POINTER_SIZE;
28661 int i;
28662 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
28663 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
28664 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
28665 t && t != void_list_node; t = TREE_CHAIN (t), i++)
28667 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
28668 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
28669 && !supported_simd_type (arg_type))
28671 if (!explicit_p)
28673 else if (COMPLEX_FLOAT_TYPE_P (arg_type))
28674 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28675 "GCC does not currently support argument type %qT "
28676 "for simd", arg_type);
28677 else
28678 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28679 "unsupported argument type %qT for simd",
28680 arg_type);
28681 return 0;
28683 unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
28684 if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
28685 vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
28686 if (nds_elt_bits > lane_bits)
28687 nds_elt_bits = lane_bits;
28690 clonei->vecsize_mangle = 'n';
28691 clonei->mask_mode = VOIDmode;
28692 poly_uint64 simdlen;
28693 auto_vec<poly_uint64> simdlens (2);
28694 /* Keep track of the possible simdlens the clones of this function can have,
28695 and check them later to see if we support them. */
28696 if (known_eq (clonei->simdlen, 0U))
28698 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28699 if (maybe_ne (simdlen, 1U))
28700 simdlens.safe_push (simdlen);
28701 simdlens.safe_push (simdlen * 2);
28703 else
28704 simdlens.safe_push (clonei->simdlen);
28706 clonei->vecsize_int = 0;
28707 clonei->vecsize_float = 0;
28709 /* We currently do not support generating simdclones where vector arguments
28710 do not fit into a single vector register, i.e. vector types that are more
28711 than 128 bits wide. This is because of how we currently represent such
28712 types in ACLE, where we use a struct to allow us to pass them as arguments
28713 and return values.
28714 Hence we have to check whether any of the simdlens available for this
28715 simdclone would cause a vector type to be wider than 128 bits, and reject
28716 such a clone. */
28717 unsigned j = 0;
28718 while (j < simdlens.length ())
28720 bool remove_simdlen = false;
28721 for (auto elt : vec_elts)
28722 if (known_gt (simdlens[j] * elt.second, 128U))
28724 /* Don't issue a warning for every simdclone when there is no
28725 specific simdlen clause. */
28726 if (explicit_p && maybe_ne (clonei->simdlen, 0U))
28727 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28728 "GCC does not currently support simdlen %wd for "
28729 "type %qT",
28730 constant_lower_bound (simdlens[j]), elt.first);
28731 remove_simdlen = true;
28732 break;
28734 if (remove_simdlen)
28735 simdlens.ordered_remove (j);
28736 else
28737 j++;
28741 int count = simdlens.length ();
28742 if (count == 0)
28744 if (explicit_p && known_eq (clonei->simdlen, 0U))
28746 /* Warn the user if we can't generate any simdclone. */
28747 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28748 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28749 "GCC does not currently support a simdclone with simdlens"
28750 " %wd and %wd for these types.",
28751 constant_lower_bound (simdlen),
28752 constant_lower_bound (simdlen*2));
28754 return 0;
28757 gcc_assert (num < count);
28758 clonei->simdlen = simdlens[num];
28759 return count;
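/* Hypothetical example (not from the original source): for

     #pragma omp declare simd notinbranch
     float f (float x);

   the NDS is 32 bits, so with no explicit simdlen the candidate simdlens are
   64 / 32 = 2 and 4.  Both keep every vector argument within 128 bits, so
   two Advanced SIMD clones are advertised (roughly _ZGVnN2v_f and
   _ZGVnN4v_f in the Vector Function ABI naming).  */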
28762 /* Implement TARGET_SIMD_CLONE_ADJUST. */
28764 static void
28765 aarch64_simd_clone_adjust (struct cgraph_node *node)
28767 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
28768 use the correct ABI. */
28770 tree t = TREE_TYPE (node->decl);
28771 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
28772 TYPE_ATTRIBUTES (t));
28775 /* Implement TARGET_SIMD_CLONE_USABLE. */
28777 static int
28778 aarch64_simd_clone_usable (struct cgraph_node *node)
28780 switch (node->simdclone->vecsize_mangle)
28782 case 'n':
28783 if (!TARGET_SIMD)
28784 return -1;
28785 return 0;
28786 default:
28787 gcc_unreachable ();
28791 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
28793 static int
28794 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
28796 auto check_attr = [&](const char *ns, const char *name) {
28797 tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
28798 tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
28799 if (!attr1 && !attr2)
28800 return true;
28802 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
28805 if (!check_attr ("gnu", "aarch64_vector_pcs"))
28806 return 0;
28807 if (!check_attr ("gnu", "Advanced SIMD type"))
28808 return 0;
28809 if (!check_attr ("gnu", "SVE type"))
28810 return 0;
28811 if (!check_attr ("gnu", "SVE sizeless type"))
28812 return 0;
28813 if (!check_attr ("arm", "streaming"))
28814 return 0;
28815 if (!check_attr ("arm", "streaming_compatible"))
28816 return 0;
28817 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
28818 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
28819 return 0;
28820 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
28821 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
28822 return 0;
28823 return 1;
28826 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
28828 static tree
28829 aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
28831 tree old_attrs = DECL_ATTRIBUTES (olddecl);
28832 tree old_new = lookup_attribute ("arm", "new", old_attrs);
28834 tree new_attrs = DECL_ATTRIBUTES (newdecl);
28835 tree new_new = lookup_attribute ("arm", "new", new_attrs);
28837 if (DECL_INITIAL (olddecl) && new_new)
28839 error ("cannot apply attribute %qs to %q+D after the function"
28840 " has been defined", "new", newdecl);
28841 inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
28842 newdecl);
28844 else
28846 if (old_new && new_new)
28848 old_attrs = remove_attribute ("arm", "new", old_attrs);
28849 TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
28850 TREE_VALUE (old_new));
28852 if (new_new)
28853 aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
28856 return merge_attributes (old_attrs, new_attrs);
28859 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
28861 static const char *
28862 aarch64_get_multilib_abi_name (void)
28864 if (TARGET_BIG_END)
28865 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
28866 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
28869 /* Implement TARGET_STACK_PROTECT_GUARD. If the guard is a
28870 global variable, use the default implementation; otherwise
28871 return a null tree. */
28872 static tree
28873 aarch64_stack_protect_guard (void)
28875 if (aarch64_stack_protector_guard == SSP_GLOBAL)
28876 return default_stack_protect_guard ();
28878 return NULL_TREE;
28881 /* Return the diagnostic message string if the binary operation OP is
28882 not permitted on TYPE1 and TYPE2, NULL otherwise. */
28884 static const char *
28885 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
28886 const_tree type2)
28888 if (VECTOR_TYPE_P (type1)
28889 && VECTOR_TYPE_P (type2)
28890 && !TYPE_INDIVISIBLE_P (type1)
28891 && !TYPE_INDIVISIBLE_P (type2)
28892 && (aarch64_sve::builtin_type_p (type1)
28893 != aarch64_sve::builtin_type_p (type2)))
28894 return N_("cannot combine GNU and SVE vectors in a binary operation");
28896 /* Operation allowed. */
28897 return NULL;
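/* Illustrative example (assumed, not from the original source): with
   -msve-vector-bits=256,

     typedef int gnu_v8si __attribute__ ((vector_size (32)));
     gnu_v8si a; svint32_t b;
     ... a + b ...

   is rejected with the message above, since exactly one operand is an SVE
   ACLE builtin type.  */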
28900 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
28901 compiler that we automatically ignore the top byte of our pointers, which
28902 allows using -fsanitize=hwaddress. */
28903 bool
28904 aarch64_can_tag_addresses ()
28906 return !TARGET_ILP32;
28909 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
28910 section at the end if needed. */
28911 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
28912 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
28913 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
28914 void
28915 aarch64_file_end_indicate_exec_stack ()
28917 file_end_indicate_exec_stack ();
28919 unsigned feature_1_and = 0;
28920 if (aarch_bti_enabled ())
28921 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
28923 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
28924 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
28926 if (feature_1_and)
28928 /* Generate .note.gnu.property section. */
28929 switch_to_section (get_section (".note.gnu.property",
28930 SECTION_NOTYPE, NULL));
28932 /* PT_NOTE header: namesz, descsz, type.
28933 namesz = 4 ("GNU\0")
28934 descsz = 16 (Size of the program property array)
28935 [(12 + padding) * Number of array elements]
28936 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
28937 assemble_align (POINTER_SIZE);
28938 assemble_integer (GEN_INT (4), 4, 32, 1);
28939 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
28940 assemble_integer (GEN_INT (5), 4, 32, 1);
28942 /* PT_NOTE name. */
28943 assemble_string ("GNU", 4);
28945 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
28946 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
28947 datasz = 4
28948 data = feature_1_and. */
28949 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
28950 assemble_integer (GEN_INT (4), 4, 32, 1);
28951 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
28953 /* Pad the size of the note to the required alignment. */
28954 assemble_align (POINTER_SIZE);
28957 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
28958 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
28959 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
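/* For reference (illustrative; exact directives depend on the assembler):
   with -mbranch-protection=standard the code above emits roughly

	.section	.note.gnu.property,"a"
	.align	3
	.word	4		// namesz
	.word	16		// descsz
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.align	3
*/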
28961 /* Helper function for straight line speculation.
28962 Return what barrier should be emitted for straight line speculation
28963 mitigation.
28964 When not mitigating against straight line speculation this function returns
28965 an empty string.
28966 When mitigating against straight line speculation, use:
28967 * SB when the v8.5-A SB extension is enabled.
28968 * DSB+ISB otherwise. */
28969 const char *
28970 aarch64_sls_barrier (int mitigation_required)
28972 return mitigation_required
28973 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
28974 : "";
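/* Illustrative effect of -mharden-sls=blr (assumed example): an indirect
   call such as

	blr	x1

   is instead emitted as

	bl	__call_indirect_x1

   where the stub (emitted further below) contains

   __call_indirect_x1:
	mov	x16, x1
	br	x16
	sb			// or "dsb sy; isb" without the SB extension

   so that the speculation barrier stays off the architecturally executed
   path.  */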
28977 static GTY (()) tree aarch64_sls_shared_thunks[30];
28978 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
28979 const char *indirect_symbol_names[30] = {
28980 "__call_indirect_x0",
28981 "__call_indirect_x1",
28982 "__call_indirect_x2",
28983 "__call_indirect_x3",
28984 "__call_indirect_x4",
28985 "__call_indirect_x5",
28986 "__call_indirect_x6",
28987 "__call_indirect_x7",
28988 "__call_indirect_x8",
28989 "__call_indirect_x9",
28990 "__call_indirect_x10",
28991 "__call_indirect_x11",
28992 "__call_indirect_x12",
28993 "__call_indirect_x13",
28994 "__call_indirect_x14",
28995 "__call_indirect_x15",
28996 "", /* "__call_indirect_x16", */
28997 "", /* "__call_indirect_x17", */
28998 "__call_indirect_x18",
28999 "__call_indirect_x19",
29000 "__call_indirect_x20",
29001 "__call_indirect_x21",
29002 "__call_indirect_x22",
29003 "__call_indirect_x23",
29004 "__call_indirect_x24",
29005 "__call_indirect_x25",
29006 "__call_indirect_x26",
29007 "__call_indirect_x27",
29008 "__call_indirect_x28",
29009 "__call_indirect_x29",
29012 /* Function to create a BLR thunk. This thunk is used to mitigate straight
29013 line speculation. Instead of a simple BLR that can be speculated past,
29014 we emit a BL to this thunk, and this thunk contains a BR to the relevant
29015 register. These thunks have the relevant speculation barriers put after
29016 their indirect branch so that speculation is blocked.
29018 We use such a thunk so the speculation barriers are kept off the
29019 architecturally executed path in order to reduce the performance overhead.
29021 When optimizing for size we use stubs shared by the linked object.
29022 When optimizing for performance we emit stubs for each function in the hope
29023 that the branch predictor can better train on jumps specific for a given
29024 function. */
29025 rtx
29026 aarch64_sls_create_blr_label (int regnum)
29028 gcc_assert (STUB_REGNUM_P (regnum));
29029 if (optimize_function_for_size_p (cfun))
29031 /* For the thunks shared between different functions in this compilation
29032 unit we use a named symbol -- this is just so that users can more easily
29033 understand the generated assembly. */
29034 aarch64_sls_shared_thunks_needed = true;
29035 const char *thunk_name = indirect_symbol_names[regnum];
29036 if (aarch64_sls_shared_thunks[regnum] == NULL)
29038 /* Build a decl representing this function stub and record it for
29039 later. We build a decl here so we can use the GCC machinery for
29040 handling sections automatically (through `get_named_section` and
29041 `make_decl_one_only`). That saves us a lot of trouble handling
29042 the specifics of different output file formats. */
29043 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
29044 get_identifier (thunk_name),
29045 build_function_type_list (void_type_node,
29046 NULL_TREE));
29047 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
29048 NULL_TREE, void_type_node);
29049 TREE_PUBLIC (decl) = 1;
29050 TREE_STATIC (decl) = 1;
29051 DECL_IGNORED_P (decl) = 1;
29052 DECL_ARTIFICIAL (decl) = 1;
29053 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29054 resolve_unique_section (decl, 0, false);
29055 aarch64_sls_shared_thunks[regnum] = decl;
29058 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
29061 if (cfun->machine->call_via[regnum] == NULL)
29062 cfun->machine->call_via[regnum]
29063 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
29064 return cfun->machine->call_via[regnum];
29067 /* Helper function for aarch64_sls_emit_blr_function_thunks and
29068 aarch64_sls_emit_shared_blr_thunks below. */
29069 static void
29070 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
29072 /* Save in x16 and branch to that function so this transformation does
29073 not prevent jumping to `BTI c` instructions. */
29074 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
29075 asm_fprintf (out_file, "\tbr\tx16\n");
29078 /* Emit all BLR stubs for this particular function.
29079 Here we emit all the BLR stubs needed for the current function. Since we
29080 emit these stubs in a consecutive block we know there will be no speculation
29081 gadgets between each stub, and hence we only emit a speculation barrier at
29082 the end of the stub sequences.
29084 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29085 void
29086 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
29088 if (! aarch64_harden_sls_blr_p ())
29089 return;
29091 bool any_functions_emitted = false;
29092 /* We must save and restore the current function section since this assembly
29093 is emitted at the end of the function. This means it can be emitted *just
29094 after* the cold section of a function. That cold part would be emitted in
29095 a different section. That switch would trigger a `.cfi_endproc` directive
29096 to be emitted in the original section and a `.cfi_startproc` directive to
29097 be emitted in the new section. Switching to the original section without
29098 restoring would mean that the `.cfi_endproc` emitted as a function ends
29099 would happen in a different section -- leaving an unmatched
29100 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29101 in the standard text section. */
29102 section *save_text_section = in_section;
29103 switch_to_section (function_section (current_function_decl));
29104 for (int regnum = 0; regnum < 30; ++regnum)
29106 rtx specu_label = cfun->machine->call_via[regnum];
29107 if (specu_label == NULL)
29108 continue;
29110 targetm.asm_out.print_operand (out_file, specu_label, 0);
29111 asm_fprintf (out_file, ":\n");
29112 aarch64_sls_emit_function_stub (out_file, regnum);
29113 any_functions_emitted = true;
29115 if (any_functions_emitted)
29116 /* We can use the SB barrier here if need be, since this stub will only be
29117 used by the current function, and hence for the current target. */
29118 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
29119 switch_to_section (save_text_section);
29122 /* Emit shared BLR stubs for the current compilation unit.
29123 Over the course of compiling this unit we may have converted some BLR
29124 instructions to a BL to a shared stub function. This is where we emit those
29125 stub functions.
29126 This function is for the stubs shared between different functions in this
29127 compilation unit. We share when optimizing for size instead of speed.
29129 This function is called through the TARGET_ASM_FILE_END hook. */
29130 void
29131 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
29133 if (! aarch64_sls_shared_thunks_needed)
29134 return;
29136 for (int regnum = 0; regnum < 30; ++regnum)
29138 tree decl = aarch64_sls_shared_thunks[regnum];
29139 if (!decl)
29140 continue;
29142 const char *name = indirect_symbol_names[regnum];
29143 switch_to_section (get_named_section (decl, NULL, 0));
29144 ASM_OUTPUT_ALIGN (out_file, 2);
29145 targetm.asm_out.globalize_label (out_file, name);
29146 /* Only emits if the compiler is configured for an assembler that can
29147 handle visibility directives. */
29148 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
29149 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
29150 ASM_OUTPUT_LABEL (out_file, name);
29151 aarch64_sls_emit_function_stub (out_file, regnum);
29152 /* Use the most conservative target to ensure it can always be used by any
29153 function in the translation unit. */
29154 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
29155 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
29159 /* Implement TARGET_ASM_FILE_END. */
29160 void
29161 aarch64_asm_file_end ()
29163 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
29164 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29165 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29166 for FreeBSD) still gets called. */
29167 #ifdef TARGET_ASM_FILE_END
29168 TARGET_ASM_FILE_END ();
29169 #endif
29172 const char *
29173 aarch64_indirect_call_asm (rtx addr)
29175 gcc_assert (REG_P (addr));
29176 if (aarch64_harden_sls_blr_p ())
29178 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
29179 output_asm_insn ("bl\t%0", &stub_label);
29181 else
29182 output_asm_insn ("blr\t%0", &addr);
29183 return "";
29186 /* Emit the assembly instruction to load the thread pointer into DEST.
29187 Select between different tpidr_elN registers depending on -mtp= setting. */
29189 const char *
29190 aarch64_output_load_tp (rtx dest)
29192 const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29193 "tpidr_el3", "tpidrro_el0"};
29194 char buffer[64];
29195 snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
29196 tpidrs[aarch64_tpidr_register]);
29197 output_asm_insn (buffer, &dest);
29198 return "";
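/* Illustrative examples (assuming the -mtp= option values map directly onto
   the array above): -mtp=el0, the default, emits "mrs %0, tpidr_el0";
   -mtp=el2 emits "mrs %0, tpidr_el2"; -mtp=tpidrro_el0 emits
   "mrs %0, tpidrro_el0".  */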
29201 /* Set up the value of REG_ALLOC_ORDER from scratch.
29203 It was previously good practice to put call-clobbered registers ahead
29204 of call-preserved registers, but that isn't necessary these days.
29205 IRA's model of register save/restore costs is much more sophisticated
29206 than the model that a simple ordering could provide. We leave
29207 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29208 of IRA's model.
29210 However, it is still useful to list registers that are members of
29211 multiple classes after registers that are members of fewer classes.
29212 For example, we have:
29214 - FP_LO8_REGS: v0-v7
29215 - FP_LO_REGS: v0-v15
29216 - FP_REGS: v0-v31
29218 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29219 we run the risk of starving other (lower-priority) pseudos that
29220 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29221 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29222 Allocating downwards rather than upwards avoids this problem, at least
29223 in code that has reasonable register pressure.
29225 The situation for predicate registers is similar. */
29227 void
29228 aarch64_adjust_reg_alloc_order ()
29230 for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
29231 if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
29232 reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
29233 else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
29234 reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
29235 else
29236 reg_alloc_order[i] = i;
29239 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29240 of vector mode MODE to select half the elements of that vector.
29241 Allow any combination of indices except duplicates (or out of range of
29242 the mode units). */
29244 bool
29245 aarch64_parallel_select_half_p (machine_mode mode, rtx par)
29247 int nunits = XVECLEN (par, 0);
29248 if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
29249 return false;
29250 int mode_nunits = nunits * 2;
29251 /* Put all the elements of PAR into a hash_set and use its
29252 uniqueness guarantees to check that we don't try to insert the same
29253 element twice. */
29254 hash_set<rtx> parset;
29255 for (int i = 0; i < nunits; ++i)
29257 rtx elt = XVECEXP (par, 0, i);
29258 if (!CONST_INT_P (elt)
29259 || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
29260 || parset.add (elt))
29261 return false;
29263 return true;
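/* Examples for V4SImode (illustrative): (parallel [(const_int 0)
   (const_int 2)]) and (parallel [(const_int 3) (const_int 1)]) are both
   accepted; (parallel [(const_int 1) (const_int 1)]) is rejected because of
   the duplicate, and any PARALLEL whose length is not exactly half the
   number of mode units is rejected up front.  */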
29266 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29267 contain any common elements. */
29269 bool
29270 aarch64_pars_overlap_p (rtx par1, rtx par2)
29272 int len1 = XVECLEN (par1, 0);
29273 int len2 = XVECLEN (par2, 0);
29274 hash_set<rtx> parset;
29275 for (int i = 0; i < len1; ++i)
29276 parset.add (XVECEXP (par1, 0, i));
29277 for (int i = 0; i < len2; ++i)
29278 if (parset.contains (XVECEXP (par2, 0, i)))
29279 return true;
29280 return false;
29283 /* Implement OPTIMIZE_MODE_SWITCHING. */
29285 bool
29286 aarch64_optimize_mode_switching (aarch64_mode_entity entity)
29288 bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
29289 || (aarch64_cfun_has_new_state ("za")
29290 && df_regs_ever_live_p (ZA_REGNUM))
29291 || (aarch64_cfun_has_new_state ("zt0")
29292 && df_regs_ever_live_p (ZT0_REGNUM)));
29294 if (have_sme_state && nonlocal_goto_handler_labels)
29296 static bool reported;
29297 if (!reported)
29299 sorry ("non-local gotos in functions with SME state");
29300 reported = true;
29304 switch (entity)
29306 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29307 case aarch64_mode_entity::LOCAL_SME_STATE:
29308 return have_sme_state && !nonlocal_goto_handler_labels;
29310 gcc_unreachable ();
29313 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29315 static void
29316 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
29317 aarch64_tristate_mode prev_mode)
29319 if (mode == aarch64_tristate_mode::YES)
29321 gcc_assert (prev_mode == aarch64_tristate_mode::NO);
29322 aarch64_init_tpidr2_block ();
29324 else
29325 gcc_unreachable ();
29328 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29330 static void
29331 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
29332 aarch64_local_sme_state prev_mode)
29334 /* Back-propagation should ensure that we're always starting from
29335 a known mode. */
29336 gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
29338 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29340 /* Commit any uncommitted lazy save. This leaves ZA either active
29341 and zero (lazy save case) or off (normal case).
29343 The sequence is:
29345 mrs <temp>, tpidr2_el0
29346 cbz <temp>, no_save
29347 bl __arm_tpidr2_save
29348 msr tpidr2_el0, xzr
29349 zero { za } // Only if ZA is live
29350 zero { zt0 } // Only if ZT0 is live
29351 no_save: */
29352 auto tmp_reg = gen_reg_rtx (DImode);
29353 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
29354 auto label = gen_label_rtx ();
29355 rtx branch = aarch64_gen_compare_zero_and_branch (EQ, tmp_reg, label);
29356 auto jump = emit_jump_insn (branch);
29357 JUMP_LABEL (jump) = label;
29358 emit_insn (gen_aarch64_tpidr2_save ());
29359 emit_insn (gen_aarch64_clear_tpidr2 ());
29360 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29361 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29363 if (aarch64_cfun_has_state ("za"))
29364 emit_insn (gen_aarch64_initial_zero_za ());
29365 if (aarch64_cfun_has_state ("zt0"))
29366 emit_insn (gen_aarch64_sme_zero_zt0 ());
29368 emit_label (label);
29371 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29372 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29374 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29376 /* Make ZA active after being inactive.
29378 First handle the case in which the lazy save we set up was
29379 committed by a callee. If the function's source-level ZA state
29380 is live then we must conditionally restore it from the lazy
29381 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29382 if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
29383 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29384 else
29385 emit_insn (gen_aarch64_smstart_za ());
29387 /* Now handle the case in which the lazy save was not committed.
29388 In that case, ZA still contains the current function's ZA state,
29389 and we just need to cancel the lazy save. */
29390 emit_insn (gen_aarch64_clear_tpidr2 ());
29392 /* Restore the ZT0 state, if we have some. */
29393 if (aarch64_cfun_has_state ("zt0"))
29394 aarch64_restore_zt0 (true);
29396 return;
29399 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
29401 /* Retrieve the current function's ZA state from the lazy save
29402 buffer. */
29403 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29405 /* Restore the ZT0 state, if we have some. */
29406 if (aarch64_cfun_has_state ("zt0"))
29407 aarch64_restore_zt0 (true);
29408 return;
29411 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
29412 || prev_mode == aarch64_local_sme_state::OFF)
29414 /* INACTIVE_CALLER means that we are enabling ZA for the first
29415 time in this function. The code above means that ZA is either
29416 active and zero (if we committed a lazy save) or off. Handle
29417 the latter case by forcing ZA on.
29419 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29420 to force it to 1.
29422 Both cases leave ZA zeroed. */
29423 emit_insn (gen_aarch64_smstart_za ());
29425 /* Restore the ZT0 state, if we have some. */
29426 if (prev_mode == aarch64_local_sme_state::OFF
29427 && aarch64_cfun_has_state ("zt0"))
29428 aarch64_restore_zt0 (true);
29429 return;
29432 if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29433 || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
29434 /* A simple change in liveness, such as in a CFG structure where
29435 ZA is only conditionally defined. No code is needed. */
29436 return;
29438 gcc_unreachable ();
29441 if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29443 if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29444 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29445 || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29447 /* Save the ZT0 state, if we have some. */
29448 if (aarch64_cfun_has_state ("zt0"))
29449 aarch64_save_zt0 ();
29451 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29452 case of setting up a lazy save buffer before a call.
29453 A transition from INACTIVE_CALLER is similar, except that
29454 the contents of ZA are known to be zero.
29456 A transition from ACTIVE_DEAD means that ZA is live at the
29457 point of the transition, but is dead on at least one incoming
29458 edge. (That is, ZA is only conditionally initialized.)
29459 For efficiency, we want to set up a lazy save even for
29460 dead contents, since forcing ZA off would make later code
29461 restore ZA from the lazy save buffer. */
29462 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29463 return;
29466 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
29467 || prev_mode == aarch64_local_sme_state::OFF)
29468 /* We're simply discarding the information about which inactive
29469 state applies. */
29470 return;
29472 gcc_unreachable ();
29475 if (mode == aarch64_local_sme_state::INACTIVE_CALLER
29476 || mode == aarch64_local_sme_state::OFF)
29478 /* Save the ZT0 state, if we have some. */
29479 if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29480 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
29481 && mode == aarch64_local_sme_state::OFF
29482 && aarch64_cfun_has_state ("zt0"))
29483 aarch64_save_zt0 ();
29485 /* The transition to INACTIVE_CALLER is used before returning from
29486 new("za") functions. Any state in ZA belongs to the current
29487 function rather than a caller, but that state is no longer
29488 needed. Clear any pending lazy save and turn ZA off.
29490 The transition to OFF is used before calling a private-ZA function.
29491 We committed any incoming lazy save above, so at this point any
29492 contents in ZA belong to the current function. */
29493 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29494 emit_insn (gen_aarch64_clear_tpidr2 ());
29496 if (prev_mode != aarch64_local_sme_state::OFF
29497 && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
29498 emit_insn (gen_aarch64_smstop_za ());
29500 return;
29503 if (mode == aarch64_local_sme_state::SAVED_LOCAL)
29505 /* This is a transition to an exception handler. */
29506 gcc_assert (prev_mode == aarch64_local_sme_state::OFF
29507 || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
29508 return;
29511 gcc_unreachable ();
29514 /* Implement TARGET_MODE_EMIT. */
29516 static void
29517 aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
29519 if (mode == prev_mode)
29520 return;
29522 start_sequence ();
29523 switch (aarch64_mode_entity (entity))
29525 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29526 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
29527 aarch64_tristate_mode (prev_mode));
29528 break;
29530 case aarch64_mode_entity::LOCAL_SME_STATE:
29531 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
29532 aarch64_local_sme_state (prev_mode));
29533 break;
29535 rtx_insn *seq = get_insns ();
29536 end_sequence ();
29538 /* Get the set of clobbered registers that are currently live. */
29539 HARD_REG_SET clobbers = {};
29540 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
29542 if (!NONDEBUG_INSN_P (insn))
29543 continue;
29544 vec_rtx_properties properties;
29545 properties.add_insn (insn, false);
29546 for (rtx_obj_reference ref : properties.refs ())
29547 if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
29548 SET_HARD_REG_BIT (clobbers, ref.regno);
29550 clobbers &= live;
29552 /* Emit instructions to save clobbered registers to pseudos. Queue
29553 instructions to restore the registers afterwards.
29555 This should only be needed in rare situations. */
29556 auto_vec<rtx, 33> after;
29557 for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
29558 if (TEST_HARD_REG_BIT (clobbers, regno))
29560 rtx hard_reg = gen_rtx_REG (DImode, regno);
29561 rtx pseudo_reg = gen_reg_rtx (DImode);
29562 emit_move_insn (pseudo_reg, hard_reg);
29563 after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
29565 if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
29567 rtx pseudo_reg = gen_reg_rtx (DImode);
29568 emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
29569 after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
29572 /* Emit the transition instructions themselves. */
29573 emit_insn (seq);
29575 /* Restore the clobbered registers. */
29576 for (auto *insn : after)
29577 emit_insn (insn);
29580 /* Return true if INSN references the SME state represented by hard register
29581 REGNO. */
29583 static bool
29584 aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
29586 df_ref ref;
29587 FOR_EACH_INSN_DEF (ref, insn)
29588 if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
29589 && DF_REF_REGNO (ref) == regno)
29590 return true;
29591 FOR_EACH_INSN_USE (ref, insn)
29592 if (DF_REF_REGNO (ref) == regno)
29593 return true;
29594 return false;
29597 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29599 static aarch64_local_sme_state
29600 aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
29602 if (!CALL_P (insn)
29603 && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29605 static bool reported;
29606 if (!reported)
29608 sorry ("catching non-call exceptions in functions with SME state");
29609 reported = true;
29611 /* Aim for graceful error recovery by picking the value that is
29612 least likely to generate an ICE. */
29613 return aarch64_local_sme_state::INACTIVE_LOCAL;
29616 /* A non-local goto is equivalent to a return. We disallow non-local
29617 receivers in functions with SME state, so we know that the target
29618 expects ZA to be dormant or off. */
29619 if (JUMP_P (insn)
29620 && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
29621 return aarch64_local_sme_state::INACTIVE_CALLER;
29623 /* start_private_za_call and end_private_za_call bracket a sequence
29624 that calls a private-ZA function. Force ZA to be turned off if the
29625 function doesn't have any live ZA state, otherwise require ZA to be
29626 inactive. */
29627 auto icode = recog_memoized (insn);
29628 if (icode == CODE_FOR_aarch64_start_private_za_call
29629 || icode == CODE_FOR_aarch64_end_private_za_call)
29630 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29631 ? aarch64_local_sme_state::INACTIVE_LOCAL
29632 : aarch64_local_sme_state::OFF);
29634 /* Force ZA to contain the current function's ZA state if INSN wants
29635 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29636 are both controlled by PSTATE.ZA. */
29637 if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
29638 || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
29639 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29640 ? aarch64_local_sme_state::ACTIVE_LIVE
29641 : aarch64_local_sme_state::ACTIVE_DEAD);
29643 return aarch64_local_sme_state::ANY;
29646 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29648 static aarch64_tristate_mode
29649 aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
29651 /* We need to set up a lazy save buffer no later than the first
29652 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29653 if (aarch64_mode_needed_local_sme_state (insn, live)
29654 == aarch64_local_sme_state::INACTIVE_LOCAL)
29655 return aarch64_tristate_mode::YES;
29657 /* Also make sure that the lazy save buffer is set up before the first
29658 insn that throws internally. The exception handler will sometimes
29659 load from it. */
29660 if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29661 return aarch64_tristate_mode::YES;
29663 return aarch64_tristate_mode::MAYBE;
29666 /* Implement TARGET_MODE_NEEDED. */
29668 static int
29669 aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
29671 switch (aarch64_mode_entity (entity))
29673 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29674 return int (aarch64_mode_needed_za_save_buffer (insn, live));
29676 case aarch64_mode_entity::LOCAL_SME_STATE:
29677 return int (aarch64_mode_needed_local_sme_state (insn, live));
29679 gcc_unreachable ();
29682 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29684 static aarch64_local_sme_state
29685 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
29686 HARD_REG_SET live)
29688 /* Note places where ZA dies, so that we can try to avoid saving and
29689 restoring state that isn't needed. */
29690 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29691 && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
29692 return aarch64_local_sme_state::ACTIVE_DEAD;
29694 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29695 function. */
29696 if (mode == aarch64_local_sme_state::ACTIVE_DEAD
29697 && TEST_HARD_REG_BIT (live, ZA_REGNUM))
29698 return aarch64_local_sme_state::ACTIVE_LIVE;
29700 return mode;
29703 /* Implement TARGET_MODE_AFTER. */
29705 static int
29706 aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
29708 switch (aarch64_mode_entity (entity))
29710 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29711 return mode;
29713 case aarch64_mode_entity::LOCAL_SME_STATE:
29714 return int (aarch64_mode_after_local_sme_state
29715 (aarch64_local_sme_state (mode), live));
29717 gcc_unreachable ();
29720 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
29722 static aarch64_local_sme_state
29723 aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
29724 aarch64_local_sme_state mode2)
29726 /* Perform a symmetrical check for two values. */
29727 auto is_pair = [&](aarch64_local_sme_state val1,
29728 aarch64_local_sme_state val2)
29730 return ((mode1 == val1 && mode2 == val2)
29731 || (mode1 == val2 && mode2 == val1));
29734 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
29735 to a caller. OFF is one of the options. */
29736 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
29737 aarch64_local_sme_state::OFF))
29738 return aarch64_local_sme_state::INACTIVE_CALLER;
29740 /* Similarly for dormant contents belonging to the current function. */
29741 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
29742 aarch64_local_sme_state::OFF))
29743 return aarch64_local_sme_state::INACTIVE_LOCAL;
29745 /* Treat a conditionally-initialized value as a fully-initialized value. */
29746 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
29747 aarch64_local_sme_state::ACTIVE_DEAD))
29748 return aarch64_local_sme_state::ACTIVE_LIVE;
29750 return aarch64_local_sme_state::ANY;
29753 /* Implement TARGET_MODE_CONFLUENCE. */
29755 static int
29756 aarch64_mode_confluence (int entity, int mode1, int mode2)
29758 gcc_assert (mode1 != mode2);
29759 switch (aarch64_mode_entity (entity))
29761 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29762 return int (aarch64_tristate_mode::MAYBE);
29764 case aarch64_mode_entity::LOCAL_SME_STATE:
29765 return int (aarch64_local_sme_confluence
29766 (aarch64_local_sme_state (mode1),
29767 aarch64_local_sme_state (mode2)));
29769 gcc_unreachable ();
29772 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
29773 NO throughout, or makes one transition from NO to YES. */
29775 static aarch64_tristate_mode
29776 aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
29777 aarch64_tristate_mode mode2)
29779 /* Keep bringing the transition forward until it starts from NO. */
29780 if (mode1 == aarch64_tristate_mode::MAYBE
29781 && mode2 == aarch64_tristate_mode::YES)
29782 return mode2;
29784 return aarch64_tristate_mode::MAYBE;
29787 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
29789 static aarch64_local_sme_state
29790 aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
29791 aarch64_local_sme_state mode2)
29793 /* We always need to know what the current state is when transitioning
29794 to a new state. Force any location with indeterminate starting state
29795 to be active. */
29796 if (mode1 == aarch64_local_sme_state::ANY)
29797 switch (mode2)
29799 case aarch64_local_sme_state::INACTIVE_CALLER:
29800 case aarch64_local_sme_state::OFF:
29801 case aarch64_local_sme_state::ACTIVE_DEAD:
29802 /* The current function's ZA state is not live. */
29803 return aarch64_local_sme_state::ACTIVE_DEAD;
29805 case aarch64_local_sme_state::INACTIVE_LOCAL:
29806 case aarch64_local_sme_state::ACTIVE_LIVE:
29807 /* The current function's ZA state is live. */
29808 return aarch64_local_sme_state::ACTIVE_LIVE;
29810 case aarch64_local_sme_state::SAVED_LOCAL:
29811 /* This is a transition to an exception handler. Since we don't
29812 support non-call exceptions for SME functions, the source of
29813 the transition must be known. We'll assert later if that's
29814 not the case. */
29815 return aarch64_local_sme_state::ANY;
29817 case aarch64_local_sme_state::ANY:
29818 return aarch64_local_sme_state::ANY;
29821 return aarch64_local_sme_state::ANY;
29824 /* Implement TARGET_MODE_BACKPROP. */
29826 static int
29827 aarch64_mode_backprop (int entity, int mode1, int mode2)
29829 switch (aarch64_mode_entity (entity))
29831 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29832 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
29833 aarch64_tristate_mode (mode2)));
29835 case aarch64_mode_entity::LOCAL_SME_STATE:
29836 return int (aarch64_local_sme_backprop
29837 (aarch64_local_sme_state (mode1),
29838 aarch64_local_sme_state (mode2)));
29840 gcc_unreachable ();
29843 /* Implement TARGET_MODE_ENTRY. */
29845 static int
29846 aarch64_mode_entry (int entity)
29848 switch (aarch64_mode_entity (entity))
29850 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29851 return int (aarch64_tristate_mode::NO);
29853 case aarch64_mode_entity::LOCAL_SME_STATE:
29854 return int (aarch64_cfun_shared_flags ("za") != 0
29855 ? aarch64_local_sme_state::ACTIVE_LIVE
29856 : aarch64_cfun_incoming_pstate_za () != 0
29857 ? aarch64_local_sme_state::ACTIVE_DEAD
29858 : aarch64_local_sme_state::INACTIVE_CALLER);
29860 gcc_unreachable ();
29863 /* Implement TARGET_MODE_EXIT. */
29865 static int
29866 aarch64_mode_exit (int entity)
29868 switch (aarch64_mode_entity (entity))
29870 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29871 return int (aarch64_tristate_mode::MAYBE);
29873 case aarch64_mode_entity::LOCAL_SME_STATE:
29874 return int (aarch64_cfun_shared_flags ("za") != 0
29875 ? aarch64_local_sme_state::ACTIVE_LIVE
29876 : aarch64_cfun_incoming_pstate_za () != 0
29877 ? aarch64_local_sme_state::ACTIVE_DEAD
29878 : aarch64_local_sme_state::INACTIVE_CALLER);
29880 gcc_unreachable ();
29883 /* Implement TARGET_MODE_EH_HANDLER. */
29885 static int
29886 aarch64_mode_eh_handler (int entity)
29888 switch (aarch64_mode_entity (entity))
29890 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29891 /* Require a lazy save buffer to be allocated before the first
29892 insn that can throw. */
29893 return int (aarch64_tristate_mode::YES);
29895 case aarch64_mode_entity::LOCAL_SME_STATE:
29896 return int (aarch64_local_sme_state::SAVED_LOCAL);
29898 gcc_unreachable ();
29901 /* Implement TARGET_MODE_PRIORITY. */
29903 static int
29904 aarch64_mode_priority (int, int n)
29906 return n;
29909 /* Implement TARGET_MD_ASM_ADJUST. */
29911 static rtx_insn *
29912 aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
29913 vec<machine_mode> &input_modes,
29914 vec<const char *> &constraints,
29915 vec<rtx> &uses, vec<rtx> &clobbers,
29916 HARD_REG_SET &clobbered_regs, location_t loc)
29918 rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
29919 uses, clobbers, clobbered_regs, loc);
29921 /* "za" in the clobber list of a function with ZA state is defined to
29922 mean that the asm can read from and write to ZA. We can model the
29923 read using a USE, but unfortunately, it's not possible to model the
29924 write directly. Use a separate insn to model the effect.
29926 We must ensure that ZA is active on entry, which is enforced by using
29927 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
29929 The same thing applies to ZT0. */
29930 if (TARGET_ZA)
29931 for (unsigned int i = clobbers.length (); i-- > 0; )
29933 rtx x = clobbers[i];
29934 if (REG_P (x)
29935 && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
29937 auto id = cfun->machine->next_asm_update_za_id++;
29939 start_sequence ();
29940 if (seq)
29941 emit_insn (seq);
29942 rtx id_rtx = gen_int_mode (id, SImode);
29943 emit_insn (REGNO (x) == ZA_REGNUM
29944 ? gen_aarch64_asm_update_za (id_rtx)
29945 : gen_aarch64_asm_update_zt0 (id_rtx));
29946 seq = get_insns ();
29947 end_sequence ();
29949 auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
29950 uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
29951 uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
29953 clobbers.ordered_remove (i);
29954 CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
29957 return seq;
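/* Illustrative source-level trigger (assumed example): in a function with
   ZA state, e.g. one marked with the ACLE keyword __arm_inout("za"),

     asm volatile ("..." : : : "za");

   is rewritten so that the asm USEs ZA_REGNUM and SME_STATE_REGNUM and is
   followed by an aarch64_asm_update_za instruction that models the
   potential write to ZA.  */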
29960 /* BB is the target of an exception or nonlocal goto edge, which means
29961 that PSTATE.SM is known to be 0 on entry. Put it into the state that
29962 the current function requires. */
29964 static bool
29965 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb)
29967 if (TARGET_NON_STREAMING)
29968 return false;
29970 start_sequence ();
29971 rtx_insn *guard_label = nullptr;
29972 if (TARGET_STREAMING_COMPATIBLE)
29973 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29974 AARCH64_FL_SM_OFF);
29975 aarch64_sme_mode_switch_regs args_switch;
29976 args_switch.add_call_preserved_regs (df_get_live_in (bb));
29977 args_switch.emit_prologue ();
29978 aarch64_switch_pstate_sm (AARCH64_FL_SM_OFF, AARCH64_FL_SM_ON);
29979 args_switch.emit_epilogue ();
29980 if (guard_label)
29981 emit_label (guard_label);
29982 auto seq = get_insns ();
29983 end_sequence ();
29985 emit_insn_after (seq, bb_note (bb));
29986 return true;
29989 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
29990 so arrange to make it so. */
29992 static bool
29993 aarch64_switch_pstate_sm_for_jump (rtx_insn *jump)
29995 if (TARGET_NON_STREAMING)
29996 return false;
29998 start_sequence ();
29999 rtx_insn *guard_label = nullptr;
30000 if (TARGET_STREAMING_COMPATIBLE)
30001 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30002 AARCH64_FL_SM_OFF);
30003 aarch64_switch_pstate_sm (AARCH64_FL_SM_ON, AARCH64_FL_SM_OFF);
30004 if (guard_label)
30005 emit_label (guard_label);
30006 auto seq = get_insns ();
30007 end_sequence ();
30009 emit_insn_before (seq, jump);
30010 return true;
30013 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30014 to switch to the new mode and the instructions needed to restore the
30015 original mode. Return true if something changed. */
30016 static bool
30017 aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
30019 /* Mode switches for sibling calls are handled via the epilogue. */
30020 if (SIBLING_CALL_P (call))
30021 return false;
30023 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
30024 if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
30025 return false;
30027 /* Switch mode before the call, preserving any argument registers
30028 across the switch. */
30029 start_sequence ();
30030 rtx_insn *args_guard_label = nullptr;
30031 if (TARGET_STREAMING_COMPATIBLE)
30032 args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30033 callee_isa_mode);
30034 aarch64_sme_mode_switch_regs args_switch;
30035 args_switch.add_call_args (call);
30036 args_switch.emit_prologue ();
30037 aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
30038 args_switch.emit_epilogue ();
30039 if (args_guard_label)
30040 emit_label (args_guard_label);
30041 auto args_seq = get_insns ();
30042 end_sequence ();
30043 emit_insn_before (args_seq, call);
30045 if (find_reg_note (call, REG_NORETURN, NULL_RTX))
30046 return true;
30048 /* Switch mode after the call, preserving any return registers across
30049 the switch. */
30050 start_sequence ();
30051 rtx_insn *return_guard_label = nullptr;
30052 if (TARGET_STREAMING_COMPATIBLE)
30053 return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30054 callee_isa_mode);
30055 aarch64_sme_mode_switch_regs return_switch;
30056 return_switch.add_call_result (call);
30057 return_switch.emit_prologue ();
30058 aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
30059 return_switch.emit_epilogue ();
30060 if (return_guard_label)
30061 emit_label (return_guard_label);
30062 auto result_seq = get_insns ();
30063 end_sequence ();
30064 emit_insn_after (result_seq, call);
30065 return true;
30068 namespace {
30070 const pass_data pass_data_switch_pstate_sm =
30072 RTL_PASS, // type
30073 "smstarts", // name
30074 OPTGROUP_NONE, // optinfo_flags
30075 TV_NONE, // tv_id
30076 0, // properties_required
30077 0, // properties_provided
30078 0, // properties_destroyed
30079 0, // todo_flags_start
30080 TODO_df_finish, // todo_flags_finish
30083 class pass_switch_pstate_sm : public rtl_opt_pass
30085 public:
30086 pass_switch_pstate_sm (gcc::context *ctxt)
30087 : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
30090 // opt_pass methods:
30091 bool gate (function *) override final;
30092 unsigned int execute (function *) override final;
30095 bool
30096 pass_switch_pstate_sm::gate (function *fn)
30098 return (aarch64_fndecl_pstate_sm (fn->decl) != AARCH64_FL_SM_OFF
30099 || cfun->machine->call_switches_pstate_sm);
30102 /* Emit any instructions needed to switch PSTATE.SM. */
30103 unsigned int
30104 pass_switch_pstate_sm::execute (function *fn)
30106 basic_block bb;
30108 auto_sbitmap blocks (last_basic_block_for_fn (cfun));
30109 bitmap_clear (blocks);
30110 FOR_EACH_BB_FN (bb, fn)
30112 if (has_abnormal_call_or_eh_pred_edge_p (bb)
30113 && aarch64_switch_pstate_sm_for_landing_pad (bb))
30114 bitmap_set_bit (blocks, bb->index);
30116 if (cfun->machine->call_switches_pstate_sm)
30118 rtx_insn *insn;
30119 FOR_BB_INSNS (bb, insn)
30120 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
30121 if (aarch64_switch_pstate_sm_for_call (call))
30122 bitmap_set_bit (blocks, bb->index);
30125 auto end = BB_END (bb);
30126 if (JUMP_P (end)
30127 && find_reg_note (end, REG_NON_LOCAL_GOTO, NULL_RTX)
30128 && aarch64_switch_pstate_sm_for_jump (end))
30129 bitmap_set_bit (blocks, bb->index);
30131 find_many_sub_basic_blocks (blocks);
30132 clear_aux_for_blocks ();
30133 return 0;
30138 rtl_opt_pass *
30139 make_pass_switch_pstate_sm (gcc::context *ctxt)
30141 return new pass_switch_pstate_sm (ctxt);
30144 /* Parse an implementation-defined system register name of
30145 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30146 Return true if the name matches the above pattern, false
30147 otherwise. */
30148 bool
30149 aarch64_is_implem_def_reg (const char *regname)
30151 unsigned pos = 0;
30152 unsigned name_len = strlen (regname);
30153 if (name_len < 12 || name_len > 14)
30154 return false;
30156 auto cterm_valid_p = [&]()
30158 bool leading_zero_p = false;
30159 unsigned i = 0;
30160 char n[3] = {0};
30162 if (regname[pos] != 'c')
30163 return false;
30164 pos++;
30165 while (regname[pos] != '_')
30167 if (leading_zero_p)
30168 return false;
30169 if (i == 0 && regname[pos] == '0')
30170 leading_zero_p = true;
30171 if (i > 2)
30172 return false;
30173 if (!ISDIGIT (regname[pos]))
30174 return false;
30175 n[i++] = regname[pos++];
30177 if (atoi (n) > 15)
30178 return false;
30179 return true;
30182 if (regname[pos] != 's')
30183 return false;
30184 pos++;
30185 if (regname[pos] < '0' || regname[pos] > '3')
30186 return false;
30187 pos++;
30188 if (regname[pos++] != '_')
30189 return false;
30190 if (regname[pos] < '0' || regname[pos] > '7')
30191 return false;
30192 pos++;
30193 if (regname[pos++] != '_')
30194 return false;
30195 if (!cterm_valid_p ())
30196 return false;
30197 if (regname[pos++] != '_')
30198 return false;
30199 if (!cterm_valid_p ())
30200 return false;
30201 if (regname[pos++] != '_')
30202 return false;
30203 if (regname[pos] < '0' || regname[pos] > '7')
30204 return false;
30205 return true;
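/* Examples (illustrative): "s3_0_c15_c2_0" and "s2_7_c0_c15_7" match the
   pattern above; "s4_0_c1_c0_0" (first field > 3), "s3_0_c16_c0_0"
   (CRn > 15) and "s3_0_c15_c2_8" (last field > 7) do not.  The name must be
   given in lower case.  */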
30208 /* Return true if REGNAME matches either a known permitted system
30209 register name, or a generic sysreg specification. For use in
30210 back-end predicate `aarch64_sysreg_string'. */
30211 bool
30212 aarch64_valid_sysreg_name_p (const char *regname)
30214 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30215 if (sysreg == NULL)
30216 return aarch64_is_implem_def_reg (regname);
30217 if (sysreg->arch_reqs)
30218 return (aarch64_isa_flags & sysreg->arch_reqs);
30219 return true;
30222 /* Return the generic sysreg specification for a valid system register
30223 name, otherwise NULL. WRITE_P is true iff the register is being
30224 written to. IS128OP indicates the requested system register should
30225 be checked for a 128-bit implementation. */
30226 const char *
30227 aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
30229 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30230 if (sysreg == NULL)
30232 if (aarch64_is_implem_def_reg (regname))
30233 return regname;
30234 else
30235 return NULL;
30237 if (is128op && !(sysreg->properties & F_REG_128))
30238 return NULL;
30239 if ((write_p && (sysreg->properties & F_REG_READ))
30240 || (!write_p && (sysreg->properties & F_REG_WRITE)))
30241 return NULL;
30242 if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
30243 return NULL;
30244 return sysreg->encoding;
30247 /* Target-specific selftests. */
30249 #if CHECKING_P
30251 namespace selftest {
30253 /* Selftest for the RTL loader.
30254 Verify that the RTL loader copes with a dump from
30255 print_rtx_function. This is essentially just a test that class
30256 function_reader can handle a real dump, but it also verifies
30257 that lookup_reg_by_dump_name correctly handles hard regs.
30258 The presence of hard reg names in the dump means that the test is
30259 target-specific, hence it is in this file. */
30261 static void
30262 aarch64_test_loading_full_dump ()
30264 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
30266 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
30268 rtx_insn *insn_1 = get_insn_by_uid (1);
30269 ASSERT_EQ (NOTE, GET_CODE (insn_1));
30271 rtx_insn *insn_15 = get_insn_by_uid (15);
30272 ASSERT_EQ (INSN, GET_CODE (insn_15));
30273 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
30275 /* Verify crtl->return_rtx. */
30276 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
30277 ASSERT_EQ (0, REGNO (crtl->return_rtx));
30278 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
30281 /* Test the fractional_cost class. */
30283 static void
30284 aarch64_test_fractional_cost ()
30286 using cf = fractional_cost;
30288 ASSERT_EQ (cf (0, 20), 0);
30290 ASSERT_EQ (cf (4, 2), 2);
30291 ASSERT_EQ (3, cf (9, 3));
30293 ASSERT_NE (cf (5, 2), 2);
30294 ASSERT_NE (3, cf (8, 3));
30296 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30297 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30298 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30300 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30301 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30302 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30303 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30304 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30305 ASSERT_EQ (3 - cf (10, 3), 0);
30307 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30308 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30310 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30311 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30312 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30313 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30314 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30315 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30316 ASSERT_TRUE (cf (239, 240) <= 1);
30317 ASSERT_TRUE (cf (240, 240) <= 1);
30318 ASSERT_FALSE (cf (241, 240) <= 1);
30319 ASSERT_FALSE (2 <= cf (207, 104));
30320 ASSERT_TRUE (2 <= cf (208, 104));
30321 ASSERT_TRUE (2 <= cf (209, 104));
30323 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30324 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30325 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30326 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30327 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30328 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30329 ASSERT_TRUE (cf (239, 240) < 1);
30330 ASSERT_FALSE (cf (240, 240) < 1);
30331 ASSERT_FALSE (cf (241, 240) < 1);
30332 ASSERT_FALSE (2 < cf (207, 104));
30333 ASSERT_FALSE (2 < cf (208, 104));
30334 ASSERT_TRUE (2 < cf (209, 104));
30336 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30337 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30338 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30339 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30340 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30341 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30342 ASSERT_FALSE (cf (239, 240) >= 1);
30343 ASSERT_TRUE (cf (240, 240) >= 1);
30344 ASSERT_TRUE (cf (241, 240) >= 1);
30345 ASSERT_TRUE (2 >= cf (207, 104));
30346 ASSERT_TRUE (2 >= cf (208, 104));
30347 ASSERT_FALSE (2 >= cf (209, 104));
30349 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30350 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30351 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30352 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30353 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30354 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30355 ASSERT_FALSE (cf (239, 240) > 1);
30356 ASSERT_FALSE (cf (240, 240) > 1);
30357 ASSERT_TRUE (cf (241, 240) > 1);
30358 ASSERT_TRUE (2 > cf (207, 104));
30359 ASSERT_FALSE (2 > cf (208, 104));
30360 ASSERT_FALSE (2 > cf (209, 104));
30362 ASSERT_EQ (cf (1, 2).ceil (), 1);
30363 ASSERT_EQ (cf (11, 7).ceil (), 2);
30364 ASSERT_EQ (cf (20, 1).ceil (), 20);
30365 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30366 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30367 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30368 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30369 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30371 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
30372 }
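/* A minimal usage sketch (the variable names are hypothetical):
   fractional_cost keeps issue-rate style costs exact until a final
   rounding, e.g. seven operations on a three-wide pipe:

     fractional_cost cycles (7, 3);          // 7/3 cycles
     unsigned int rounded = cycles.ceil ();  // rounds up to 3  */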
30374 /* Check whether our system register data, as imported from
30375 `aarch64-sys-regs.def', has any duplicate entries. */
30376 static void
30377 aarch64_test_sysreg_encoding_clashes (void)
30378 {
30379 using dup_instances_t = hash_map<nofree_string_hash,
30380 std::vector<const sysreg_t*>>;
30382 dup_instances_t duplicate_instances;
30384 /* Every time an encoding is found to occur more than once, we add it
30385 to a "clash-analysis queue", which is then used to extract the
30386 necessary information from our hash map when deciding whether the
30387 repeated encodings are valid. */
30389 /* 1) Collect recurrence information. */
30390 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
30391 {
30392 const sysreg_t *reg = aarch64_sysregs + i;
30394 std::vector<const sysreg_t*> *tmp
30395 = &duplicate_instances.get_or_insert (reg->encoding);
30397 tmp->push_back (reg);
30398 }
30400 /* 2) Carry out analysis on collected data. */
30401 for (auto instance : duplicate_instances)
30402 {
30403 unsigned nrep = instance.second.size ();
30404 if (nrep > 1)
30405 for (unsigned i = 0; i < nrep; i++)
30406 for (unsigned j = i + 1; j < nrep; j++)
30407 {
30408 const sysreg_t *a = instance.second[i];
30409 const sysreg_t *b = instance.second[j];
30410 ASSERT_TRUE ((a->properties != b->properties)
30411 || (a->arch_reqs != b->arch_reqs));
30412 }
30413 }
30414 }
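/* Note: a repeated encoding passes this check only when the clashing
   entries differ in their properties or in their architecture requirements,
   e.g. when the same underlying encoding is listed once as a read-only view
   (F_REG_READ) and once as a write-only view (F_REG_WRITE).  Two entries
   agreeing in both fields would trip the ASSERT_TRUE above.  */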
30416 /* Run all target-specific selftests. */
30418 static void
30419 aarch64_run_selftests (void)
30420 {
30421 aarch64_test_loading_full_dump ();
30422 aarch64_test_fractional_cost ();
30423 aarch64_test_sysreg_encoding_clashes ();
30424 }
30426 } // namespace selftest
30428 #endif /* #if CHECKING_P */
30430 #undef TARGET_STACK_PROTECT_GUARD
30431 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30433 #undef TARGET_ADDRESS_COST
30434 #define TARGET_ADDRESS_COST aarch64_address_cost
30436 /* This hook determines whether unnamed bitfields affect the alignment
30437 of the containing structure. The hook returns true if the structure
30438 should inherit the alignment requirements of an unnamed bitfield's
30439 type. */
30440 #undef TARGET_ALIGN_ANON_BITFIELD
30441 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
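/* Expected effect (a sketch, stated as an assumption rather than a
   guarantee): with the hook returning true, the unnamed bitfield's declared
   type contributes to the alignment of the enclosing struct, e.g.

     struct s { char c; int : 7; };   // alignof (struct s) expected to equal alignof (int)  */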
30443 #undef TARGET_ASM_ALIGNED_DI_OP
30444 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30446 #undef TARGET_ASM_ALIGNED_HI_OP
30447 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30449 #undef TARGET_ASM_ALIGNED_SI_OP
30450 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30452 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30453 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30454 hook_bool_const_tree_hwi_hwi_const_tree_true
30456 #undef TARGET_ASM_FILE_START
30457 #define TARGET_ASM_FILE_START aarch64_start_file
30459 #undef TARGET_ASM_OUTPUT_MI_THUNK
30460 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30462 #undef TARGET_ASM_SELECT_RTX_SECTION
30463 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30465 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30466 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30468 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30469 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30471 #undef TARGET_BUILD_BUILTIN_VA_LIST
30472 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30474 #undef TARGET_CALLEE_COPIES
30475 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30477 #undef TARGET_FRAME_POINTER_REQUIRED
30478 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30480 #undef TARGET_CAN_ELIMINATE
30481 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30483 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30484 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30485 aarch64_function_attribute_inlinable_p
30487 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30488 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30490 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30491 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30493 #undef TARGET_CAN_INLINE_P
30494 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30496 #undef TARGET_CANNOT_FORCE_CONST_MEM
30497 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30499 #undef TARGET_CASE_VALUES_THRESHOLD
30500 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30502 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30503 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30505 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30506 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30508 /* Only the least significant bit is used for initialization guard
30509 variables. */
30510 #undef TARGET_CXX_GUARD_MASK_BIT
30511 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
30513 #undef TARGET_C_MODE_FOR_SUFFIX
30514 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30516 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30517 #undef TARGET_DEFAULT_TARGET_FLAGS
30518 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30519 #endif
30521 #undef TARGET_CLASS_MAX_NREGS
30522 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30524 #undef TARGET_BUILTIN_DECL
30525 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30527 #undef TARGET_BUILTIN_RECIPROCAL
30528 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30530 #undef TARGET_C_EXCESS_PRECISION
30531 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30533 #undef TARGET_C_BITINT_TYPE_INFO
30534 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
30536 #undef TARGET_EXPAND_BUILTIN
30537 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30539 #undef TARGET_EXPAND_BUILTIN_VA_START
30540 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30542 #undef TARGET_FOLD_BUILTIN
30543 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30545 #undef TARGET_FUNCTION_ARG
30546 #define TARGET_FUNCTION_ARG aarch64_function_arg
30548 #undef TARGET_FUNCTION_ARG_ADVANCE
30549 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30551 #undef TARGET_FUNCTION_ARG_BOUNDARY
30552 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30554 #undef TARGET_FUNCTION_ARG_PADDING
30555 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30557 #undef TARGET_GET_RAW_RESULT_MODE
30558 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30559 #undef TARGET_GET_RAW_ARG_MODE
30560 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30562 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30563 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30565 #undef TARGET_FUNCTION_VALUE
30566 #define TARGET_FUNCTION_VALUE aarch64_function_value
30568 #undef TARGET_FUNCTION_VALUE_REGNO_P
30569 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30571 #undef TARGET_START_CALL_ARGS
30572 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30574 #undef TARGET_END_CALL_ARGS
30575 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30577 #undef TARGET_GIMPLE_FOLD_BUILTIN
30578 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30580 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30581 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30583 #undef TARGET_INIT_BUILTINS
30584 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30586 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30587 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30588 aarch64_ira_change_pseudo_allocno_class
30590 #undef TARGET_LEGITIMATE_ADDRESS_P
30591 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30593 #undef TARGET_LEGITIMATE_CONSTANT_P
30594 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30596 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30597 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30598 aarch64_legitimize_address_displacement
30600 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30601 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30603 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30604 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30605 aarch64_libgcc_floating_mode_supported_p
30607 #undef TARGET_MANGLE_TYPE
30608 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30610 #undef TARGET_INVALID_BINARY_OP
30611 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30613 #undef TARGET_VERIFY_TYPE_CONTEXT
30614 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30616 #undef TARGET_MEMORY_MOVE_COST
30617 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30619 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30620 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30622 #undef TARGET_MUST_PASS_IN_STACK
30623 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30625 /* This target hook should return true if accesses to volatile bitfields
30626 should use the narrowest mode possible. It should return false if these
30627 accesses should use the bitfield container type. */
30628 #undef TARGET_NARROW_VOLATILE_BITFIELD
30629 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
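/* Expected effect: with this hook returning false, a volatile bitfield
   access is expected to use the declared container type rather than the
   narrowest possible mode, e.g. reading x.f below with a 32-bit access:

     struct s { volatile unsigned int f : 8; } x;
     unsigned int v = x.f;  */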
30631 #undef TARGET_OPTION_OVERRIDE
30632 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30634 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30635 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30636 aarch64_override_options_after_change
30638 #undef TARGET_OFFLOAD_OPTIONS
30639 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30641 #undef TARGET_OPTION_RESTORE
30642 #define TARGET_OPTION_RESTORE aarch64_option_restore
30644 #undef TARGET_OPTION_PRINT
30645 #define TARGET_OPTION_PRINT aarch64_option_print
30647 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30648 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30650 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30651 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30652 aarch64_option_valid_version_attribute_p
30654 #undef TARGET_SET_CURRENT_FUNCTION
30655 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30657 #undef TARGET_PASS_BY_REFERENCE
30658 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30660 #undef TARGET_PREFERRED_RELOAD_CLASS
30661 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30663 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30664 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30666 #undef TARGET_DWARF_FRAME_REG_MODE
30667 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30669 #undef TARGET_PROMOTED_TYPE
30670 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30672 #undef TARGET_SECONDARY_RELOAD
30673 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30675 #undef TARGET_SECONDARY_MEMORY_NEEDED
30676 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30678 #undef TARGET_SHIFT_TRUNCATION_MASK
30679 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30681 #undef TARGET_SETUP_INCOMING_VARARGS
30682 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30684 #undef TARGET_STRUCT_VALUE_RTX
30685 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30687 #undef TARGET_REGISTER_MOVE_COST
30688 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30690 #undef TARGET_RETURN_IN_MEMORY
30691 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30693 #undef TARGET_RETURN_IN_MSB
30694 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30696 #undef TARGET_RTX_COSTS
30697 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30699 #undef TARGET_INSN_COST
30700 #define TARGET_INSN_COST aarch64_insn_cost
30702 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30703 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30705 #undef TARGET_SCHED_ISSUE_RATE
30706 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
30708 #undef TARGET_SCHED_VARIABLE_ISSUE
30709 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
30711 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
30712 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
30713 aarch64_sched_first_cycle_multipass_dfa_lookahead
30715 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
30716 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
30717 aarch64_first_cycle_multipass_dfa_lookahead_guard
30719 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
30720 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
30721 aarch64_get_separate_components
30723 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
30724 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
30725 aarch64_components_for_bb
30727 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
30728 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
30729 aarch64_disqualify_components
30731 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
30732 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
30733 aarch64_emit_prologue_components
30735 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
30736 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
30737 aarch64_emit_epilogue_components
30739 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
30740 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
30741 aarch64_set_handled_components
30743 #undef TARGET_TRAMPOLINE_INIT
30744 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
30746 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
30747 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
30749 #undef TARGET_VECTOR_MODE_SUPPORTED_P
30750 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
30752 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
30753 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
30755 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
30756 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
30758 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
30759 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
30760 aarch64_builtin_support_vector_misalignment
30762 #undef TARGET_ARRAY_MODE
30763 #define TARGET_ARRAY_MODE aarch64_array_mode
30765 #undef TARGET_ARRAY_MODE_SUPPORTED_P
30766 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
30768 #undef TARGET_VECTORIZE_CREATE_COSTS
30769 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
30771 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
30772 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
30773 aarch64_builtin_vectorization_cost
30775 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
30776 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
30778 #undef TARGET_VECTORIZE_BUILTINS
30779 #define TARGET_VECTORIZE_BUILTINS
30781 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
30782 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
30783 aarch64_autovectorize_vector_modes
30785 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
30786 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
30787 aarch64_atomic_assign_expand_fenv
30789 /* Section anchor support. */
30791 #undef TARGET_MIN_ANCHOR_OFFSET
30792 #define TARGET_MIN_ANCHOR_OFFSET -256
30794 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
30795 byte offset; we can do much more for larger data types, but have no way
30796 to determine the size of the access. We assume accesses are aligned. */
30797 #undef TARGET_MAX_ANCHOR_OFFSET
30798 #define TARGET_MAX_ANCHOR_OFFSET 4095
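/* These limits appear to match the offsets directly encodable in AArch64
   addressing modes (an assumption, for orientation only): -256..255 for the
   unscaled LDUR/STUR forms and 0..4095 for the scaled unsigned 12-bit
   immediate of a byte load/store, so anchored accesses need no extra
   address arithmetic.  */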
30800 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
30801 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
30802 aarch64_vectorize_preferred_div_as_shifts_over_mult
30804 #undef TARGET_VECTOR_ALIGNMENT
30805 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
30807 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
30808 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
30809 aarch64_vectorize_preferred_vector_alignment
30810 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
30811 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
30812 aarch64_simd_vector_alignment_reachable
30814 /* vec_perm support. */
30816 #undef TARGET_VECTORIZE_VEC_PERM_CONST
30817 #define TARGET_VECTORIZE_VEC_PERM_CONST \
30818 aarch64_vectorize_vec_perm_const
30820 #undef TARGET_VECTORIZE_RELATED_MODE
30821 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
30822 #undef TARGET_VECTORIZE_GET_MASK_MODE
30823 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
30824 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
30825 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
30826 aarch64_empty_mask_is_expensive
30827 #undef TARGET_PREFERRED_ELSE_VALUE
30828 #define TARGET_PREFERRED_ELSE_VALUE \
30829 aarch64_preferred_else_value
30831 #undef TARGET_INIT_LIBFUNCS
30832 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
30834 #undef TARGET_FIXED_CONDITION_CODE_REGS
30835 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
30837 #undef TARGET_FLAGS_REGNUM
30838 #define TARGET_FLAGS_REGNUM CC_REGNUM
30840 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
30841 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
30843 #undef TARGET_ASAN_SHADOW_OFFSET
30844 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
30846 #undef TARGET_LEGITIMIZE_ADDRESS
30847 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
30849 #undef TARGET_SCHED_CAN_SPECULATE_INSN
30850 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
30852 #undef TARGET_CAN_USE_DOLOOP_P
30853 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
30855 #undef TARGET_SCHED_ADJUST_PRIORITY
30856 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
30858 #undef TARGET_SCHED_MACRO_FUSION_P
30859 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
30861 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
30862 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
30864 #undef TARGET_SCHED_FUSION_PRIORITY
30865 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
30867 #undef TARGET_UNSPEC_MAY_TRAP_P
30868 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
30870 #undef TARGET_USE_PSEUDO_PIC_REG
30871 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
30873 #undef TARGET_PRINT_OPERAND
30874 #define TARGET_PRINT_OPERAND aarch64_print_operand
30876 #undef TARGET_PRINT_OPERAND_ADDRESS
30877 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
30879 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
30880 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
30882 #undef TARGET_OPTAB_SUPPORTED_P
30883 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
30885 #undef TARGET_OMIT_STRUCT_RETURN_REG
30886 #define TARGET_OMIT_STRUCT_RETURN_REG true
30888 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
30889 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
30890 aarch64_dwarf_poly_indeterminate_value
30892 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
30893 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
30894 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
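/* The value 4 is the mask with only bit 2 set (1 << 2), matching the
   comment above that bits 0 and 1 are reserved by the architecture.  */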
30896 #undef TARGET_HARD_REGNO_NREGS
30897 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
30898 #undef TARGET_HARD_REGNO_MODE_OK
30899 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
30901 #undef TARGET_MODES_TIEABLE_P
30902 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
30904 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
30905 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
30906 aarch64_hard_regno_call_part_clobbered
30908 #undef TARGET_INSN_CALLEE_ABI
30909 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
30911 #undef TARGET_CONSTANT_ALIGNMENT
30912 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
30914 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
30915 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
30916 aarch64_stack_clash_protection_alloca_probe_range
30918 #undef TARGET_COMPUTE_PRESSURE_CLASSES
30919 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
30921 #undef TARGET_CAN_CHANGE_MODE_CLASS
30922 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
30924 #undef TARGET_SELECT_EARLY_REMAT_MODES
30925 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
30927 #undef TARGET_SPECULATION_SAFE_VALUE
30928 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
30930 #undef TARGET_ESTIMATED_POLY_VALUE
30931 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
30933 #undef TARGET_ATTRIBUTE_TABLE
30934 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
30936 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
30937 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
30938 aarch64_simd_clone_compute_vecsize_and_simdlen
30940 #undef TARGET_SIMD_CLONE_ADJUST
30941 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
30943 #undef TARGET_SIMD_CLONE_USABLE
30944 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
30946 #undef TARGET_COMP_TYPE_ATTRIBUTES
30947 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
30949 #undef TARGET_MERGE_DECL_ATTRIBUTES
30950 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
30952 #undef TARGET_GET_MULTILIB_ABI_NAME
30953 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
30955 #undef TARGET_FNTYPE_ABI
30956 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
30958 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
30959 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
30961 #if CHECKING_P
30962 #undef TARGET_RUN_TARGET_SELFTESTS
30963 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
30964 #endif /* #if CHECKING_P */
30966 #undef TARGET_ASM_POST_CFI_STARTPROC
30967 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
30969 #undef TARGET_STRICT_ARGUMENT_NAMING
30970 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
30972 #undef TARGET_MODE_EMIT
30973 #define TARGET_MODE_EMIT aarch64_mode_emit
30975 #undef TARGET_MODE_NEEDED
30976 #define TARGET_MODE_NEEDED aarch64_mode_needed
30978 #undef TARGET_MODE_AFTER
30979 #define TARGET_MODE_AFTER aarch64_mode_after
30981 #undef TARGET_MODE_CONFLUENCE
30982 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
30984 #undef TARGET_MODE_BACKPROP
30985 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
30987 #undef TARGET_MODE_ENTRY
30988 #define TARGET_MODE_ENTRY aarch64_mode_entry
30990 #undef TARGET_MODE_EXIT
30991 #define TARGET_MODE_EXIT aarch64_mode_exit
30993 #undef TARGET_MODE_EH_HANDLER
30994 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
30996 #undef TARGET_MODE_PRIORITY
30997 #define TARGET_MODE_PRIORITY aarch64_mode_priority
30999 #undef TARGET_MD_ASM_ADJUST
31000 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31002 #undef TARGET_ASM_FILE_END
31003 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31005 #undef TARGET_ASM_FUNCTION_EPILOGUE
31006 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31008 #undef TARGET_HAVE_SHADOW_CALL_STACK
31009 #define TARGET_HAVE_SHADOW_CALL_STACK true
31011 #undef TARGET_CONST_ANCHOR
31012 #define TARGET_CONST_ANCHOR 0x1000000
31014 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31015 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31017 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31018 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31020 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31021 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31023 #undef TARGET_OPTION_FUNCTION_VERSIONS
31024 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31026 #undef TARGET_COMPARE_VERSION_PRIORITY
31027 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31029 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31030 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31031 aarch64_generate_version_dispatcher_body
31033 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31034 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31035 aarch64_get_function_versions_dispatcher
31037 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31038 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
31040 struct gcc_target targetm = TARGET_INITIALIZER;
31042 #include "gt-aarch64.h"