1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2024 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #define INCLUDE_VECTOR
28 #include "coretypes.h"
39 #include "stringpool.h"
46 #include "diagnostic.h"
47 #include "insn-attr.h"
49 #include "fold-const.h"
50 #include "stor-layout.h"
58 #include "langhooks.h"
62 #include "gimple-iterator.h"
63 #include "tree-vectorizer.h"
64 #include "aarch64-cost-tables.h"
68 #include "tm-constrs.h"
69 #include "sched-int.h"
70 #include "target-globals.h"
71 #include "common/common-target.h"
74 #include "selftest-rtl.h"
75 #include "rtx-vector-builder.h"
78 #include "function-abi.h"
79 #include "gimple-pretty-print.h"
80 #include "tree-ssa-loop-niter.h"
81 #include "fractional-cost.h"
85 #include "aarch64-feature-deps.h"
86 #include "config/arm/aarch-common.h"
87 #include "config/arm/aarch-common-protos.h"
88 #include "common/config/aarch64/cpuinfo.h"
91 #include "tree-pass.h"
93 #include "symbol-summary.h"
97 #include "ipa-fnsummary.h"
100 /* This file should be included last. */
101 #include "target-def.h"
103 /* Defined for convenience. */
104 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
106 /* Maximum bytes set for an inline memset expansion. With -Os use 3 STP
107 and 1 MOVI/DUP (same size as a call). */
108 #define MAX_SET_SIZE(speed) (speed ? 256 : 96)
110 /* Flags that describe how a function shares certain architectural state
113 - AARCH64_STATE_SHARED indicates that the function does share the state
116 - AARCH64_STATE_IN indicates that the function reads (or might read) the
117 incoming state. The converse is that the function ignores the incoming
120 - AARCH64_STATE_OUT indicates that the function returns new state.
121 The converse is that the state on return is the same as it was on entry.
123 A function that partially modifies the state treats it as both IN
124 and OUT (because the value on return depends to some extent on the
/* The three values below are single-bit flags and are combined into a
   bitmask; see aarch64_attribute_shared_state_flags later in this file,
   which returns OR-ed combinations of them.  */
126 constexpr auto AARCH64_STATE_SHARED
= 1U << 0;
127 constexpr auto AARCH64_STATE_IN
= 1U << 1;
128 constexpr auto AARCH64_STATE_OUT
= 1U << 2;
130 /* Information about a legitimate vector immediate operand. */
131 struct simd_immediate_info
133 enum insn_type
{ MOV
, MVN
, INDEX
, PTRUE
};
134 enum modifier_type
{ LSL
, MSL
};
136 simd_immediate_info () {}
137 simd_immediate_info (scalar_float_mode
, rtx
);
138 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
139 insn_type
= MOV
, modifier_type
= LSL
,
141 simd_immediate_info (scalar_mode
, rtx
, rtx
);
142 simd_immediate_info (scalar_int_mode
, aarch64_svpattern
);
144 /* The mode of the elements. */
145 scalar_mode elt_mode
;
147 /* The instruction to use to move the immediate into a vector. */
152 /* For MOV and MVN. */
155 /* The value of each element. */
158 /* The kind of shift modifier to use, and the number of bits to shift.
159 This is (LSL, 0) if no shift is needed. */
160 modifier_type modifier
;
167 /* The value of the first element and the step to be added for each
168 subsequent element. */
173 aarch64_svpattern pattern
;
177 /* Construct a floating-point immediate in which each element has mode
178 ELT_MODE_IN and value VALUE_IN. */
179 inline simd_immediate_info
180 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
181 : elt_mode (elt_mode_in
), insn (MOV
)
/* A floating-point constant is moved as-is, so record the no-op
   shift modifier LSL (with an implicit zero shift) alongside the value.  */
183 u
.mov
.value
= value_in
;
184 u
.mov
.modifier
= LSL
;
188 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
189 and value VALUE_IN. The other parameters are as for the structure
191 inline simd_immediate_info
192 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
193 unsigned HOST_WIDE_INT value_in
,
194 insn_type insn_in
, modifier_type modifier_in
,
195 unsigned int shift_in
)
196 : elt_mode (elt_mode_in
), insn (insn_in
)
/* Wrap the raw integer in an rtx of the element mode so that MOV and
   MVN immediates can later be handled uniformly with the FP case.  */
198 u
.mov
.value
= gen_int_mode (value_in
, elt_mode_in
);
/* Record the caller-supplied shift modifier and amount; (LSL, 0)
   means no shift is needed.  */
199 u
.mov
.modifier
= modifier_in
;
200 u
.mov
.shift
= shift_in
;
203 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
204 and where element I is equal to BASE_IN + I * STEP_IN. */
205 inline simd_immediate_info
206 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx base_in
, rtx step_in
)
207 : elt_mode (elt_mode_in
), insn (INDEX
)
/* An INDEX immediate describes a linear series: store the value of
   the first element and the per-element increment.  */
209 u
.index
.base
= base_in
;
210 u
.index
.step
= step_in
;
213 /* Construct a predicate that controls elements of mode ELT_MODE_IN
214 and has PTRUE pattern PATTERN_IN. */
215 inline simd_immediate_info
216 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
217 aarch64_svpattern pattern_in
)
218 : elt_mode (elt_mode_in
), insn (PTRUE
)
/* A PTRUE immediate is fully described by its SVE predicate pattern.  */
220 u
.pattern
= pattern_in
;
225 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
226 class pure_scalable_type_info
229 /* Represents the result of analyzing a type. All values are nonzero,
230 in the possibly forlorn hope that accidental conversions to bool
231 trigger a warning. */
234 /* The type does not have an ABI identity; i.e. it doesn't contain
235 at least one object whose type is a Fundamental Data Type. */
238 /* The type is definitely a Pure Scalable Type. */
241 /* The type is definitely not a Pure Scalable Type. */
244 /* It doesn't matter for PCS purposes whether the type is a Pure
245 Scalable Type or not, since the type will be handled the same
248 Specifically, this means that if the type is a Pure Scalable Type,
249 there aren't enough argument registers to hold it, and so it will
250 need to be passed or returned in memory. If the type isn't a
251 Pure Scalable Type, it's too big to be passed or returned in core
252 or SIMD&FP registers, and so again will need to go in memory. */
256 /* Aggregates of 17 bytes or more are normally passed and returned
257 in memory, so aggregates of that size can safely be analyzed as
258 DOESNT_MATTER. We need to be able to collect enough pieces to
259 represent a PST that is smaller than that. Since predicates are
260 2 bytes in size for -msve-vector-bits=128, that means we need to be
261 able to store at least 8 pieces.
263 We also need to be able to store enough pieces to represent
264 a single vector in each vector argument register and a single
265 predicate in each predicate argument register. This means that
266 we need at least 12 pieces. */
267 static const unsigned int MAX_PIECES
= NUM_FP_ARG_REGS
+ NUM_PR_ARG_REGS
;
268 static_assert (MAX_PIECES
>= 8, "Need to store at least 8 predicates");
270 /* Describes one piece of a PST. Each piece is one of:
272 - a single Scalable Vector Type (SVT)
273 - a single Scalable Predicate Type (SPT)
274 - a PST containing 2, 3 or 4 SVTs, with no padding
276 It either represents a single built-in type or a PST formed from
277 multiple homogeneous built-in types. */
280 rtx
get_rtx (unsigned int, unsigned int) const;
282 /* The number of vector and predicate registers that the piece
283 occupies. One of the two is always zero. */
287 /* The mode of the registers described above. */
290 /* If this piece is formed from multiple homogeneous built-in types,
291 this is the mode of the built-in types, otherwise it is MODE. */
292 machine_mode orig_mode
;
294 /* The offset in bytes of the piece from the start of the type. */
298 /* Divides types analyzed as IS_PST into individual pieces. The pieces
299 are in memory order. */
300 auto_vec
<piece
, MAX_PIECES
> pieces
;
302 unsigned int num_zr () const;
303 unsigned int num_pr () const;
305 rtx
get_rtx (machine_mode mode
, unsigned int, unsigned int) const;
307 analysis_result
analyze (const_tree
);
308 bool analyze_registers (const_tree
);
311 analysis_result
analyze_array (const_tree
);
312 analysis_result
analyze_record (const_tree
);
313 void add_piece (const piece
&);
317 /* The current code model. */
318 enum aarch64_code_model aarch64_cmodel
;
320 enum aarch64_tp_reg aarch64_tpidr_register
;
322 /* The number of 64-bit elements in an SVE vector. */
323 poly_uint16 aarch64_sve_vg
;
326 #undef TARGET_HAVE_TLS
327 #define TARGET_HAVE_TLS 1
330 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
331 static bool aarch64_return_in_memory_1 (const_tree
);
332 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
334 machine_mode
*, int *,
336 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
337 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
338 static void aarch64_override_options_after_change (void);
339 static bool aarch64_vector_mode_supported_p (machine_mode
);
340 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
341 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
345 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
346 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
347 aarch64_addr_query_type
);
349 /* The processor for which instructions should be scheduled. */
350 enum aarch64_processor aarch64_tune
= cortexa53
;
352 /* Mask to specify which instruction scheduling options should be used. */
353 uint64_t aarch64_tune_flags
= 0;
355 /* Global flag for PC relative loads. */
356 bool aarch64_pcrelative_literal_loads
;
358 /* Global flag for whether frame pointer is enabled. */
359 bool aarch64_use_frame_pointer
;
361 /* Support for command line parsing of boolean flags in the tuning
363 struct aarch64_flag_desc
369 #define AARCH64_FUSION_PAIR(name, internal_name) \
370 { name, AARCH64_FUSE_##internal_name },
371 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
373 { "none", AARCH64_FUSE_NOTHING
},
374 #include "aarch64-fusion-pairs.def"
375 { "all", AARCH64_FUSE_ALL
},
376 { NULL
, AARCH64_FUSE_NOTHING
}
379 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
380 { name, AARCH64_EXTRA_TUNE_##internal_name },
381 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
383 { "none", AARCH64_EXTRA_TUNE_NONE
},
384 #include "aarch64-tuning-flags.def"
385 { "all", AARCH64_EXTRA_TUNE_ALL
},
386 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
389 /* Tuning parameters. */
390 #include "tuning_models/generic.h"
391 #include "tuning_models/generic_armv8_a.h"
392 #include "tuning_models/generic_armv9_a.h"
393 #include "tuning_models/cortexa35.h"
394 #include "tuning_models/cortexa53.h"
395 #include "tuning_models/cortexa57.h"
396 #include "tuning_models/cortexa72.h"
397 #include "tuning_models/cortexa73.h"
398 #include "tuning_models/exynosm1.h"
399 #include "tuning_models/thunderxt88.h"
400 #include "tuning_models/thunderx.h"
401 #include "tuning_models/tsv110.h"
402 #include "tuning_models/xgene1.h"
403 #include "tuning_models/emag.h"
404 #include "tuning_models/qdf24xx.h"
405 #include "tuning_models/saphira.h"
406 #include "tuning_models/thunderx2t99.h"
407 #include "tuning_models/thunderx3t110.h"
408 #include "tuning_models/neoversen1.h"
409 #include "tuning_models/ampere1.h"
410 #include "tuning_models/ampere1a.h"
411 #include "tuning_models/ampere1b.h"
412 #include "tuning_models/neoversev1.h"
413 #include "tuning_models/neoverse512tvb.h"
414 #include "tuning_models/neoversen2.h"
415 #include "tuning_models/neoversev2.h"
416 #include "tuning_models/a64fx.h"
418 /* Support for fine-grained override of the tuning structures. */
419 struct aarch64_tuning_override_function
422 void (*parse_override
)(const char*, struct tune_params
*);
425 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
426 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
427 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
429 static const struct aarch64_tuning_override_function
430 aarch64_tuning_override_functions
[] =
432 { "fuse", aarch64_parse_fuse_string
},
433 { "tune", aarch64_parse_tune_string
},
434 { "sve_width", aarch64_parse_sve_width_string
},
438 /* A processor implementing AArch64. */
442 aarch64_processor ident
;
443 aarch64_processor sched_core
;
445 aarch64_feature_flags flags
;
446 const tune_params
*tune
;
449 /* Architectures implementing AArch64. */
450 static CONSTEXPR
const processor all_architectures
[] =
452 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
453 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
454 feature_deps::ARCH_IDENT ().enable, NULL},
455 #include "aarch64-arches.def"
456 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, NULL
}
459 /* Processor cores implementing AArch64. */
460 static const struct processor all_cores
[] =
462 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
463 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
464 feature_deps::cpu_##IDENT, &COSTS##_tunings},
465 #include "aarch64-cores.def"
466 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, NULL
}
468 /* Internal representation of system registers. */
471 /* Stringified sysreg encoding values, represented as
472 s<sn>_<op1>_c<cn>_c<cm>_<op2>. */
473 const char *encoding
;
474 /* Flags affecting sysreg usage, such as read/write-only. */
476 /* Architectural features implied by sysreg. */
477 aarch64_feature_flags arch_reqs
;
480 /* An aarch64_feature_set initializer for a single feature,
481 AARCH64_FEATURE_<FEAT>. */
482 #define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
484 /* Used by AARCH64_FEATURES. */
485 #define AARCH64_OR_FEATURES_1(X, F1) \
487 #define AARCH64_OR_FEATURES_2(X, F1, F2) \
488 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
489 #define AARCH64_OR_FEATURES_3(X, F1, ...) \
490 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
492 /* An aarch64_feature_set initializer for the N features listed in "...". */
493 #define AARCH64_FEATURES(N, ...) \
494 AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
496 #define AARCH64_NO_FEATURES 0
498 /* Flags associated with the properties of system registers. It mainly serves
499 to mark particular registers as read or write only. */
500 #define F_DEPRECATED (1 << 1)
501 #define F_REG_READ (1 << 2)
502 #define F_REG_WRITE (1 << 3)
503 #define F_ARCHEXT (1 << 4)
504 /* Flag indicating register name is alias for another system register. */
505 #define F_REG_ALIAS (1 << 5)
506 /* Flag indicating registers which may be implemented with 128-bits. */
507 #define F_REG_128 (1 << 6)
509 /* Database of system registers, their encodings and architectural
511 const sysreg_t aarch64_sysregs
[] =
513 #define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
514 #define SYSREG(NAME, ENC, FLAGS, ARCH) \
515 { NAME, ENC, FLAGS, ARCH },
516 #include "aarch64-sys-regs.def"
520 #undef AARCH64_NO_FEATURES
522 using sysreg_map_t
= hash_map
<nofree_string_hash
, const sysreg_t
*>;
523 static sysreg_map_t
*sysreg_map
= nullptr;
525 /* Map system register names to their hardware metadata: encoding,
526 feature flags and architectural feature requirements, all of which
527 are encoded in a sysreg_t struct. */
529 aarch64_register_sysreg (const char *name
, const sysreg_t
*metadata
)
/* hash_map::put returns true if an entry for NAME already existed.
   A duplicate registration would indicate a bug in the register
   database, so assert (checking builds only) that this is a new key.  */
531 bool dup
= sysreg_map
->put (name
, metadata
);
532 gcc_checking_assert (!dup
);
535 /* Lazily initialize hash table for system register validation,
536 checking the validity of supplied register name and returning
537 register's associated metadata. */
539 aarch64_init_sysregs (void)
/* This must run at most once; callers perform the lazy-initialization
   check before invoking it.  */
541 gcc_assert (!sysreg_map
);
542 sysreg_map
= new sysreg_map_t
;
/* Populate the map from the static aarch64_sysregs table, keyed by
   register name.  */
545 for (unsigned i
= 0; i
< ARRAY_SIZE (aarch64_sysregs
); i
++)
547 const sysreg_t
*reg
= aarch64_sysregs
+ i
;
548 aarch64_register_sysreg (reg
->name
, reg
);
552 /* No direct access to the sysreg hash-map should be made. Doing so
553 risks trying to access an uninitialized hash-map and dereferencing the
554 returned double pointer without due care risks dereferencing a
557 aarch64_lookup_sysreg_map (const char *regname
)
/* Build the name->metadata map on first use.  */
560 aarch64_init_sysregs ();
/* hash_map::get returns a pointer to the stored value, or NULL if
   REGNAME is not a known system register.  */
562 const sysreg_t
**sysreg_entry
= sysreg_map
->get (regname
);
563 if (sysreg_entry
!= NULL
)
564 return *sysreg_entry
;
568 /* The current tuning set. */
569 struct tune_params aarch64_tune_params
= generic_tunings
;
571 /* If NAME is the name of an arm:: attribute that describes shared state,
572 return its associated AARCH64_STATE_* flags, otherwise return 0. */
574 aarch64_attribute_shared_state_flags (const char *name
)
/* The mapping is:
     "in"        -> SHARED | IN
     "inout"     -> SHARED | IN | OUT
     "out"       -> SHARED | OUT
     "preserves" -> SHARED  */
576 if (strcmp (name
, "in") == 0)
577 return AARCH64_STATE_SHARED
| AARCH64_STATE_IN
;
578 if (strcmp (name
, "inout") == 0)
579 return AARCH64_STATE_SHARED
| AARCH64_STATE_IN
| AARCH64_STATE_OUT
;
580 if (strcmp (name
, "out") == 0)
581 return AARCH64_STATE_SHARED
| AARCH64_STATE_OUT
;
582 if (strcmp (name
, "preserves") == 0)
583 return AARCH64_STATE_SHARED
;
587 /* See whether attribute list ATTRS has any sharing information
588 for state STATE_NAME. Return the associated state flags if so,
589 otherwise return 0. */
591 aarch64_lookup_shared_state_flags (tree attrs
, const char *state_name
)
593 for (tree attr
= attrs
; attr
; attr
= TREE_CHAIN (attr
))
595 if (!cxx11_attribute_p (attr
))
598 auto ns
= IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr
)));
599 if (strcmp (ns
, "arm") != 0)
602 auto attr_name
= IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr
)));
603 auto flags
= aarch64_attribute_shared_state_flags (attr_name
);
607 for (tree arg
= TREE_VALUE (attr
); arg
; arg
= TREE_CHAIN (arg
))
609 tree value
= TREE_VALUE (arg
);
610 if (TREE_CODE (value
) == STRING_CST
611 && strcmp (TREE_STRING_POINTER (value
), state_name
) == 0)
618 /* Return true if DECL creates a new scope for state STATE_STRING. */
620 aarch64_fndecl_has_new_state (const_tree decl
, const char *state_name
)
/* Look for an arm::new attribute on DECL and scan its string
   arguments for one equal to STATE_NAME.  */
622 if (tree attr
= lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl
)))
623 for (tree arg
= TREE_VALUE (attr
); arg
; arg
= TREE_CHAIN (arg
))
625 tree value
= TREE_VALUE (arg
);
626 if (TREE_CODE (value
) == STRING_CST
627 && strcmp (TREE_STRING_POINTER (value
), state_name
) == 0)
633 /* Return true if attribute argument VALUE is a recognized state string,
634 otherwise report an error. NAME is the name of the attribute to which
635 VALUE is being passed. */
637 aarch64_check_state_string (tree name
, tree value
)
639 if (TREE_CODE (value
) != STRING_CST
)
641 error ("the arguments to %qE must be constant strings", name
);
/* Only the SME states "za" and "zt0" are recognized here; anything
   else is diagnosed as an unknown state string.  */
645 const char *state_name
= TREE_STRING_POINTER (value
);
646 if (strcmp (state_name
, "za") != 0
647 && strcmp (state_name
, "zt0") != 0)
649 error ("unrecognized state string %qs", state_name
);
656 /* qsort callback to compare two STRING_CSTs. */
658 cmp_string_csts (const void *a
, const void *b
)
/* A and B point at const_tree elements of the vector being sorted;
   order them by the contents of the string constants.  */
660 return strcmp (TREE_STRING_POINTER (*(const_tree
const *) a
),
661 TREE_STRING_POINTER (*(const_tree
const *) b
));
664 /* Canonicalize a list of state strings. ARGS contains the arguments to
665 a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
666 of the same type. If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
667 arguments and drop the new attribute. Otherwise, the new attribute must
668 be kept and ARGS must include the information in OLD_ATTR.
670 In both cases, the new arguments must be a sorted list of state strings
671 with duplicates removed.
673 Return true if new attribute should be kept, false if it should be
676 aarch64_merge_string_arguments (tree args
, tree old_attr
,
677 bool can_merge_in_place
)
679 /* Get a sorted list of all state strings (including duplicates). */
680 auto add_args
= [](vec
<tree
> &strings
, const_tree args
)
682 for (const_tree arg
= args
; arg
; arg
= TREE_CHAIN (arg
))
683 if (TREE_CODE (TREE_VALUE (arg
)) == STRING_CST
)
684 strings
.safe_push (TREE_VALUE (arg
));
686 auto_vec
<tree
, 16> strings
;
687 add_args (strings
, args
);
689 add_args (strings
, TREE_VALUE (old_attr
));
690 strings
.qsort (cmp_string_csts
);
692 /* The list can be empty if there was no previous attribute and if all
693 the new arguments are erroneous. Drop the attribute in that case. */
694 if (strings
.is_empty ())
697 /* Destructively modify one of the argument lists, removing duplicates
699 bool use_old_attr
= old_attr
&& can_merge_in_place
;
700 tree
*end
= use_old_attr
? &TREE_VALUE (old_attr
) : &args
;
701 tree prev
= NULL_TREE
;
702 for (tree arg
: strings
)
704 if (prev
&& simple_cst_equal (arg
, prev
))
708 *end
= tree_cons (NULL_TREE
, arg
, NULL_TREE
);
710 TREE_VALUE (*end
) = arg
;
711 end
= &TREE_CHAIN (*end
);
714 return !use_old_attr
;
717 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
720 handle_aarch64_vector_pcs_attribute (tree
*node
, tree name
, tree
,
721 int, bool *no_add_attrs
)
723 /* Since we set fn_type_req to true, the caller should have checked
725 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node
));
726 switch ((arm_pcs
) fntype_abi (*node
).id ())
728 case ARM_PCS_AAPCS64
:
733 error ("the %qE attribute cannot be applied to an SVE function type",
735 *no_add_attrs
= true;
738 case ARM_PCS_TLSDESC
:
739 case ARM_PCS_UNKNOWN
:
745 /* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
746 otherwise report an error. */
748 aarch64_check_arm_new_against_type (tree args
, tree decl
)
750 tree type_attrs
= TYPE_ATTRIBUTES (TREE_TYPE (decl
));
751 for (tree arg
= args
; arg
; arg
= TREE_CHAIN (arg
))
753 tree value
= TREE_VALUE (arg
);
754 if (TREE_CODE (value
) == STRING_CST
)
756 const char *state_name
= TREE_STRING_POINTER (value
);
757 if (aarch64_lookup_shared_state_flags (type_attrs
, state_name
))
759 error_at (DECL_SOURCE_LOCATION (decl
),
760 "cannot create a new %qs scope since %qs is shared"
761 " with callers", state_name
, state_name
);
769 /* Callback for arm::new attributes. */
771 handle_arm_new (tree
*node
, tree name
, tree args
, int, bool *no_add_attrs
)
774 if (TREE_CODE (decl
) != FUNCTION_DECL
)
776 error ("%qE attribute applies only to function definitions", name
);
777 *no_add_attrs
= true;
780 if (TREE_TYPE (decl
) == error_mark_node
)
782 *no_add_attrs
= true;
786 for (tree arg
= args
; arg
; arg
= TREE_CHAIN (arg
))
787 aarch64_check_state_string (name
, TREE_VALUE (arg
));
789 if (!aarch64_check_arm_new_against_type (args
, decl
))
791 *no_add_attrs
= true;
795 /* If there is an old attribute, we should try to update it in-place,
796 so that there is only one (definitive) arm::new attribute on the decl. */
797 tree old_attr
= lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl
));
798 if (!aarch64_merge_string_arguments (args
, old_attr
, true))
799 *no_add_attrs
= true;
804 /* Callback for arm::{in,out,inout,preserves} attributes. */
806 handle_arm_shared (tree
*node
, tree name
, tree args
,
807 int, bool *no_add_attrs
)
810 tree old_attrs
= TYPE_ATTRIBUTES (type
);
811 auto flags
= aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name
));
812 for (tree arg
= args
; arg
; arg
= TREE_CHAIN (arg
))
814 tree value
= TREE_VALUE (arg
);
815 if (aarch64_check_state_string (name
, value
))
817 const char *state_name
= TREE_STRING_POINTER (value
);
818 auto old_flags
= aarch64_lookup_shared_state_flags (old_attrs
,
820 if (old_flags
&& old_flags
!= flags
)
822 error ("inconsistent attributes for state %qs", state_name
);
823 *no_add_attrs
= true;
829 /* We can't update an old attribute in-place, since types are shared.
830 Instead make sure that this new attribute contains all the
831 information, so that the old attribute becomes redundant. */
832 tree old_attr
= lookup_attribute ("arm", IDENTIFIER_POINTER (name
),
834 if (!aarch64_merge_string_arguments (args
, old_attr
, false))
835 *no_add_attrs
= true;
840 /* Mutually-exclusive function type attributes for controlling PSTATE.SM. */
841 static const struct attribute_spec::exclusions attr_streaming_exclusions
[] =
843 /* Attribute name exclusion applies to:
844 function, type, variable */
845 { "streaming", false, true, false },
846 { "streaming_compatible", false, true, false },
847 { NULL
, false, false, false }
850 /* Table of machine attributes. */
851 static const attribute_spec aarch64_gnu_attributes
[] =
853 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
854 affects_type_identity, handler, exclude } */
855 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
856 handle_aarch64_vector_pcs_attribute
, NULL
},
857 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
858 aarch64_sve::handle_arm_sve_vector_bits_attribute
,
860 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL
, NULL
},
861 { "SVE type", 3, 3, false, true, false, true, NULL
, NULL
},
862 { "SVE sizeless type", 0, 0, false, true, false, true, NULL
, NULL
},
863 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
864 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute
, NULL
},
865 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute
, NULL
},
867 #ifdef SUBTARGET_ATTRIBUTE_TABLE
868 SUBTARGET_ATTRIBUTE_TABLE
872 static const scoped_attribute_specs aarch64_gnu_attribute_table
=
874 "gnu", { aarch64_gnu_attributes
}
877 static const attribute_spec aarch64_arm_attributes
[] =
879 { "streaming", 0, 0, false, true, true, true,
880 NULL
, attr_streaming_exclusions
},
881 { "streaming_compatible", 0, 0, false, true, true, true,
882 NULL
, attr_streaming_exclusions
},
883 { "locally_streaming", 0, 0, true, false, false, false, NULL
, NULL
},
884 { "new", 1, -1, true, false, false, false,
885 handle_arm_new
, NULL
},
886 { "preserves", 1, -1, false, true, true, true,
887 handle_arm_shared
, NULL
},
888 { "in", 1, -1, false, true, true, true,
889 handle_arm_shared
, NULL
},
890 { "out", 1, -1, false, true, true, true,
891 handle_arm_shared
, NULL
},
892 { "inout", 1, -1, false, true, true, true,
893 handle_arm_shared
, NULL
}
896 static const scoped_attribute_specs aarch64_arm_attribute_table
=
898 "arm", { aarch64_arm_attributes
}
901 static const scoped_attribute_specs
*const aarch64_attribute_table
[] =
903 &aarch64_gnu_attribute_table
,
904 &aarch64_arm_attribute_table
907 typedef enum aarch64_cond_code
909 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
910 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
911 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
915 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
918 /* The condition codes of the processor, and the inverse function. */
919 static const char * const aarch64_condition_codes
[] =
921 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
922 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
925 /* The preferred condition codes for SVE conditions. */
926 static const char *const aarch64_sve_condition_codes
[] =
928 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
929 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
932 /* Return the assembly token for svpattern value VALUE. */
935 svpattern_token (enum aarch64_svpattern pattern
)
939 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
940 AARCH64_FOR_SVPATTERN (CASE
)
942 case AARCH64_NUM_SVPATTERNS
:
948 /* Return the location of a piece that is known to be passed or returned
949 in registers. FIRST_ZR is the first unused vector argument register
950 and FIRST_PR is the first unused predicate argument register. */
953 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr
,
954 unsigned int first_pr
) const
956 gcc_assert (VECTOR_MODE_P (mode
)
957 && first_zr
+ num_zr
<= V0_REGNUM
+ NUM_FP_ARG_REGS
958 && first_pr
+ num_pr
<= P0_REGNUM
+ NUM_PR_ARG_REGS
);
960 if (num_zr
> 0 && num_pr
== 0)
961 return gen_rtx_REG (mode
, first_zr
);
963 if (num_zr
== 0 && num_pr
<= 2)
964 return gen_rtx_REG (mode
, first_pr
);
969 /* Return the total number of vector registers required by the PST. */
972 pure_scalable_type_info::num_zr () const
/* Sum the per-piece vector-register counts; pieces that describe
   predicates contribute zero here.  */
974 unsigned int res
= 0;
975 for (unsigned int i
= 0; i
< pieces
.length (); ++i
)
976 res
+= pieces
[i
].num_zr
;
980 /* Return the total number of predicate registers required by the PST. */
983 pure_scalable_type_info::num_pr () const
/* Likewise, but summing the predicate-register counts; vector pieces
   contribute zero.  */
985 unsigned int res
= 0;
986 for (unsigned int i
= 0; i
< pieces
.length (); ++i
)
987 res
+= pieces
[i
].num_pr
;
991 /* Return the location of a PST that is known to be passed or returned
992 in registers. FIRST_ZR is the first unused vector argument register
993 and FIRST_PR is the first unused predicate argument register. */
996 pure_scalable_type_info::get_rtx (machine_mode mode
,
997 unsigned int first_zr
,
998 unsigned int first_pr
) const
1000 /* Try to return a single REG if possible. This leads to better
1001 code generation; it isn't required for correctness. */
1002 if (mode
== pieces
[0].mode
)
1004 gcc_assert (pieces
.length () == 1);
1005 return pieces
[0].get_rtx (first_zr
, first_pr
);
1008 /* Build up a PARALLEL that contains the individual pieces. */
1009 rtvec rtxes
= rtvec_alloc (pieces
.length ());
1010 for (unsigned int i
= 0; i
< pieces
.length (); ++i
)
1012 rtx reg
= pieces
[i
].get_rtx (first_zr
, first_pr
);
1013 rtx offset
= gen_int_mode (pieces
[i
].offset
, Pmode
);
1014 RTVEC_ELT (rtxes
, i
) = gen_rtx_EXPR_LIST (VOIDmode
, reg
, offset
);
1015 first_zr
+= pieces
[i
].num_zr
;
1016 first_pr
+= pieces
[i
].num_pr
;
1018 return gen_rtx_PARALLEL (mode
, rtxes
);
1021 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1024 pure_scalable_type_info::analysis_result
1025 pure_scalable_type_info::analyze (const_tree type
)
1027 /* Prevent accidental reuse. */
1028 gcc_assert (pieces
.is_empty ());
1030 /* No code will be generated for erroneous types, so we won't establish
1032 if (type
== error_mark_node
)
1033 return NO_ABI_IDENTITY
;
1035 /* Zero-sized types disappear in the language->ABI mapping. */
1036 if (TYPE_SIZE (type
) && integer_zerop (TYPE_SIZE (type
)))
1037 return NO_ABI_IDENTITY
;
1039 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1041 if (aarch64_sve::builtin_type_p (type
, &p
.num_zr
, &p
.num_pr
))
1043 machine_mode mode
= TYPE_MODE_RAW (type
);
1044 gcc_assert (VECTOR_MODE_P (mode
)
1045 && (!TARGET_SVE
|| aarch64_sve_mode_p (mode
)));
1047 p
.mode
= p
.orig_mode
= mode
;
1052 /* Check for user-defined PSTs. */
1053 if (TREE_CODE (type
) == ARRAY_TYPE
)
1054 return analyze_array (type
);
1055 if (TREE_CODE (type
) == RECORD_TYPE
)
1056 return analyze_record (type
);
1061 /* Analyze a type that is known not to be passed or returned in memory.
1062 Return true if it has an ABI identity and is a Pure Scalable Type. */
1065 pure_scalable_type_info::analyze_registers (const_tree type
)
1067 analysis_result result
= analyze (type
);
1068 gcc_assert (result
!= DOESNT_MATTER
);
1069 return result
== IS_PST
;
1072 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1074 pure_scalable_type_info::analysis_result
1075 pure_scalable_type_info::analyze_array (const_tree type
)
1077 /* Analyze the element type. */
1078 pure_scalable_type_info element_info
;
1079 analysis_result result
= element_info
.analyze (TREE_TYPE (type
));
1080 if (result
!= IS_PST
)
1083 /* An array of unknown, flexible or variable length will be passed and
1084 returned by reference whatever we do. */
1085 tree nelts_minus_one
= array_type_nelts (type
);
1086 if (!tree_fits_uhwi_p (nelts_minus_one
))
1087 return DOESNT_MATTER
;
1089 /* Likewise if the array is constant-sized but too big to be interesting.
1090 The double checks against MAX_PIECES are to protect against overflow. */
1091 unsigned HOST_WIDE_INT count
= tree_to_uhwi (nelts_minus_one
);
1092 if (count
> MAX_PIECES
)
1093 return DOESNT_MATTER
;
1095 if (count
* element_info
.pieces
.length () > MAX_PIECES
)
1096 return DOESNT_MATTER
;
1098 /* The above checks should have weeded out elements of unknown size. */
1099 poly_uint64 element_bytes
;
1100 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type
)), &element_bytes
))
1103 /* Build up the list of individual vectors and predicates. */
1104 gcc_assert (!element_info
.pieces
.is_empty ());
1105 for (unsigned int i
= 0; i
< count
; ++i
)
1106 for (unsigned int j
= 0; j
< element_info
.pieces
.length (); ++j
)
1108 piece p
= element_info
.pieces
[j
];
1109 p
.offset
+= i
* element_bytes
;
1115 /* Subroutine of analyze for handling RECORD_TYPEs. */
1117 pure_scalable_type_info::analysis_result
1118 pure_scalable_type_info::analyze_record (const_tree type
)
1120 for (tree field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
1122 if (TREE_CODE (field
) != FIELD_DECL
)
1125 /* Zero-sized fields disappear in the language->ABI mapping. */
1126 if (DECL_SIZE (field
) && integer_zerop (DECL_SIZE (field
)))
1129 /* All fields with an ABI identity must be PSTs for the record as
1130 a whole to be a PST. If any individual field is too big to be
1131 interesting then the record is too. */
1132 pure_scalable_type_info field_info
;
1133 analysis_result subresult
= field_info
.analyze (TREE_TYPE (field
));
1134 if (subresult
== NO_ABI_IDENTITY
)
1136 if (subresult
!= IS_PST
)
1139 /* Since all previous fields are PSTs, we ought to be able to track
1140 the field offset using poly_ints. */
1141 tree bitpos
= bit_position (field
);
1142 gcc_assert (poly_int_tree_p (bitpos
));
1144 /* For the same reason, it shouldn't be possible to create a PST field
1145 whose offset isn't byte-aligned. */
1146 poly_widest_int wide_bytepos
= exact_div (wi::to_poly_widest (bitpos
),
1149 /* Punt if the record is too big to be interesting. */
1150 poly_uint64 bytepos
;
1151 if (!wide_bytepos
.to_uhwi (&bytepos
)
1152 || pieces
.length () + field_info
.pieces
.length () > MAX_PIECES
)
1153 return DOESNT_MATTER
;
1155 /* Add the individual vectors and predicates in the field to the
1157 gcc_assert (!field_info
.pieces
.is_empty ());
1158 for (unsigned int i
= 0; i
< field_info
.pieces
.length (); ++i
)
1160 piece p
= field_info
.pieces
[i
];
1161 p
.offset
+= bytepos
;
1165 /* Empty structures disappear in the language->ABI mapping. */
1166 return pieces
.is_empty () ? NO_ABI_IDENTITY
: IS_PST
;
1169 /* Add P to the list of pieces in the type. */
1172 pure_scalable_type_info::add_piece (const piece
&p
)
1174 /* Try to fold the new piece into the previous one to form a
1175 single-mode PST. For example, if we see three consecutive vectors
1176 of the same mode, we can represent them using the corresponding
1179 This is purely an optimization. */
1180 if (!pieces
.is_empty ())
1182 piece
&prev
= pieces
.last ();
1183 gcc_assert (VECTOR_MODE_P (p
.mode
) && VECTOR_MODE_P (prev
.mode
));
1184 unsigned int nelems1
, nelems2
;
1185 if (prev
.orig_mode
== p
.orig_mode
1186 && GET_MODE_CLASS (p
.orig_mode
) != MODE_VECTOR_BOOL
1187 && known_eq (prev
.offset
+ GET_MODE_SIZE (prev
.mode
), p
.offset
)
1188 && constant_multiple_p (GET_MODE_NUNITS (prev
.mode
),
1189 GET_MODE_NUNITS (p
.orig_mode
), &nelems1
)
1190 && constant_multiple_p (GET_MODE_NUNITS (p
.mode
),
1191 GET_MODE_NUNITS (p
.orig_mode
), &nelems2
)
1192 && targetm
.array_mode (p
.orig_mode
,
1193 nelems1
+ nelems2
).exists (&prev
.mode
))
1195 prev
.num_zr
+= p
.num_zr
;
1196 prev
.num_pr
+= p
.num_pr
;
1200 pieces
.quick_push (p
);
1203 /* Return true if at least one possible value of type TYPE includes at
1204 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1206 This is a relatively expensive test for some types, so it should
1207 generally be made as late as possible. */
1210 aarch64_some_values_include_pst_objects_p (const_tree type
)
1212 if (TYPE_SIZE (type
) && integer_zerop (TYPE_SIZE (type
)))
1215 if (aarch64_sve::builtin_type_p (type
))
1218 if (TREE_CODE (type
) == ARRAY_TYPE
|| TREE_CODE (type
) == COMPLEX_TYPE
)
1219 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type
));
1221 if (RECORD_OR_UNION_TYPE_P (type
))
1222 for (tree field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
1223 if (TREE_CODE (field
) == FIELD_DECL
1224 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field
)))
1230 /* Return the descriptor of the SIMD ABI. */
1232 static const predefined_function_abi
&
1233 aarch64_simd_abi (void)
1235 predefined_function_abi
&simd_abi
= function_abis
[ARM_PCS_SIMD
];
1236 if (!simd_abi
.initialized_p ())
1238 HARD_REG_SET full_reg_clobbers
1239 = default_function_abi
.full_reg_clobbers ();
1240 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1241 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1242 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1243 simd_abi
.initialize (ARM_PCS_SIMD
, full_reg_clobbers
);
1248 /* Return the descriptor of the SVE PCS. */
1250 static const predefined_function_abi
&
1251 aarch64_sve_abi (void)
1253 predefined_function_abi
&sve_abi
= function_abis
[ARM_PCS_SVE
];
1254 if (!sve_abi
.initialized_p ())
1256 HARD_REG_SET full_reg_clobbers
1257 = default_function_abi
.full_reg_clobbers ();
1258 for (int regno
= V8_REGNUM
; regno
<= V23_REGNUM
; ++regno
)
1259 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1260 for (int regno
= P4_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
1261 CLEAR_HARD_REG_BIT (full_reg_clobbers
, regno
);
1262 sve_abi
.initialize (ARM_PCS_SVE
, full_reg_clobbers
);
1267 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
1268 wraps, otherwise return X itself. */
1274 if (GET_CODE (search
) == CONST
)
1275 search
= XEXP (search
, 0);
1276 if (GET_CODE (search
) == UNSPEC
&& XINT (search
, 1) == UNSPEC_SALT_ADDR
)
1277 x
= XVECEXP (search
, 0, 0);
1281 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
1285 strip_offset_and_salt (rtx addr
, poly_int64
*offset
)
1287 return strip_salt (strip_offset (addr
, offset
));
1290 /* Generate code to enable conditional branches in functions over 1 MiB. */
1292 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1293 const char * branch_format
)
1295 rtx_code_label
* tmp_label
= gen_label_rtx ();
1296 char label_buf
[256];
1298 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1299 CODE_LABEL_NUMBER (tmp_label
));
1300 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1301 rtx dest_label
= operands
[pos_label
];
1302 operands
[pos_label
] = tmp_label
;
1304 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1305 output_asm_insn (buffer
, operands
);
1307 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1308 operands
[pos_label
] = dest_label
;
1309 output_asm_insn (buffer
, operands
);
1314 aarch64_err_no_fpadvsimd (machine_mode mode
)
1316 if (TARGET_GENERAL_REGS_ONLY
)
1317 if (FLOAT_MODE_P (mode
))
1318 error ("%qs is incompatible with the use of floating-point types",
1319 "-mgeneral-regs-only");
1321 error ("%qs is incompatible with the use of vector types",
1322 "-mgeneral-regs-only");
1324 if (FLOAT_MODE_P (mode
))
1325 error ("%qs feature modifier is incompatible with the use of"
1326 " floating-point types", "+nofp");
1328 error ("%qs feature modifier is incompatible with the use of"
1329 " vector types", "+nofp");
1332 /* Report when we try to do something that requires SVE when SVE is disabled.
1333 This is an error of last resort and isn't very high-quality. It usually
1334 involves attempts to measure the vector length in some way. */
1336 aarch64_report_sve_required (void)
1338 static bool reported_p
= false;
1340 /* Avoid reporting a slew of messages for a single oversight. */
1344 error ("this operation requires the SVE ISA extension");
1345 inform (input_location
, "you can enable SVE using the command-line"
1346 " option %<-march%>, or by using the %<target%>"
1347 " attribute or pragma");
1351 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1354 pr_or_ffr_regnum_p (unsigned int regno
)
1356 return PR_REGNUM_P (regno
) || regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
;
1359 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1360 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1361 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1362 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1363 and GENERAL_REGS is lower than the memory cost (in this case the best class
1364 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1365 cost results in bad allocations with many redundant int<->FP moves which
1366 are expensive on various cores.
1367 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1368 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1369 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1370 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1371 The result of this is that it is no longer inefficient to have a higher
1372 memory move cost than the register move cost.
1376 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1377 reg_class_t best_class
)
1381 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1382 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1383 return allocno_class
;
1385 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1386 || !reg_class_subset_p (FP_REGS
, best_class
))
1389 mode
= PSEUDO_REGNO_MODE (regno
);
1390 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1394 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1396 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1397 return aarch64_tune_params
.min_div_recip_mul_sf
;
1398 return aarch64_tune_params
.min_div_recip_mul_df
;
1401 /* Return the reassociation width of treeop OPC with mode MODE. */
1403 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1405 if (VECTOR_MODE_P (mode
))
1406 return aarch64_tune_params
.vec_reassoc_width
;
1407 if (INTEGRAL_MODE_P (mode
))
1408 return aarch64_tune_params
.int_reassoc_width
;
1409 /* Reassociation reduces the number of FMAs which may result in worse
1410 performance. Use a per-CPU setting for FMA reassociation which allows
1411 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
1412 CPUs with many FP pipes to enable reassociation.
1413 Since the reassociation pass doesn't understand FMA at all, assume
1414 that any FP addition might turn into FMA. */
1415 if (FLOAT_MODE_P (mode
))
1416 return opc
== PLUS_EXPR
? aarch64_tune_params
.fma_reassoc_width
1417 : aarch64_tune_params
.fp_reassoc_width
;
1421 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1423 aarch64_debugger_regno (unsigned regno
)
1425 if (GP_REGNUM_P (regno
))
1426 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1427 else if (regno
== SP_REGNUM
)
1428 return AARCH64_DWARF_SP
;
1429 else if (FP_REGNUM_P (regno
))
1430 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1431 else if (PR_REGNUM_P (regno
))
1432 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1433 else if (regno
== VG_REGNUM
)
1434 return AARCH64_DWARF_VG
;
1436 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1437 equivalent DWARF register. */
1438 return DWARF_FRAME_REGISTERS
;
1441 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
1443 aarch64_dwarf_frame_reg_mode (int regno
)
1445 /* Predicate registers are call-clobbered in the EH ABI (which is
1446 ARM_PCS_AAPCS64), so they should not be described by CFI.
1447 Their size changes as VL changes, so any values computed by
1448 __builtin_init_dwarf_reg_size_table might not be valid for
1450 if (PR_REGNUM_P (regno
))
1452 return default_dwarf_frame_reg_mode (regno
);
1455 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1456 integer, otherwise return X unmodified. */
1458 aarch64_bit_representation (rtx x
)
1460 if (CONST_DOUBLE_P (x
))
1461 x
= gen_lowpart (int_mode_for_mode (GET_MODE (x
)).require (), x
);
1465 /* Return an estimate for the number of quadwords in an SVE vector. This is
1466 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
1468 aarch64_estimated_sve_vq ()
1470 return estimated_poly_value (BITS_PER_SVE_VECTOR
) / 128;
1473 /* Return true if MODE is an SVE predicate mode. */
1475 aarch64_sve_pred_mode_p (machine_mode mode
)
1478 && (mode
== VNx16BImode
1479 || mode
== VNx8BImode
1480 || mode
== VNx4BImode
1481 || mode
== VNx2BImode
));
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Indicates a structure of 2, 3 or 4 vectors or predicates.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1497 /* Return a set of flags describing the vector properties of mode MODE.
1498 If ANY_TARGET_P is false (the default), ignore modes that are not supported
1499 by the current target. Otherwise categorize the modes that can be used
1500 with the set of all targets supported by the port. */
1503 aarch64_classify_vector_mode (machine_mode mode
, bool any_target_p
= false)
/* SVE predicate modes are recognized first, independently of the switch
   below.  */
1505 if (aarch64_sve_pred_mode_p (mode
))
1506 return VEC_SVE_PRED
;
1508 /* Make the decision based on the mode's enum value rather than its
1509 properties, so that we keep the correct classification regardless
1510 of -msve-vector-bits. */
/* NOTE(review): the case labels of the switch on the mode enum are not
   visible in this extract; each comment below introduces a group of
   case labels whose shared return statement follows the group.  */
1513 /* Partial SVE QI vectors. */
1517 /* Partial SVE HI vectors. */
1520 /* Partial SVE SI vector. */
1522 /* Partial SVE HF vectors. */
1525 /* Partial SVE BF vectors. */
1528 /* Partial SVE SF vector. */
/* Partial SVE vectors: data flag plus the partial-size marker.  */
1530 return (TARGET_SVE
|| any_target_p
) ? VEC_SVE_DATA
| VEC_PARTIAL
: 0;
/* Full single SVE data vectors.  */
1540 return (TARGET_SVE
|| any_target_p
) ? VEC_SVE_DATA
: 0;
1542 /* x2 SVE vectors. */
1551 /* x3 SVE vectors. */
1560 /* x4 SVE vectors. */
/* SVE tuple (structure) modes.  */
1569 return (TARGET_SVE
|| any_target_p
) ? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1574 return (TARGET_FLOAT
|| any_target_p
) ? VEC_ADVSIMD
| VEC_STRUCT
: 0;
1576 /* Structures of 64-bit Advanced SIMD vectors. */
/* D-register structures carry VEC_PARTIAL as well.  */
1601 return (TARGET_FLOAT
|| any_target_p
)
1602 ? VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
: 0;
1604 /* Structures of 128-bit Advanced SIMD vectors. */
1629 return (TARGET_FLOAT
|| any_target_p
) ? VEC_ADVSIMD
| VEC_STRUCT
: 0;
1631 /* 64-bit Advanced SIMD vectors. */
1640 /* 128-bit Advanced SIMD vectors. */
1649 return (TARGET_FLOAT
|| any_target_p
) ? VEC_ADVSIMD
: 0;
/* Predicate tuple mode (e.g. a pair of predicates).  */
1652 return TARGET_SVE
? VEC_SVE_PRED
| VEC_STRUCT
: 0;
1659 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1661 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1663 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1664 return (vec_flags
& VEC_ADVSIMD
) && (vec_flags
& VEC_STRUCT
);
1667 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1669 aarch64_advsimd_partial_struct_mode_p (machine_mode mode
)
1671 return (aarch64_classify_vector_mode (mode
)
1672 == (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
));
1675 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1677 aarch64_advsimd_full_struct_mode_p (machine_mode mode
)
1679 return (aarch64_classify_vector_mode (mode
) == (VEC_ADVSIMD
| VEC_STRUCT
));
1682 /* Return true if MODE is any of the data vector modes, including
1685 aarch64_vector_data_mode_p (machine_mode mode
)
1687 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1690 /* Return true if MODE is any form of SVE mode, including predicates,
1691 vectors and structures. */
1693 aarch64_sve_mode_p (machine_mode mode
)
1695 return aarch64_classify_vector_mode (mode
) & VEC_ANY_SVE
;
1698 /* Return true if MODE is an SVE data vector mode; either a single vector
1699 or a structure of vectors. */
1701 aarch64_sve_data_mode_p (machine_mode mode
)
1703 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1706 /* Return the number of defined bytes in one constituent vector of
1707 SVE mode MODE, which has vector flags VEC_FLAGS. */
1709 aarch64_vl_bytes (machine_mode mode
, unsigned int vec_flags
)
1711 if (vec_flags
& VEC_PARTIAL
)
1712 /* A single partial vector. */
1713 return GET_MODE_SIZE (mode
);
1715 if (vec_flags
& VEC_SVE_DATA
)
1716 /* A single vector or a tuple. */
1717 return BYTES_PER_SVE_VECTOR
;
1719 /* A single predicate. */
1720 gcc_assert (vec_flags
& VEC_SVE_PRED
);
1721 return BYTES_PER_SVE_PRED
;
1724 /* If MODE holds an array of vectors, return the number of vectors
1725 in the array, otherwise return 1. */
1728 aarch64_ldn_stn_vectors (machine_mode mode
)
1730 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1731 if (vec_flags
== (VEC_ADVSIMD
| VEC_PARTIAL
| VEC_STRUCT
))
1732 return exact_div (GET_MODE_SIZE (mode
), 8).to_constant ();
1733 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
1734 return exact_div (GET_MODE_SIZE (mode
), 16).to_constant ();
1735 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
1736 return exact_div (GET_MODE_SIZE (mode
),
1737 BYTES_PER_SVE_VECTOR
).to_constant ();
1741 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1742 corresponding vector structure mode. */
1743 static opt_machine_mode
1744 aarch64_advsimd_vector_array_mode (machine_mode mode
,
1745 unsigned HOST_WIDE_INT nelems
)
1747 unsigned int flags
= VEC_ADVSIMD
| VEC_STRUCT
;
1748 if (known_eq (GET_MODE_SIZE (mode
), 8))
1749 flags
|= VEC_PARTIAL
;
1751 machine_mode struct_mode
;
1752 FOR_EACH_MODE_IN_CLASS (struct_mode
, GET_MODE_CLASS (mode
))
1753 if (aarch64_classify_vector_mode (struct_mode
) == flags
1754 && GET_MODE_INNER (struct_mode
) == GET_MODE_INNER (mode
)
1755 && known_eq (GET_MODE_NUNITS (struct_mode
),
1756 GET_MODE_NUNITS (mode
) * nelems
))
1758 return opt_machine_mode ();
1761 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1764 aarch64_sve_data_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
1766 enum mode_class mclass
= (is_a
<scalar_float_mode
> (inner_mode
)
1767 ? MODE_VECTOR_FLOAT
: MODE_VECTOR_INT
);
1769 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1770 if (inner_mode
== GET_MODE_INNER (mode
)
1771 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1772 && aarch64_sve_data_mode_p (mode
))
1774 return opt_machine_mode ();
1777 /* Implement target hook TARGET_ARRAY_MODE. */
1778 static opt_machine_mode
1779 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1781 if (TARGET_SVE
&& GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
1783 /* Use VNx32BI for pairs of predicates, but explicitly reject giving
1784 a mode to other array sizes. Using integer modes requires a round
1785 trip through memory and generates terrible code. */
1788 if (mode
== VNx16BImode
&& nelems
== 2)
1793 auto flags
= aarch64_classify_vector_mode (mode
);
1794 if (flags
== VEC_SVE_DATA
&& IN_RANGE (nelems
, 2, 4))
1795 return aarch64_sve_data_mode (GET_MODE_INNER (mode
),
1796 GET_MODE_NUNITS (mode
) * nelems
);
1798 if (flags
== VEC_ADVSIMD
&& IN_RANGE (nelems
, 2, 4))
1799 return aarch64_advsimd_vector_array_mode (mode
, nelems
);
1801 return opt_machine_mode ();
1804 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1806 aarch64_array_mode_supported_p (machine_mode mode
,
1807 unsigned HOST_WIDE_INT nelems
)
1809 if (TARGET_BASE_SIMD
1810 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1811 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1812 && (nelems
>= 2 && nelems
<= 4))
1818 /* MODE is some form of SVE vector mode. For data modes, return the number
1819 of vector register bits that each element of MODE occupies, such as 64
1820 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1821 in a 64-bit container). For predicate modes, return the number of
1822 data bits controlled by each significant predicate bit. */
1825 aarch64_sve_container_bits (machine_mode mode
)
1827 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1828 poly_uint64 vector_bits
= (vec_flags
& (VEC_PARTIAL
| VEC_SVE_PRED
)
1829 ? BITS_PER_SVE_VECTOR
1830 : GET_MODE_BITSIZE (mode
));
1831 return vector_element_size (vector_bits
, GET_MODE_NUNITS (mode
));
1834 /* Return the SVE predicate mode to use for elements that have
1835 ELEM_NBYTES bytes, if such a mode exists. */
1838 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1842 if (elem_nbytes
== 1)
1844 if (elem_nbytes
== 2)
1846 if (elem_nbytes
== 4)
1848 if (elem_nbytes
== 8)
1851 return opt_machine_mode ();
1854 /* Return the SVE predicate mode that should be used to control
1858 aarch64_sve_pred_mode (machine_mode mode
)
1860 unsigned int bits
= aarch64_sve_container_bits (mode
);
1861 return aarch64_sve_pred_mode (bits
/ BITS_PER_UNIT
).require ();
1864 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1866 static opt_machine_mode
1867 aarch64_get_mask_mode (machine_mode mode
)
1869 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1870 if (vec_flags
& VEC_SVE_DATA
)
1871 return aarch64_sve_pred_mode (mode
);
1873 return default_get_mask_mode (mode
);
1876 /* Return the integer element mode associated with SVE mode MODE. */
1878 static scalar_int_mode
1879 aarch64_sve_element_int_mode (machine_mode mode
)
1881 poly_uint64 vector_bits
= (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
1882 ? BITS_PER_SVE_VECTOR
1883 : GET_MODE_BITSIZE (mode
));
1884 unsigned int elt_bits
= vector_element_size (vector_bits
,
1885 GET_MODE_NUNITS (mode
));
1886 return int_mode_for_size (elt_bits
, 0).require ();
1889 /* Return an integer element mode that contains exactly
1890 aarch64_sve_container_bits (MODE) bits. This is wider than
1891 aarch64_sve_element_int_mode if MODE is a partial vector,
1892 otherwise it's the same. */
1894 static scalar_int_mode
1895 aarch64_sve_container_int_mode (machine_mode mode
)
1897 return int_mode_for_size (aarch64_sve_container_bits (mode
), 0).require ();
1900 /* Return the integer vector mode associated with SVE mode MODE.
1901 Unlike related_int_vector_mode, this can handle the case in which
1902 MODE is a predicate (and thus has a different total size). */
1905 aarch64_sve_int_mode (machine_mode mode
)
1907 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
1908 return aarch64_sve_data_mode (int_mode
, GET_MODE_NUNITS (mode
)).require ();
1911 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1913 static opt_machine_mode
1914 aarch64_vectorize_related_mode (machine_mode vector_mode
,
1915 scalar_mode element_mode
,
1918 unsigned int vec_flags
= aarch64_classify_vector_mode (vector_mode
);
1920 /* If we're operating on SVE vectors, try to return an SVE mode. */
1921 poly_uint64 sve_nunits
;
1922 if ((vec_flags
& VEC_SVE_DATA
)
1923 && multiple_p (BYTES_PER_SVE_VECTOR
,
1924 GET_MODE_SIZE (element_mode
), &sve_nunits
))
1926 machine_mode sve_mode
;
1927 if (maybe_ne (nunits
, 0U))
1929 /* Try to find a full or partial SVE mode with exactly
1931 if (multiple_p (sve_nunits
, nunits
)
1932 && aarch64_sve_data_mode (element_mode
,
1933 nunits
).exists (&sve_mode
))
1938 /* Take the preferred number of units from the number of bytes
1939 that fit in VECTOR_MODE. We always start by "autodetecting"
1940 a full vector mode with preferred_simd_mode, so vectors
1941 chosen here will also be full vector modes. Then
1942 autovectorize_vector_modes tries smaller starting modes
1943 and thus smaller preferred numbers of units. */
1944 sve_nunits
= ordered_min (sve_nunits
, GET_MODE_SIZE (vector_mode
));
1945 if (aarch64_sve_data_mode (element_mode
,
1946 sve_nunits
).exists (&sve_mode
))
1951 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1953 && (vec_flags
& VEC_ADVSIMD
)
1954 && known_eq (nunits
, 0U)
1955 && known_eq (GET_MODE_BITSIZE (vector_mode
), 64U)
1956 && maybe_ge (GET_MODE_BITSIZE (element_mode
)
1957 * GET_MODE_NUNITS (vector_mode
), 128U))
1959 machine_mode res
= aarch64_simd_container_mode (element_mode
, 128);
1960 if (VECTOR_MODE_P (res
))
1964 return default_vectorize_related_mode (vector_mode
, element_mode
, nunits
);
1967 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
1970 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type
)
1972 machine_mode mode
= TYPE_MODE (type
);
1973 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1974 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
1975 bool simd_p
= (vec_flags
& VEC_ADVSIMD
);
1977 return (sve_p
&& TARGET_SVE2
) || (simd_p
&& TARGET_SIMD
);
1980 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1981 prefer to use the first arithmetic operand as the else value if
1982 the else value doesn't matter, since that exactly matches the SVE
1983 destructive merging form. For ternary operations we could either
1984 pick the first operand and use FMAD-like instructions or the last
1985 operand and use FMLA-like instructions; the latter seems more
1989 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1991 return nops
== 3 ? ops
[2] : ops
[0];
1994 /* Implement TARGET_HARD_REGNO_NREGS. */
1997 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1999 /* ??? Logically we should only need to provide a value when
2000 HARD_REGNO_MODE_OK says that the combination is valid,
2001 but at the moment we need to handle all modes. Just ignore
2002 any runtime parts for registers that can't store them. */
2003 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
/* NOTE(review): the switch's case labels are not visible in this extract;
   the first visible arm handles the FP/vector register classes.  */
2004 switch (aarch64_regno_regclass (regno
))
2010 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
/* SVE data modes: one register per constituent vector.  */
2011 if (vec_flags
& VEC_SVE_DATA
)
2012 return exact_div (GET_MODE_SIZE (mode
),
2013 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
/* Advanced SIMD D-register structures: one register per 8 bytes.  */
2014 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
))
2015 return GET_MODE_SIZE (mode
).to_constant () / 8;
2016 return CEIL (lowest_size
, UNITS_PER_VREG
);
/* Predicate registers: a predicate pair needs two registers.  */
2022 return mode
== VNx32BImode
? 2 : 1;
2025 case PR_AND_FFR_REGS
:
/* Default (general registers etc.): X-register-sized chunks.  */
2030 return CEIL (lowest_size
, UNITS_PER_WORD
);
2035 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2038 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
2040 if (mode
== V8DImode
)
2041 return IN_RANGE (regno
, R0_REGNUM
, R23_REGNUM
)
2042 && multiple_p (regno
- R0_REGNUM
, 2);
2044 if (GET_MODE_CLASS (mode
) == MODE_CC
)
2045 return regno
== CC_REGNUM
;
2047 if (regno
== VG_REGNUM
)
2048 /* This must have the same size as _Unwind_Word. */
2049 return mode
== DImode
;
2051 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
2052 if (vec_flags
== VEC_SVE_PRED
)
2053 return pr_or_ffr_regnum_p (regno
);
2055 if (vec_flags
== (VEC_SVE_PRED
| VEC_STRUCT
))
2056 return PR_REGNUM_P (regno
);
2058 if (pr_or_ffr_regnum_p (regno
))
2061 /* These registers are abstract; their modes don't matter. */
2062 if (FAKE_REGNUM_P (regno
))
2065 if (regno
== SP_REGNUM
)
2066 /* The purpose of comparing with ptr_mode is to support the
2067 global register variable associated with the stack pointer
2068 register via the syntax of asm ("wsp") in ILP32. */
2069 return mode
== Pmode
|| mode
== ptr_mode
;
2071 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
2072 return mode
== Pmode
;
2074 if (GP_REGNUM_P (regno
))
2076 if (vec_flags
& (VEC_ANY_SVE
| VEC_STRUCT
))
2078 if (known_le (GET_MODE_SIZE (mode
), 8))
2080 if (known_le (GET_MODE_SIZE (mode
), 16))
2081 return (regno
& 1) == 0;
2083 else if (FP_REGNUM_P (regno
))
2085 if (vec_flags
& VEC_STRUCT
)
2086 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
2088 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
2094 /* Return true if a function with type FNTYPE returns its value in
2095 SVE vector or predicate registers. */
2098 aarch64_returns_value_in_sve_regs_p (const_tree fntype
)
2100 tree return_type
= TREE_TYPE (fntype
);
2102 pure_scalable_type_info pst_info
;
2103 switch (pst_info
.analyze (return_type
))
2105 case pure_scalable_type_info::IS_PST
:
2106 return (pst_info
.num_zr () <= NUM_FP_ARG_REGS
2107 && pst_info
.num_pr () <= NUM_PR_ARG_REGS
);
2109 case pure_scalable_type_info::DOESNT_MATTER
:
2110 gcc_assert (aarch64_return_in_memory_1 (return_type
));
2113 case pure_scalable_type_info::NO_ABI_IDENTITY
:
2114 case pure_scalable_type_info::ISNT_PST
:
2120 /* Return true if a function with type FNTYPE takes arguments in
2121 SVE vector or predicate registers. */
2124 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype
)
2126 CUMULATIVE_ARGS args_so_far_v
;
2127 aarch64_init_cumulative_args (&args_so_far_v
, NULL_TREE
, NULL_RTX
,
2128 NULL_TREE
, 0, true);
2129 cumulative_args_t args_so_far
= pack_cumulative_args (&args_so_far_v
);
2131 for (tree chain
= TYPE_ARG_TYPES (fntype
);
2132 chain
&& chain
!= void_list_node
;
2133 chain
= TREE_CHAIN (chain
))
2135 tree arg_type
= TREE_VALUE (chain
);
2136 if (arg_type
== error_mark_node
)
2139 function_arg_info
arg (arg_type
, /*named=*/true);
2140 apply_pass_by_reference_rules (&args_so_far_v
, arg
);
2141 pure_scalable_type_info pst_info
;
2142 if (pst_info
.analyze_registers (arg
.type
))
2144 unsigned int end_zr
= args_so_far_v
.aapcs_nvrn
+ pst_info
.num_zr ();
2145 unsigned int end_pr
= args_so_far_v
.aapcs_nprn
+ pst_info
.num_pr ();
2146 gcc_assert (end_zr
<= NUM_FP_ARG_REGS
&& end_pr
<= NUM_PR_ARG_REGS
);
2150 targetm
.calls
.function_arg_advance (args_so_far
, arg
);
2155 /* Implement TARGET_FNTYPE_ABI. */
2157 static const predefined_function_abi
&
2158 aarch64_fntype_abi (const_tree fntype
)
2160 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)))
2161 return aarch64_simd_abi ();
2163 if (aarch64_returns_value_in_sve_regs_p (fntype
)
2164 || aarch64_takes_arguments_in_sve_regs_p (fntype
))
2165 return aarch64_sve_abi ();
2167 return default_function_abi
;
2170 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2172 static aarch64_feature_flags
2173 aarch64_fntype_pstate_sm (const_tree fntype
)
2175 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype
)))
2176 return AARCH64_FL_SM_ON
;
2178 if (lookup_attribute ("arm", "streaming_compatible",
2179 TYPE_ATTRIBUTES (fntype
)))
2182 return AARCH64_FL_SM_OFF
;
2185 /* Return state flags that describe whether and how functions of type
2186 FNTYPE share state STATE_NAME with their callers. */
2189 aarch64_fntype_shared_flags (const_tree fntype
, const char *state_name
)
2191 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype
),
2195 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2197 static aarch64_feature_flags
2198 aarch64_fntype_pstate_za (const_tree fntype
)
2200 if (aarch64_fntype_shared_flags (fntype
, "za")
2201 || aarch64_fntype_shared_flags (fntype
, "zt0"))
2202 return AARCH64_FL_ZA_ON
;
2207 /* Return the ISA mode on entry to functions of type FNTYPE. */
2209 static aarch64_feature_flags
2210 aarch64_fntype_isa_mode (const_tree fntype
)
2212 return (aarch64_fntype_pstate_sm (fntype
)
2213 | aarch64_fntype_pstate_za (fntype
));
2216 /* Return true if FNDECL uses streaming mode internally, as an
2217 implementation choice. */
2220 aarch64_fndecl_is_locally_streaming (const_tree fndecl
)
2222 return lookup_attribute ("arm", "locally_streaming",
2223 DECL_ATTRIBUTES (fndecl
));
2226 /* Return the state of PSTATE.SM when compiling the body of
2227 function FNDECL. This might be different from the state of
2228 PSTATE.SM on entry. */
2230 static aarch64_feature_flags
2231 aarch64_fndecl_pstate_sm (const_tree fndecl
)
2233 if (aarch64_fndecl_is_locally_streaming (fndecl
))
2234 return AARCH64_FL_SM_ON
;
2236 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl
));
2239 /* Return true if function FNDECL has state STATE_NAME, either by creating
2240 new state itself or by sharing state with callers. */
2243 aarch64_fndecl_has_state (tree fndecl
, const char *state_name
)
2245 return (aarch64_fndecl_has_new_state (fndecl
, state_name
)
2246 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl
),
2250 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2251 This might be different from the state of PSTATE.ZA on entry. */
2253 static aarch64_feature_flags
2254 aarch64_fndecl_pstate_za (const_tree fndecl
)
2256 if (aarch64_fndecl_has_new_state (fndecl
, "za")
2257 || aarch64_fndecl_has_new_state (fndecl
, "zt0"))
2258 return AARCH64_FL_ZA_ON
;
2260 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl
));
2263 /* Return the ISA mode that should be used to compile the body of
2266 static aarch64_feature_flags
2267 aarch64_fndecl_isa_mode (const_tree fndecl
)
2269 return (aarch64_fndecl_pstate_sm (fndecl
)
2270 | aarch64_fndecl_pstate_za (fndecl
));
2273 /* Return the state of PSTATE.SM on entry to the current function.
2274 This might be different from the state of PSTATE.SM in the function
2277 static aarch64_feature_flags
2278 aarch64_cfun_incoming_pstate_sm ()
2280 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun
->decl
));
2283 /* Return the state of PSTATE.ZA on entry to the current function.
2284 This might be different from the state of PSTATE.ZA in the function
2287 static aarch64_feature_flags
2288 aarch64_cfun_incoming_pstate_za ()
2290 return aarch64_fntype_pstate_za (TREE_TYPE (cfun
->decl
));
2293 /* Return state flags that describe whether and how the current function shares
2294 state STATE_NAME with callers. */
2297 aarch64_cfun_shared_flags (const char *state_name
)
2299 return aarch64_fntype_shared_flags (TREE_TYPE (cfun
->decl
), state_name
);
2302 /* Return true if the current function creates new state of type STATE_NAME
2303 (as opposed to sharing the state with its callers or ignoring the state
2307 aarch64_cfun_has_new_state (const char *state_name
)
2309 return aarch64_fndecl_has_new_state (cfun
->decl
, state_name
);
2312 /* Return true if PSTATE.SM is 1 in the body of the current function,
2313 but is not guaranteed to be 1 on entry. */
2316 aarch64_cfun_enables_pstate_sm ()
2318 return (aarch64_fndecl_is_locally_streaming (cfun
->decl
)
2319 && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON
);
2322 /* Return true if the current function has state STATE_NAME, either by
2323 creating new state itself or by sharing state with callers. */
2326 aarch64_cfun_has_state (const char *state_name
)
2328 return aarch64_fndecl_has_state (cfun
->decl
, state_name
);
2331 /* Return true if a call from the current function to a function with
2332 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2333 the BL instruction. */
2336 aarch64_call_switches_pstate_sm (aarch64_feature_flags callee_mode
)
2338 return (callee_mode
& ~AARCH64_ISA_MODE
& AARCH64_FL_SM_STATE
) != 0;
2341 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2344 aarch64_compatible_vector_types_p (const_tree type1
, const_tree type2
)
2346 return (aarch64_sve::builtin_type_p (type1
)
2347 == aarch64_sve::builtin_type_p (type2
));
2350 /* Return true if we should emit CFI for register REGNO. */
2353 aarch64_emit_cfi_for_reg_p (unsigned int regno
)
2355 return (GP_REGNUM_P (regno
)
2356 || !default_function_abi
.clobbers_full_reg_p (regno
));
2359 /* Return the mode we should use to save and restore register REGNO. */
2362 aarch64_reg_save_mode (unsigned int regno
)
2364 if (GP_REGNUM_P (regno
) || regno
== VG_REGNUM
)
2367 if (FP_REGNUM_P (regno
))
2368 switch (crtl
->abi
->id ())
2370 case ARM_PCS_AAPCS64
:
2371 /* Only the low 64 bits are saved by the base PCS. */
2375 /* The vector PCS saves the low 128 bits (which is the full
2376 register on non-SVE targets). */
2380 /* Use vectors of DImode for registers that need frame
2381 information, so that the first 64 bytes of the save slot
2382 are always the equivalent of what storing D<n> would give. */
2383 if (aarch64_emit_cfi_for_reg_p (regno
))
2386 /* Use vectors of bytes otherwise, so that the layout is
2387 endian-agnostic, and so that we can use LDR and STR for
2388 big-endian targets. */
2391 case ARM_PCS_TLSDESC
:
2392 case ARM_PCS_UNKNOWN
:
2396 if (PR_REGNUM_P (regno
))
2397 /* Save the full predicate register. */
2403 /* Given the ISA mode on entry to a callee and the ABI of the callee,
2404 return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx. */
2407 aarch64_gen_callee_cookie (aarch64_feature_flags isa_mode
, arm_pcs pcs_variant
)
2409 return gen_int_mode ((unsigned int) isa_mode
2410 | (unsigned int) pcs_variant
<< AARCH64_NUM_ISA_MODES
,
2414 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2417 static const predefined_function_abi
&
2418 aarch64_callee_abi (rtx cookie
)
2420 return function_abis
[UINTVAL (cookie
) >> AARCH64_NUM_ISA_MODES
];
2423 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2424 required ISA mode on entry to the callee, which is also the ISA
2425 mode on return from the callee. */
2427 static aarch64_feature_flags
2428 aarch64_callee_isa_mode (rtx cookie
)
2430 return UINTVAL (cookie
) & AARCH64_FL_ISA_MODES
;
2433 /* INSN is a call instruction. Return the CONST_INT stored in its
2434 UNSPEC_CALLEE_ABI rtx. */
2437 aarch64_insn_callee_cookie (const rtx_insn
*insn
)
2439 rtx pat
= PATTERN (insn
);
2440 gcc_assert (GET_CODE (pat
) == PARALLEL
);
2441 rtx unspec
= XVECEXP (pat
, 0, 1);
2442 gcc_assert (GET_CODE (unspec
) == UNSPEC
2443 && XINT (unspec
, 1) == UNSPEC_CALLEE_ABI
);
2444 return XVECEXP (unspec
, 0, 0);
2447 /* Implement TARGET_INSN_CALLEE_ABI. */
2449 const predefined_function_abi
&
2450 aarch64_insn_callee_abi (const rtx_insn
*insn
)
2452 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn
));
2455 /* INSN is a call instruction. Return the required ISA mode on entry to
2456 the callee, which is also the ISA mode on return from the callee. */
2458 static aarch64_feature_flags
2459 aarch64_insn_callee_isa_mode (const rtx_insn
*insn
)
2461 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn
));
2464 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2465 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2466 clobbers the top 64 bits when restoring the bottom 64 bits. */
2469 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id
,
2473 if (FP_REGNUM_P (regno
) && abi_id
!= ARM_PCS_SVE
)
2475 poly_int64 per_register_size
= GET_MODE_SIZE (mode
);
2476 unsigned int nregs
= hard_regno_nregs (regno
, mode
);
2478 per_register_size
= exact_div (per_register_size
, nregs
);
2479 if (abi_id
== ARM_PCS_SIMD
|| abi_id
== ARM_PCS_TLSDESC
)
2480 return maybe_gt (per_register_size
, 16);
2481 return maybe_gt (per_register_size
, 8);
2486 /* Implement REGMODE_NATURAL_SIZE. */
2488 aarch64_regmode_natural_size (machine_mode mode
)
2490 /* The natural size for SVE data modes is one SVE data vector,
2491 and similarly for predicates. We can't independently modify
2492 anything smaller than that. */
2493 /* ??? For now, only do this for variable-width SVE registers.
2494 Doing it for constant-sized registers breaks lower-subreg.cc. */
2495 /* ??? And once that's fixed, we should probably have similar
2496 code for Advanced SIMD. */
2497 if (!aarch64_sve_vg
.is_constant ())
2499 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
2500 if (vec_flags
& VEC_SVE_PRED
)
2501 return BYTES_PER_SVE_PRED
;
2502 if (vec_flags
& VEC_SVE_DATA
)
2503 return BYTES_PER_SVE_VECTOR
;
2505 return UNITS_PER_WORD
;
2508 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2510 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
2513 /* The predicate mode determines which bits are significant and
2514 which are "don't care". Decreasing the number of lanes would
2515 lose data while increasing the number of lanes would make bits
2516 unnecessarily significant. */
2517 if (PR_REGNUM_P (regno
))
2519 if (known_ge (GET_MODE_SIZE (mode
), 4))
2525 /* Return true if I's bits are consecutive ones from the MSB. */
2527 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
2529 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
2532 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2533 that strcpy from constants will be faster. */
2535 static HOST_WIDE_INT
2536 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
2538 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
2539 return MAX (align
, BITS_PER_WORD
);
2543 /* Return true if calls to DECL should be treated as
2544 long-calls (ie called via a register). */
2546 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
2551 /* Return true if calls to symbol-ref SYM should be treated as
2552 long-calls (ie called via a register). */
2554 aarch64_is_long_call_p (rtx sym
)
2556 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
2559 /* Return true if calls to symbol-ref SYM should not go through
2563 aarch64_is_noplt_call_p (rtx sym
)
2565 const_tree decl
= SYMBOL_REF_DECL (sym
);
2570 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
2571 && !targetm
.binds_local_p (decl
))
2577 /* Emit an insn that's a simple single-set. Both the operands must be
2578 known to be valid. */
2579 inline static rtx_insn
*
2580 emit_set_insn (rtx x
, rtx y
)
2582 return emit_insn (gen_rtx_SET (x
, y
));
2585 /* X and Y are two things to compare using CODE. Emit the compare insn and
2586 return the rtx for register 0 in the proper mode. */
2588 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
2590 machine_mode cmp_mode
= GET_MODE (x
);
2591 machine_mode cc_mode
;
2594 if (cmp_mode
== TImode
)
2596 gcc_assert (code
== NE
);
2599 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2601 rtx x_lo
= operand_subword (x
, 0, 0, TImode
);
2602 rtx y_lo
= operand_subword (y
, 0, 0, TImode
);
2603 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x_lo
, y_lo
));
2605 rtx x_hi
= operand_subword (x
, 1, 0, TImode
);
2606 rtx y_hi
= operand_subword (y
, 1, 0, TImode
);
2607 emit_insn (gen_ccmpccdi (cc_reg
, cc_reg
, x_hi
, y_hi
,
2608 gen_rtx_EQ (cc_mode
, cc_reg
, const0_rtx
),
2609 GEN_INT (AARCH64_EQ
)));
2613 cc_mode
= SELECT_CC_MODE (code
, x
, y
);
2614 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2615 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x
, y
));
2620 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2623 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
2624 machine_mode y_mode
)
2626 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
2628 if (CONST_INT_P (y
))
2630 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
2636 machine_mode cc_mode
;
2638 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
2639 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
2640 cc_mode
= CC_SWPmode
;
2641 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2642 emit_set_insn (cc_reg
, t
);
2647 if (!aarch64_plus_operand (y
, y_mode
))
2648 y
= force_reg (y_mode
, y
);
2650 return aarch64_gen_compare_reg (code
, x
, y
);
2653 /* Generate conditional branch to LABEL, comparing X to 0 using CODE.
2654 Return the jump instruction. */
2657 aarch64_gen_compare_zero_and_branch (rtx_code code
, rtx x
,
2658 rtx_code_label
*label
)
2660 if (aarch64_track_speculation
)
2662 /* Emit an explicit compare instruction, so that we can correctly
2663 track the condition codes. */
2664 rtx cc_reg
= aarch64_gen_compare_reg (code
, x
, const0_rtx
);
2665 x
= gen_rtx_fmt_ee (code
, GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
2668 x
= gen_rtx_fmt_ee (code
, VOIDmode
, x
, const0_rtx
);
2670 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
2671 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
2672 return gen_rtx_SET (pc_rtx
, x
);
2675 /* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
2676 If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
2677 it branches to LABEL when the bit is clear. */
2680 aarch64_gen_test_and_branch (rtx_code code
, rtx x
, int bitnum
,
2681 rtx_code_label
*label
)
2683 auto mode
= GET_MODE (x
);
2684 if (aarch64_track_speculation
)
2686 auto mask
= gen_int_mode (HOST_WIDE_INT_1U
<< bitnum
, mode
);
2687 emit_insn (gen_aarch64_and3nr_compare0 (mode
, x
, mask
));
2688 rtx cc_reg
= gen_rtx_REG (CC_NZVmode
, CC_REGNUM
);
2689 rtx x
= gen_rtx_fmt_ee (code
, CC_NZVmode
, cc_reg
, const0_rtx
);
2690 return gen_condjump (x
, cc_reg
, label
);
2692 return gen_aarch64_tb (code
, mode
, mode
,
2693 x
, gen_int_mode (bitnum
, mode
), label
);
2696 /* Consider the operation:
2698 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2702 - CODE is [SU]MAX or [SU]MIN
2703 - OPERANDS[2] and OPERANDS[3] are constant integers
2704 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2705 - all operands have mode MODE
2707 Decide whether it is possible to implement the operation using:
2709 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2711 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2715 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2717 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2718 If GENERATE_P is true, also update OPERANDS as follows:
2720 OPERANDS[4] = -OPERANDS[3]
2721 OPERANDS[5] = the rtl condition representing <cond>
2723 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2725 aarch64_maxmin_plus_const (rtx_code code
, rtx
*operands
, bool generate_p
)
2727 signop sgn
= (code
== UMAX
|| code
== UMIN
? UNSIGNED
: SIGNED
);
2728 rtx dst
= operands
[0];
2729 rtx maxmin_op
= operands
[2];
2730 rtx add_op
= operands
[3];
2731 machine_mode mode
= GET_MODE (dst
);
2733 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2734 == (x >= y ? x : y) - z
2735 == (x > y ? x : y) - z
2736 == (x > y - 1 ? x : y) - z
2738 min (x, y) - z == (x <= y - 1 ? x : y) - z
2739 == (x <= y ? x : y) - z
2740 == (x < y ? x : y) - z
2741 == (x < y + 1 ? x : y) - z
2743 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2744 which x is compared with z. Set DIFF to y - z. Thus the supported
2745 combinations are as follows, with DIFF being the value after the ":":
2747 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2748 == x >= y ? x - y : 0 [z == y]
2749 == x > y ? x - y : 0 [z == y]
2750 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2752 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2753 == x <= y ? x - y : 0 [z == y]
2754 == x < y ? x - y : 0 [z == y]
2755 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2756 auto maxmin_val
= rtx_mode_t (maxmin_op
, mode
);
2757 auto add_val
= rtx_mode_t (add_op
, mode
);
2758 auto sub_val
= wi::neg (add_val
);
2759 auto diff
= wi::sub (maxmin_val
, sub_val
);
2761 || (diff
== 1 && wi::gt_p (maxmin_val
, sub_val
, sgn
))
2762 || (diff
== -1 && wi::lt_p (maxmin_val
, sub_val
, sgn
))))
2772 cmp
= diff
== 1 ? GT
: GE
;
2775 cmp
= diff
== 1 ? GTU
: GEU
;
2778 cmp
= diff
== -1 ? LT
: LE
;
2781 cmp
= diff
== -1 ? LTU
: LEU
;
2786 rtx cc
= gen_rtx_REG (CCmode
, CC_REGNUM
);
2788 operands
[4] = immed_wide_int_const (sub_val
, mode
);
2789 operands
[5] = gen_rtx_fmt_ee (cmp
, VOIDmode
, cc
, const0_rtx
);
2790 if (can_create_pseudo_p ())
2791 operands
[6] = gen_reg_rtx (mode
);
2794 operands
[7] = immed_wide_int_const (diff
, mode
);
2800 /* Build the SYMBOL_REF for __tls_get_addr. */
2802 static GTY(()) rtx tls_get_addr_libfunc
;
2805 aarch64_tls_get_addr (void)
2807 if (!tls_get_addr_libfunc
)
2808 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
2809 return tls_get_addr_libfunc
;
2812 /* Return the TLS model to use for ADDR. */
2814 static enum tls_model
2815 tls_symbolic_operand_type (rtx addr
)
2817 enum tls_model tls_kind
= TLS_MODEL_NONE
;
2819 addr
= strip_offset_and_salt (addr
, &offset
);
2820 if (SYMBOL_REF_P (addr
))
2821 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
2826 /* We'll allow lo_sum's in addresses in our legitimate addresses
2827 so that combine would take care of combining addresses where
2828 necessary, but for generation purposes, we'll generate the address
2831 tmp = hi (symbol_ref); adrp x1, foo
2832 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2836 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2837 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2841 Load TLS symbol, depending on TLS mechanism and TLS access model.
2843 Global Dynamic - Traditional TLS:
2844 adrp tmp, :tlsgd:imm
2845 add dest, tmp, #:tlsgd_lo12:imm
2848 Global Dynamic - TLS Descriptors:
2849 adrp dest, :tlsdesc:imm
2850 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2851 add dest, dest, #:tlsdesc_lo12:imm
2858 adrp tmp, :gottprel:imm
2859 ldr dest, [tmp, #:gottprel_lo12:imm]
2864 add t0, tp, #:tprel_hi12:imm, lsl #12
2865 add t0, t0, #:tprel_lo12_nc:imm
2869 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2870 enum aarch64_symbol_type type
)
2873 rtx tmp
= legitimize_pe_coff_symbol (imm
, true);
2876 emit_insn (gen_rtx_SET (dest
, tmp
));
2883 case SYMBOL_SMALL_ABSOLUTE
:
2885 /* In ILP32, the mode of dest can be either SImode or DImode. */
2887 machine_mode mode
= GET_MODE (dest
);
2889 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2891 if (can_create_pseudo_p ())
2892 tmp_reg
= gen_reg_rtx (mode
);
2894 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, copy_rtx (imm
)));
2895 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2899 case SYMBOL_TINY_ABSOLUTE
:
2900 emit_insn (gen_rtx_SET (dest
, imm
));
2903 case SYMBOL_SMALL_GOT_28K
:
2905 machine_mode mode
= GET_MODE (dest
);
2906 rtx gp_rtx
= pic_offset_table_rtx
;
2910 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2911 here before rtl expand. Tree IVOPT will generate rtl pattern to
2912 decide rtx costs, in which case pic_offset_table_rtx is not
2913 initialized. For that case no need to generate the first adrp
2914 instruction as the final cost for global variable access is
2918 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2919 using the page base as GOT base, the first page may be wasted,
2920 in the worst scenario, there is only 28K space for GOT).
2922 The generate instruction sequence for accessing global variable
2925 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2927 Only one instruction needed. But we must initialize
2928 pic_offset_table_rtx properly. We generate initialize insn for
2929 every global access, and allow CSE to remove all redundant.
2931 The final instruction sequences will look like the following
2932 for multiply global variables access.
2934 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2936 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2937 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2938 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2941 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2942 crtl
->uses_pic_offset_table
= 1;
2943 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2945 if (mode
!= GET_MODE (gp_rtx
))
2946 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2950 if (mode
== ptr_mode
)
2953 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2955 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2957 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2961 gcc_assert (mode
== Pmode
);
2963 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2964 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2967 /* The operand is expected to be MEM. Whenever the related insn
2968 pattern changed, above code which calculate mem should be
2970 gcc_assert (MEM_P (mem
));
2971 MEM_READONLY_P (mem
) = 1;
2972 MEM_NOTRAP_P (mem
) = 1;
2977 case SYMBOL_SMALL_GOT_4G
:
2978 emit_insn (gen_rtx_SET (dest
, imm
));
2981 case SYMBOL_SMALL_TLSGD
:
2984 /* The return type of __tls_get_addr is the C pointer type
2986 rtx result
= gen_rtx_REG (ptr_mode
, R0_REGNUM
);
2989 if (GET_MODE (dest
) != ptr_mode
)
2990 tmp_reg
= can_create_pseudo_p () ? gen_reg_rtx (ptr_mode
) : result
;
2993 if (ptr_mode
== SImode
)
2994 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2996 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2997 insns
= get_insns ();
3000 RTL_CONST_CALL_P (insns
) = 1;
3001 emit_libcall_block (insns
, tmp_reg
, result
, imm
);
3002 /* Convert back to the mode of the dest adding a zero_extend
3003 from SImode (ptr_mode) to DImode (Pmode). */
3004 if (dest
!= tmp_reg
)
3005 convert_move (dest
, tmp_reg
, true);
3009 case SYMBOL_SMALL_TLSDESC
:
3011 machine_mode mode
= GET_MODE (dest
);
3012 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
3015 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
3017 /* In ILP32, the got entry is always of SImode size. Unlike
3018 small GOT, the dest is fixed at reg 0. */
3020 emit_insn (gen_tlsdesc_small_si (imm
));
3022 emit_insn (gen_tlsdesc_small_di (imm
));
3023 tp
= aarch64_load_tp (NULL
);
3026 tp
= gen_lowpart (mode
, tp
);
3028 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
3030 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3034 case SYMBOL_SMALL_TLSIE
:
3036 /* In ILP32, the mode of dest can be either SImode or DImode,
3037 while the got entry is always of SImode size. The mode of
3038 dest depends on how dest is used: if dest is assigned to a
3039 pointer (e.g. in the memory), it has SImode; it may have
3040 DImode if dest is dereferenced to access the memeory.
3041 This is why we have to handle three different tlsie_small
3042 patterns here (two patterns for ILP32). */
3043 machine_mode mode
= GET_MODE (dest
);
3044 rtx tmp_reg
= gen_reg_rtx (mode
);
3045 rtx tp
= aarch64_load_tp (NULL
);
3047 if (mode
== ptr_mode
)
3050 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
3053 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
3054 tp
= gen_lowpart (mode
, tp
);
3059 gcc_assert (mode
== Pmode
);
3060 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
3063 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
3065 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3069 case SYMBOL_TLSLE12
:
3070 case SYMBOL_TLSLE24
:
3071 case SYMBOL_TLSLE32
:
3072 case SYMBOL_TLSLE48
:
3074 machine_mode mode
= GET_MODE (dest
);
3075 rtx tp
= aarch64_load_tp (NULL
);
3078 tp
= gen_lowpart (mode
, tp
);
3082 case SYMBOL_TLSLE12
:
3083 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
3086 case SYMBOL_TLSLE24
:
3087 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
3090 case SYMBOL_TLSLE32
:
3091 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
3093 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
3096 case SYMBOL_TLSLE48
:
3097 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
3099 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
3107 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3111 case SYMBOL_TINY_GOT
:
3114 machine_mode mode
= GET_MODE (dest
);
3116 if (mode
== ptr_mode
)
3117 insn
= gen_ldr_got_tiny (mode
, dest
, imm
);
3120 gcc_assert (mode
== Pmode
);
3121 insn
= gen_ldr_got_tiny_sidi (dest
, imm
);
3128 case SYMBOL_TINY_TLSIE
:
3130 machine_mode mode
= GET_MODE (dest
);
3131 rtx tp
= aarch64_load_tp (NULL
);
3133 if (mode
== ptr_mode
)
3136 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
3139 tp
= gen_lowpart (mode
, tp
);
3140 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
3145 gcc_assert (mode
== Pmode
);
3146 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
3150 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3159 /* Emit a move from SRC to DEST. Assume that the move expanders can
3160 handle all moves if !can_create_pseudo_p (). The distinction is
3161 important because, unlike emit_move_insn, the move expanders know
3162 how to force Pmode objects into the constant pool even when the
3163 constant pool address is not itself legitimate. */
3165 aarch64_emit_move (rtx dest
, rtx src
)
3167 return (can_create_pseudo_p ()
3168 ? emit_move_insn (dest
, src
)
3169 : emit_move_insn_1 (dest
, src
));
3172 /* Apply UNOPTAB to OP and store the result in DEST. */
3175 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
3177 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
3179 emit_move_insn (dest
, tmp
);
3182 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3185 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
3187 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
3190 emit_move_insn (dest
, tmp
);
3193 /* Split a move from SRC to DST into two moves of mode SINGLE_MODE. */
3196 aarch64_split_double_move (rtx dst
, rtx src
, machine_mode single_mode
)
3198 machine_mode mode
= GET_MODE (dst
);
3200 rtx dst0
= simplify_gen_subreg (single_mode
, dst
, mode
, 0);
3201 rtx dst1
= simplify_gen_subreg (single_mode
, dst
, mode
,
3202 GET_MODE_SIZE (single_mode
));
3203 rtx src0
= simplify_gen_subreg (single_mode
, src
, mode
, 0);
3204 rtx src1
= simplify_gen_subreg (single_mode
, src
, mode
,
3205 GET_MODE_SIZE (single_mode
));
3207 /* At most one pairing may overlap. */
3208 if (reg_overlap_mentioned_p (dst0
, src1
))
3210 aarch64_emit_move (dst1
, src1
);
3211 aarch64_emit_move (dst0
, src0
);
3215 aarch64_emit_move (dst0
, src0
);
3216 aarch64_emit_move (dst1
, src1
);
3220 /* Split a 128-bit move operation into two 64-bit move operations,
3221 taking care to handle partial overlap of register to register
3222 copies. Special cases are needed when moving between GP regs and
3223 FP regs. SRC can be a register, constant or memory; DST a register
3224 or memory. If either operand is memory it must not have any side
3227 aarch64_split_128bit_move (rtx dst
, rtx src
)
3229 machine_mode mode
= GET_MODE (dst
);
3231 gcc_assert (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
);
3232 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
3233 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
3235 if (REG_P (dst
) && REG_P (src
))
3237 int src_regno
= REGNO (src
);
3238 int dst_regno
= REGNO (dst
);
3240 /* Handle FP <-> GP regs. */
3241 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
3243 rtx src_lo
= gen_lowpart (word_mode
, src
);
3244 rtx src_hi
= gen_highpart (word_mode
, src
);
3246 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
3247 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
3250 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
3252 rtx dst_lo
= gen_lowpart (word_mode
, dst
);
3253 rtx dst_hi
= gen_highpart (word_mode
, dst
);
3255 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
3256 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
3261 aarch64_split_double_move (dst
, src
, word_mode
);
3264 /* Return true if we should split a move from 128-bit value SRC
3265 to 128-bit register DEST. */
3268 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
3270 if (FP_REGNUM_P (REGNO (dst
)))
3271 return REG_P (src
) && !FP_REGNUM_P (REGNO (src
));
3272 /* All moves to GPRs need to be split. */
3276 /* Split a complex SIMD move. */
3279 aarch64_split_simd_move (rtx dst
, rtx src
)
3281 machine_mode src_mode
= GET_MODE (src
);
3282 machine_mode dst_mode
= GET_MODE (dst
);
3284 gcc_assert (VECTOR_MODE_P (dst_mode
));
3286 if (REG_P (dst
) && REG_P (src
))
3288 gcc_assert (VECTOR_MODE_P (src_mode
));
3289 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
3293 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3294 The semantics of those of svreinterpret rather than those of subregs;
3295 see the comment at the head of aarch64-sve.md for details about the
3299 aarch64_sve_reinterpret (machine_mode mode
, rtx x
)
3301 if (GET_MODE (x
) == mode
)
3304 /* can_change_mode_class must only return true if subregs and svreinterprets
3305 have the same semantics. */
3306 if (targetm
.can_change_mode_class (GET_MODE (x
), mode
, FP_REGS
))
3307 return force_lowpart_subreg (mode
, x
, GET_MODE (x
));
3309 rtx res
= gen_reg_rtx (mode
);
3310 x
= force_reg (GET_MODE (x
), x
);
3311 emit_insn (gen_aarch64_sve_reinterpret (mode
, res
, x
));
3316 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
3317 machine_mode ymode
, rtx y
)
3319 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
3320 gcc_assert (r
!= NULL
);
3321 return rtx_equal_p (x
, r
);
3324 /* Return TARGET if it is nonnull and a register of mode MODE.
3325 Otherwise, return a fresh register of mode MODE if we can,
3326 or TARGET reinterpreted as MODE if we can't. */
3329 aarch64_target_reg (rtx target
, machine_mode mode
)
3331 if (target
&& REG_P (target
) && GET_MODE (target
) == mode
)
3333 if (!can_create_pseudo_p ())
3335 gcc_assert (target
);
3336 return gen_lowpart (mode
, target
);
3338 return gen_reg_rtx (mode
);
3341 /* Return a register that contains the constant in BUILDER, given that
3342 the constant is a legitimate move operand. Use TARGET as the register
3343 if it is nonnull and convenient. */
3346 aarch64_emit_set_immediate (rtx target
, rtx_vector_builder
&builder
)
3348 rtx src
= builder
.build ();
3349 target
= aarch64_target_reg (target
, GET_MODE (src
));
3350 emit_insn (gen_rtx_SET (target
, src
));
3355 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
3357 if (can_create_pseudo_p ())
3358 return force_reg (mode
, value
);
3362 aarch64_emit_move (x
, value
);
3367 /* Return true if predicate value X is a constant in which every element
3368 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3369 value, i.e. as a predicate in which all bits are significant. */
3372 aarch64_get_sve_pred_bits (rtx_vector_builder
&builder
, rtx x
)
3374 if (!CONST_VECTOR_P (x
))
3377 unsigned int factor
= vector_element_size (GET_MODE_NUNITS (VNx16BImode
),
3378 GET_MODE_NUNITS (GET_MODE (x
)));
3379 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (x
) * factor
;
3380 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (x
);
3381 builder
.new_vector (VNx16BImode
, npatterns
, nelts_per_pattern
);
3383 unsigned int nelts
= const_vector_encoded_nelts (x
);
3384 for (unsigned int i
= 0; i
< nelts
; ++i
)
3386 rtx elt
= CONST_VECTOR_ENCODED_ELT (x
, i
);
3387 if (!CONST_INT_P (elt
))
3390 builder
.quick_push (elt
);
3391 for (unsigned int j
= 1; j
< factor
; ++j
)
3392 builder
.quick_push (const0_rtx
);
3394 builder
.finalize ();
3398 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3399 widest predicate element size it can have (that is, the largest size
3400 for which each element would still be 0 or 1). */
3403 aarch64_widest_sve_pred_elt_size (rtx_vector_builder
&builder
)
3405 /* Start with the most optimistic assumption: that we only need
3406 one bit per pattern. This is what we will use if only the first
3407 bit in each pattern is ever set. */
3408 unsigned int mask
= GET_MODE_SIZE (DImode
);
3409 mask
|= builder
.npatterns ();
3411 /* Look for set bits. */
3412 unsigned int nelts
= builder
.encoded_nelts ();
3413 for (unsigned int i
= 1; i
< nelts
; ++i
)
3414 if (INTVAL (builder
.elt (i
)) != 0)
3420 return mask
& -mask
;
3423 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3424 return that predicate mode, otherwise return opt_machine_mode (). */
3427 aarch64_ptrue_all_mode (rtx x
)
3429 gcc_assert (GET_MODE (x
) == VNx16BImode
);
3430 if (!CONST_VECTOR_P (x
)
3431 || !CONST_VECTOR_DUPLICATE_P (x
)
3432 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x
, 0))
3433 || INTVAL (CONST_VECTOR_ENCODED_ELT (x
, 0)) == 0)
3434 return opt_machine_mode ();
3436 unsigned int nelts
= const_vector_encoded_nelts (x
);
3437 for (unsigned int i
= 1; i
< nelts
; ++i
)
3438 if (CONST_VECTOR_ENCODED_ELT (x
, i
) != const0_rtx
)
3439 return opt_machine_mode ();
3441 return aarch64_sve_pred_mode (nelts
);
3444 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3445 that the constant would have with predicate element size ELT_SIZE
3446 (ignoring the upper bits in each element) and return:
3448 * -1 if all bits are set
3449 * N if the predicate has N leading set bits followed by all clear bits
3450 * 0 if the predicate does not have any of these forms. */
3453 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
3454 unsigned int elt_size
)
3456 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3457 followed by set bits. */
3458 if (builder
.nelts_per_pattern () == 3)
3461 /* Skip over leading set bits. */
3462 unsigned int nelts
= builder
.encoded_nelts ();
3464 for (; i
< nelts
; i
+= elt_size
)
3465 if (INTVAL (builder
.elt (i
)) == 0)
3467 unsigned int vl
= i
/ elt_size
;
3469 /* Check for the all-true case. */
3473 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3474 repeating pattern of set bits followed by clear bits. */
3475 if (builder
.nelts_per_pattern () != 2)
3478 /* We have a "foreground" value and a duplicated "background" value.
3479 If the background might repeat and the last set bit belongs to it,
3480 we might have set bits followed by clear bits followed by set bits. */
3481 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
3484 /* Make sure that the rest are all clear. */
3485 for (; i
< nelts
; i
+= elt_size
)
3486 if (INTVAL (builder
.elt (i
)) != 0)
3492 /* See if there is an svpattern that encodes an SVE predicate of mode
3493 PRED_MODE in which the first VL bits are set and the rest are clear.
3494 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3495 A VL of -1 indicates an all-true vector. */
3498 aarch64_svpattern_for_vl (machine_mode pred_mode
, int vl
)
3501 return AARCH64_SV_ALL
;
3503 if (maybe_gt (vl
, GET_MODE_NUNITS (pred_mode
)))
3504 return AARCH64_NUM_SVPATTERNS
;
3506 if (vl
>= 1 && vl
<= 8)
3507 return aarch64_svpattern (AARCH64_SV_VL1
+ (vl
- 1));
3509 if (vl
>= 16 && vl
<= 256 && pow2p_hwi (vl
))
3510 return aarch64_svpattern (AARCH64_SV_VL16
+ (exact_log2 (vl
) - 4));
3513 if (GET_MODE_NUNITS (pred_mode
).is_constant (&max_vl
))
3515 if (vl
== (max_vl
/ 3) * 3)
3516 return AARCH64_SV_MUL3
;
3517 /* These would only trigger for non-power-of-2 lengths. */
3518 if (vl
== (max_vl
& -4))
3519 return AARCH64_SV_MUL4
;
3520 if (vl
== (1 << floor_log2 (max_vl
)))
3521 return AARCH64_SV_POW2
;
3523 return AARCH64_SV_ALL
;
3525 return AARCH64_NUM_SVPATTERNS
;
3528 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3529 bits has the lowest bit set and the upper bits clear. This is the
3530 VNx16BImode equivalent of a PTRUE for controlling elements of
3531 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3532 all bits are significant, even the upper zeros. */
3535 aarch64_ptrue_all (unsigned int elt_size
)
3537 rtx_vector_builder
builder (VNx16BImode
, elt_size
, 1);
3538 builder
.quick_push (const1_rtx
);
3539 for (unsigned int i
= 1; i
< elt_size
; ++i
)
3540 builder
.quick_push (const0_rtx
);
3541 return builder
.build ();
3544 /* Return an all-true predicate register of mode MODE. */
3547 aarch64_ptrue_reg (machine_mode mode
)
3549 gcc_assert (aarch64_sve_pred_mode_p (mode
));
3550 rtx reg
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3551 return gen_lowpart (mode
, reg
);
3554 /* Return an all-false predicate register of mode MODE. */
3557 aarch64_pfalse_reg (machine_mode mode
)
3559 gcc_assert (aarch64_sve_pred_mode_p (mode
));
3560 rtx reg
= force_reg (VNx16BImode
, CONST0_RTX (VNx16BImode
));
3561 return gen_lowpart (mode
, reg
);
3564 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3565 for it. PRED2[0] is the predicate for the instruction whose result
3566 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3567 for it. Return true if we can prove that the two predicates are
3568 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3569 with PRED1[0] without changing behavior. */
3572 aarch64_sve_same_pred_for_ptest_p (rtx
*pred1
, rtx
*pred2
)
3574 machine_mode mode
= GET_MODE (pred1
[0]);
3575 gcc_assert (aarch64_sve_pred_mode_p (mode
)
3576 && mode
== GET_MODE (pred2
[0])
3577 && aarch64_sve_ptrue_flag (pred1
[1], SImode
)
3578 && aarch64_sve_ptrue_flag (pred2
[1], SImode
));
3580 bool ptrue1_p
= (pred1
[0] == CONSTM1_RTX (mode
)
3581 || INTVAL (pred1
[1]) == SVE_KNOWN_PTRUE
);
3582 bool ptrue2_p
= (pred2
[0] == CONSTM1_RTX (mode
)
3583 || INTVAL (pred2
[1]) == SVE_KNOWN_PTRUE
);
3584 return (ptrue1_p
&& ptrue2_p
) || rtx_equal_p (pred1
[0], pred2
[0]);
3587 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3588 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3589 Use TARGET as the target register if nonnull and convenient. */
3592 aarch64_sve_emit_int_cmp (rtx target
, machine_mode pred_mode
, rtx_code cmp
,
3593 machine_mode data_mode
, rtx op1
, rtx op2
)
3595 insn_code icode
= code_for_aarch64_pred_cmp (cmp
, data_mode
);
3596 expand_operand ops
[5];
3597 create_output_operand (&ops
[0], target
, pred_mode
);
3598 create_input_operand (&ops
[1], CONSTM1_RTX (pred_mode
), pred_mode
);
3599 create_integer_operand (&ops
[2], SVE_KNOWN_PTRUE
);
3600 create_input_operand (&ops
[3], op1
, data_mode
);
3601 create_input_operand (&ops
[4], op2
, data_mode
);
3602 expand_insn (icode
, 5, ops
);
3603 return ops
[0].value
;
3606 /* Use a comparison to convert integer vector SRC into MODE, which is
3607 the corresponding SVE predicate mode. Use TARGET for the result
3608 if it's nonnull and convenient. */
3611 aarch64_convert_sve_data_to_pred (rtx target
, machine_mode mode
, rtx src
)
3613 machine_mode src_mode
= GET_MODE (src
);
3614 return aarch64_sve_emit_int_cmp (target
, mode
, NE
, src_mode
,
3615 src
, CONST0_RTX (src_mode
));
3618 /* Return the assembly token for svprfop value PRFOP. */
3621 svprfop_token (enum aarch64_svprfop prfop
)
3625 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3626 AARCH64_FOR_SVPRFOP (CASE
)
3628 case AARCH64_NUM_SVPRFOPS
:
3634 /* Return the assembly string for an SVE prefetch operation with
3635 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3636 and that SUFFIX is the format for the remaining operands. */
3639 aarch64_output_sve_prefetch (const char *mnemonic
, rtx prfop_rtx
,
3642 static char buffer
[128];
3643 aarch64_svprfop prfop
= (aarch64_svprfop
) INTVAL (prfop_rtx
);
3644 unsigned int written
= snprintf (buffer
, sizeof (buffer
), "%s\t%s, %s",
3645 mnemonic
, svprfop_token (prfop
), suffix
);
3646 gcc_assert (written
< sizeof (buffer
));
3650 /* Check whether we can calculate the number of elements in PATTERN
3651 at compile time, given that there are NELTS_PER_VQ elements per
3652 128-bit block. Return the value if so, otherwise return -1. */
3655 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern
, unsigned int nelts_per_vq
)
3657 unsigned int vl
, const_vg
;
3658 if (pattern
>= AARCH64_SV_VL1
&& pattern
<= AARCH64_SV_VL8
)
3659 vl
= 1 + (pattern
- AARCH64_SV_VL1
);
3660 else if (pattern
>= AARCH64_SV_VL16
&& pattern
<= AARCH64_SV_VL256
)
3661 vl
= 16 << (pattern
- AARCH64_SV_VL16
);
3662 else if (aarch64_sve_vg
.is_constant (&const_vg
))
3664 /* There are two vector granules per quadword. */
3665 unsigned int nelts
= (const_vg
/ 2) * nelts_per_vq
;
3668 case AARCH64_SV_POW2
: return 1 << floor_log2 (nelts
);
3669 case AARCH64_SV_MUL4
: return nelts
& -4;
3670 case AARCH64_SV_MUL3
: return (nelts
/ 3) * 3;
3671 case AARCH64_SV_ALL
: return nelts
;
3672 default: gcc_unreachable ();
3678 /* There are two vector granules per quadword. */
3679 poly_uint64 nelts_all
= exact_div (aarch64_sve_vg
, 2) * nelts_per_vq
;
3680 if (known_le (vl
, nelts_all
))
3683 /* Requesting more elements than are available results in a PFALSE. */
3684 if (known_gt (vl
, nelts_all
))
3690 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3691 by the number of 128-bit quadwords in an SVE vector. */
3694 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor
)
3696 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3697 return (IN_RANGE (factor
, 2, 16 * 16)
3698 && (factor
& 1) == 0
3699 && factor
<= 16 * (factor
& -factor
));
3702 /* Return true if we can move VALUE into a register using a single
3703 CNT[BHWD] instruction. */
3706 aarch64_sve_cnt_immediate_p (poly_int64 value
)
3708 HOST_WIDE_INT factor
= value
.coeffs
[0];
3709 return value
.coeffs
[1] == factor
&& aarch64_sve_cnt_factor_p (factor
);
3712 /* Likewise for rtx X. */
3715 aarch64_sve_cnt_immediate_p (rtx x
)
3718 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
3721 /* Return the asm string for an instruction with a CNT-like vector size
3722 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3723 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3724 first part of the operands template (the part that comes before the
3725 vector size itself). PATTERN is the pattern to use. FACTOR is the
3726 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3727 in each quadword. If it is zero, we can use any element size. */
3730 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
3731 aarch64_svpattern pattern
,
3732 unsigned int factor
,
3733 unsigned int nelts_per_vq
)
3735 static char buffer
[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3737 if (nelts_per_vq
== 0)
3738 /* There is some overlap in the ranges of the four CNT instructions.
3739 Here we always use the smallest possible element size, so that the
3740 multiplier is 1 whereever possible. */
3741 nelts_per_vq
= factor
& -factor
;
3742 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
3743 gcc_assert (IN_RANGE (shift
, 1, 4));
3744 char suffix
= "dwhb"[shift
- 1];
3747 unsigned int written
;
3748 if (pattern
== AARCH64_SV_ALL
&& factor
== 1)
3749 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
3750 prefix
, suffix
, operands
);
3751 else if (factor
== 1)
3752 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s",
3753 prefix
, suffix
, operands
, svpattern_token (pattern
));
3755 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s, mul #%d",
3756 prefix
, suffix
, operands
, svpattern_token (pattern
),
3758 gcc_assert (written
< sizeof (buffer
));
3762 /* Return the asm string for an instruction with a CNT-like vector size
3763 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3764 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3765 first part of the operands template (the part that comes before the
3766 vector size itself). X is the value of the vector size operand,
3767 as a polynomial integer rtx; we need to convert this into an "all"
3768 pattern with a multiplier. */
3771 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
3774 poly_int64 value
= rtx_to_poly_int64 (x
);
3775 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
3776 return aarch64_output_sve_cnt_immediate (prefix
, operands
, AARCH64_SV_ALL
,
3777 value
.coeffs
[1], 0);
3780 /* Return the asm string for an instruction with a CNT-like vector size
3781 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3782 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3783 first part of the operands template (the part that comes before the
3784 vector size itself). CNT_PAT[0..2] are the operands of the
3785 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3788 aarch64_output_sve_cnt_pat_immediate (const char *prefix
,
3789 const char *operands
, rtx
*cnt_pat
)
3791 aarch64_svpattern pattern
= (aarch64_svpattern
) INTVAL (cnt_pat
[0]);
3792 unsigned int nelts_per_vq
= INTVAL (cnt_pat
[1]);
3793 unsigned int factor
= INTVAL (cnt_pat
[2]) * nelts_per_vq
;
3794 return aarch64_output_sve_cnt_immediate (prefix
, operands
, pattern
,
3795 factor
, nelts_per_vq
);
3798 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3801 aarch64_sve_scalar_inc_dec_immediate_p (rtx x
)
3804 return (poly_int_rtx_p (x
, &value
)
3805 && (aarch64_sve_cnt_immediate_p (value
)
3806 || aarch64_sve_cnt_immediate_p (-value
)));
3809 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3813 aarch64_output_sve_scalar_inc_dec (rtx offset
)
3815 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
3816 gcc_assert (offset_value
.coeffs
[0] == offset_value
.coeffs
[1]);
3817 if (offset_value
.coeffs
[1] > 0)
3818 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL
,
3819 offset_value
.coeffs
[1], 0);
3821 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL
,
3822 -offset_value
.coeffs
[1], 0);
3825 /* Return true if a single RDVL instruction can multiply FACTOR by the
3826 number of 128-bit quadwords in an SVE vector. This is also the
3830 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor
)
3832 return (multiple_p (factor
, 16)
3833 && IN_RANGE (factor
, -32 * 16, 31 * 16));
3836 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
3837 of quadwords in an SVE vector. */
3840 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor
)
3842 return (multiple_p (factor
, 2)
3843 && IN_RANGE (factor
, -32 * 2, 31 * 2));
3846 /* Return true if we can move VALUE into a register using a single
3847 RDVL instruction. */
3850 aarch64_sve_rdvl_immediate_p (poly_int64 value
)
3852 HOST_WIDE_INT factor
= value
.coeffs
[0];
3853 return value
.coeffs
[1] == factor
&& aarch64_sve_rdvl_addvl_factor_p (factor
);
3856 /* Likewise for rtx X. */
3859 aarch64_sve_rdvl_immediate_p (rtx x
)
3862 return poly_int_rtx_p (x
, &value
) && aarch64_sve_rdvl_immediate_p (value
);
3865 /* Return the asm string for moving RDVL immediate OFFSET into register
3869 aarch64_output_sve_rdvl (rtx offset
)
3871 static char buffer
[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
3872 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
3873 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value
));
3875 int factor
= offset_value
.coeffs
[1];
3876 snprintf (buffer
, sizeof (buffer
), "rdvl\t%%x0, #%d", factor
/ 16);
3880 /* Return true if we can add VALUE to a register using a single ADDVL
3881 or ADDPL instruction. */
3884 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
3886 HOST_WIDE_INT factor
= value
.coeffs
[0];
3887 if (factor
== 0 || value
.coeffs
[1] != factor
)
3889 return (aarch64_sve_rdvl_addvl_factor_p (factor
)
3890 || aarch64_sve_addpl_factor_p (factor
));
3893 /* Likewise for rtx X. */
3896 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
3899 return (poly_int_rtx_p (x
, &value
)
3900 && aarch64_sve_addvl_addpl_immediate_p (value
));
3903 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3904 to operand 1 and storing the result in operand 0. */
3907 aarch64_output_sve_addvl_addpl (rtx offset
)
3909 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3910 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
3911 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
3913 int factor
= offset_value
.coeffs
[1];
3914 if ((factor
& 15) == 0)
3915 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
3917 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
3921 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3922 instruction. If it is, store the number of elements in each vector
3923 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3924 factor in *FACTOR_OUT (if nonnull). */
3927 aarch64_sve_vector_inc_dec_immediate_p (rtx x
, int *factor_out
,
3928 unsigned int *nelts_per_vq_out
)
3933 if (!const_vec_duplicate_p (x
, &elt
)
3934 || !poly_int_rtx_p (elt
, &value
))
3937 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
3938 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
3939 /* There's no vector INCB. */
3942 HOST_WIDE_INT factor
= value
.coeffs
[0];
3943 if (value
.coeffs
[1] != factor
)
3946 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3947 if ((factor
% nelts_per_vq
) != 0
3948 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
3952 *factor_out
= factor
;
3953 if (nelts_per_vq_out
)
3954 *nelts_per_vq_out
= nelts_per_vq
;
3958 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3962 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
3964 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
3967 /* Return the asm template for an SVE vector INC or DEC instruction.
3968 OPERANDS gives the operands before the vector count and X is the
3969 value of the vector count operand itself. */
3972 aarch64_output_sve_vector_inc_dec (const char *operands
, rtx x
)
3975 unsigned int nelts_per_vq
;
3976 if (!aarch64_sve_vector_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
3979 return aarch64_output_sve_cnt_immediate ("dec", operands
, AARCH64_SV_ALL
,
3980 -factor
, nelts_per_vq
);
3982 return aarch64_output_sve_cnt_immediate ("inc", operands
, AARCH64_SV_ALL
,
3983 factor
, nelts_per_vq
);
3986 /* Return a constant that represents FACTOR multiplied by the
3987 number of 128-bit quadwords in an SME vector. ISA_MODE is the
3988 ISA mode in which the calculation is being performed. */
3991 aarch64_sme_vq_immediate (machine_mode mode
, HOST_WIDE_INT factor
,
3992 aarch64_feature_flags isa_mode
)
3994 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor
));
3995 if (isa_mode
& AARCH64_FL_SM_ON
)
3996 /* We're in streaming mode, so we can use normal poly-int values. */
3997 return gen_int_mode ({ factor
, factor
}, mode
);
3999 rtvec vec
= gen_rtvec (1, gen_int_mode (factor
, SImode
));
4000 rtx unspec
= gen_rtx_UNSPEC (mode
, vec
, UNSPEC_SME_VQ
);
4001 return gen_rtx_CONST (mode
, unspec
);
4004 /* Return true if X is a constant that represents some number X
4005 multiplied by the number of quadwords in an SME vector. Store this X
4006 in *FACTOR if so. */
4009 aarch64_sme_vq_unspec_p (const_rtx x
, HOST_WIDE_INT
*factor
)
4011 if (!TARGET_SME
|| GET_CODE (x
) != CONST
)
4015 if (GET_CODE (x
) != UNSPEC
4016 || XINT (x
, 1) != UNSPEC_SME_VQ
4017 || XVECLEN (x
, 0) != 1)
4020 x
= XVECEXP (x
, 0, 0);
4021 if (!CONST_INT_P (x
))
4024 *factor
= INTVAL (x
);
4028 /* Return true if X is a constant that represents some number Y
4029 multiplied by the number of quadwords in an SME vector, and if
4030 that Y is in the range of RDSVL. */
4033 aarch64_rdsvl_immediate_p (const_rtx x
)
4035 HOST_WIDE_INT factor
;
4036 return (aarch64_sme_vq_unspec_p (x
, &factor
)
4037 && aarch64_sve_rdvl_addvl_factor_p (factor
));
4040 /* Return the asm string for an RDSVL instruction that calculates X,
4041 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
4044 aarch64_output_rdsvl (const_rtx x
)
4046 gcc_assert (aarch64_rdsvl_immediate_p (x
));
4047 static char buffer
[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4048 x
= XVECEXP (XEXP (x
, 0), 0, 0);
4049 snprintf (buffer
, sizeof (buffer
), "rdsvl\t%%x0, #%d",
4050 (int) INTVAL (x
) / 16);
4054 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
4057 aarch64_addsvl_addspl_immediate_p (const_rtx x
)
4059 HOST_WIDE_INT factor
;
4060 return (aarch64_sme_vq_unspec_p (x
, &factor
)
4061 && (aarch64_sve_rdvl_addvl_factor_p (factor
)
4062 || aarch64_sve_addpl_factor_p (factor
)));
4065 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4066 Return the asm string for the associated instruction. */
4069 aarch64_output_addsvl_addspl (rtx x
)
4071 static char buffer
[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4072 HOST_WIDE_INT factor
;
4073 if (!aarch64_sme_vq_unspec_p (x
, &factor
))
4075 if (aarch64_sve_rdvl_addvl_factor_p (factor
))
4076 snprintf (buffer
, sizeof (buffer
), "addsvl\t%%x0, %%x1, #%d",
4078 else if (aarch64_sve_addpl_factor_p (factor
))
4079 snprintf (buffer
, sizeof (buffer
), "addspl\t%%x0, %%x1, #%d",
4086 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4088 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
4090 0x0000000100000001ull
,
4091 0x0001000100010001ull
,
4092 0x0101010101010101ull
,
4093 0x1111111111111111ull
,
4094 0x5555555555555555ull
,
4099 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4101 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val
)
4103 unsigned HOST_WIDE_INT tmp
, mask
, first_one
, next_one
;
4106 /* Check for a single sequence of one bits and return quickly if so.
4107 The special cases of all ones and all zeroes returns false. */
4108 tmp
= val
+ (val
& -val
);
4110 if (tmp
== (tmp
& -tmp
))
4111 return (val
+ 1) > 1;
4113 /* Invert if the immediate doesn't start with a zero bit - this means we
4114 only need to search for sequences of one bits. */
4118 /* Find the first set bit and set tmp to val with the first sequence of one
4119 bits removed. Return success if there is a single sequence of ones. */
4120 first_one
= val
& -val
;
4121 tmp
= val
& (val
+ first_one
);
4126 /* Find the next set bit and compute the difference in bit position. */
4127 next_one
= tmp
& -tmp
;
4128 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
4131 /* Check the bit position difference is a power of 2, and that the first
4132 sequence of one bits fits within 'bits' bits. */
4133 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
4136 /* Check the sequence of one bits is repeated 64/bits times. */
4137 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
4141 /* Return true if VAL is a valid bitmask immediate for MODE. */
4143 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
4146 return aarch64_bitmask_imm (val
);
4149 return aarch64_bitmask_imm ((val
& 0xffffffff) | (val
<< 32));
4151 /* Replicate small immediates to fit 64 bits. */
4152 int size
= GET_MODE_UNIT_PRECISION (mode
);
4153 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
4154 val
*= bitmask_imm_mul
[__builtin_clz (size
) - 26];
4156 return aarch64_bitmask_imm (val
);
4160 /* Return true if the immediate VAL can be a bitfield immediate
4161 by changing the given MASK bits in VAL to zeroes, ones or bits
4162 from the other half of VAL. Return the new immediate in VAL2. */
4164 aarch64_check_bitmask (unsigned HOST_WIDE_INT val
,
4165 unsigned HOST_WIDE_INT
&val2
,
4166 unsigned HOST_WIDE_INT mask
)
4169 if (val2
!= val
&& aarch64_bitmask_imm (val2
))
4172 if (val2
!= val
&& aarch64_bitmask_imm (val2
))
4175 val2
= val
| (((val
>> 32) | (val
<< 32)) & mask
);
4176 if (val2
!= val
&& aarch64_bitmask_imm (val2
))
4178 val2
= val
| (((val
>> 16) | (val
<< 48)) & mask
);
4179 if (val2
!= val
&& aarch64_bitmask_imm (val2
))
4185 /* Return true if VAL is a valid MOVZ immediate. */
4187 aarch64_is_movz (unsigned HOST_WIDE_INT val
)
4189 return (val
>> (ctz_hwi (val
) & 48)) < 65536;
4193 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4195 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val
)
4197 return aarch64_is_movz (val
) || aarch64_is_movz (~val
)
4198 || aarch64_bitmask_imm (val
);
4202 /* Return true if VAL is an immediate that can be created by a single
4205 aarch64_move_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
4207 gcc_assert (mode
== SImode
|| mode
== DImode
);
4212 unsigned HOST_WIDE_INT mask
=
4213 (val
>> 32) == 0 || mode
== SImode
? 0xffffffff : HOST_WIDE_INT_M1U
;
4215 if (aarch64_is_movz (val
& mask
) || aarch64_is_movz (~val
& mask
))
4218 val
= (val
& mask
) | ((val
<< 32) & ~mask
);
4219 return aarch64_bitmask_imm (val
);
4224 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
4228 unsigned HOST_WIDE_INT val
, val2
, val3
, mask
;
4229 int one_match
, zero_match
;
4232 gcc_assert (mode
== SImode
|| mode
== DImode
);
4236 if (aarch64_move_imm (val
, mode
))
4239 emit_insn (gen_rtx_SET (dest
, imm
));
4243 if ((val
>> 32) == 0 || mode
== SImode
)
4247 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
4249 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
4250 GEN_INT ((val
>> 16) & 0xffff)));
4252 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
4253 GEN_INT ((val
>> 16) & 0xffff)));
4258 /* Remaining cases are all for DImode. */
4261 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
4262 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
4263 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
4264 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
4266 /* Try a bitmask immediate and a movk to generate the immediate
4267 in 2 instructions. */
4269 if (zero_match
< 2 && one_match
< 2)
4271 for (i
= 0; i
< 64; i
+= 16)
4273 if (aarch64_check_bitmask (val
, val2
, mask
<< i
))
4276 val2
= val
& ~(mask
<< i
);
4277 if ((val2
>> 32) == 0 && aarch64_move_imm (val2
, DImode
))
4285 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
4286 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4287 GEN_INT ((val
>> i
) & 0xffff)));
4292 /* Try 2 bitmask immediates which are xor'd together. */
4293 for (i
= 0; i
< 64; i
+= 16)
4295 val2
= (val
>> i
) & mask
;
4298 if (aarch64_bitmask_imm (val2
) && aarch64_bitmask_imm (val
^ val2
))
4306 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
4307 emit_insn (gen_xordi3 (dest
, dest
, GEN_INT (val
^ val2
)));
4313 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4314 if (zero_match
+ one_match
== 0)
4316 for (i
= 0; i
< 48; i
+= 16)
4317 for (int j
= i
+ 16; j
< 64; j
+= 16)
4318 if (aarch64_check_bitmask (val
, val2
, (mask
<< i
) | (mask
<< j
)))
4322 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
4323 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4324 GEN_INT ((val
>> i
) & 0xffff)));
4325 emit_insn (gen_insv_immdi (dest
, GEN_INT (j
),
4326 GEN_INT ((val
>> j
) & 0xffff)));
4331 /* Try shifting and inserting the bottom 32-bits into the top bits. */
4332 val2
= val
& 0xffffffff;
4334 val3
= val2
| (val3
<< 32);
4335 for (i
= 17; i
< 48; i
++)
4336 if ((val2
| (val2
<< i
)) == val
)
4340 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
& 0xffff)));
4341 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
4342 GEN_INT (val2
>> 16)));
4343 emit_insn (gen_ior_ashldi3 (dest
, dest
, GEN_INT (i
), dest
));
4347 else if ((val3
& ~(val3
<< i
)) == val
)
4351 emit_insn (gen_rtx_SET (dest
, GEN_INT (val3
| 0xffff0000)));
4352 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
4353 GEN_INT (val2
>> 16)));
4354 emit_insn (gen_and_one_cmpl_ashldi3 (dest
, dest
, GEN_INT (i
),
4361 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4362 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4363 otherwise skip zero bits. */
4367 val2
= one_match
> zero_match
? ~val
: val
;
4368 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
4371 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
4372 ? (val
| ~(mask
<< i
))
4373 : (val
& (mask
<< i
)))));
4374 for (i
+= 16; i
< 64; i
+= 16)
4376 if ((val2
& (mask
<< i
)) == 0)
4379 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4380 GEN_INT ((val
>> i
) & 0xffff)));
4387 /* Return whether imm is a 128-bit immediate which is simple enough to
4390 aarch64_mov128_immediate (rtx imm
)
4392 if (CONST_INT_P (imm
))
4395 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
4397 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
4398 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
4400 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
4401 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
4405 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4406 a left shift of 0 or 12 bits. */
4408 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val
)
4410 return val
< 4096 || (val
& 0xfff000) == val
;
4413 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
4414 that can be created with a left shift of 0 or 12. */
4415 static HOST_WIDE_INT
4416 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val
)
4418 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4419 handle correctly. */
4420 gcc_assert (val
< 0x1000000);
4425 return val
& 0xfff000;
4431 X = (X & AND_VAL) | IOR_VAL;
4433 can be implemented using:
4435 MOVK X, #(IOR_VAL >> shift), LSL #shift
4437 Return the shift if so, otherwise return -1. */
4439 aarch64_movk_shift (const wide_int_ref
&and_val
,
4440 const wide_int_ref
&ior_val
)
4442 unsigned int precision
= and_val
.get_precision ();
4443 unsigned HOST_WIDE_INT mask
= 0xffff;
4444 for (unsigned int shift
= 0; shift
< precision
; shift
+= 16)
4446 if (and_val
== ~mask
&& (ior_val
& mask
) == ior_val
)
4453 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4454 Assumed precondition: VAL_IN Is not zero. */
4456 unsigned HOST_WIDE_INT
4457 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
4459 int lowest_bit_set
= ctz_hwi (val_in
);
4460 int highest_bit_set
= floor_log2 (val_in
);
4461 gcc_assert (val_in
!= 0);
4463 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
4464 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
4467 /* Create constant where bits outside of lowest bit set to highest bit set
4470 unsigned HOST_WIDE_INT
4471 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
4473 return val_in
| ~aarch64_and_split_imm1 (val_in
);
4476 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4479 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
4481 scalar_int_mode int_mode
;
4482 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
4485 if (aarch64_bitmask_imm (val_in
, int_mode
))
4488 if (aarch64_move_imm (val_in
, int_mode
))
4491 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
4493 return aarch64_bitmask_imm (imm2
, int_mode
);
4496 /* Return the number of temporary registers that aarch64_add_offset_1
4497 would need to add OFFSET to a register. */
4500 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
4502 return absu_hwi (offset
) < 0x1000000 ? 0 : 1;
4505 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4506 a non-polynomial OFFSET. MODE is the mode of the addition.
4507 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4508 be set and CFA adjustments added to the generated instructions.
4510 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4511 temporary if register allocation is already complete. This temporary
4512 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4513 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4514 the immediate again.
4516 Since this function may be used to adjust the stack pointer, we must
4517 ensure that it cannot cause transient stack deallocation (for example
4518 by first incrementing SP and then decrementing when adjusting by a
4519 large immediate). */
4522 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
4523 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
4524 bool frame_related_p
, bool emit_move_imm
)
4526 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
4527 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
4529 unsigned HOST_WIDE_INT moffset
= absu_hwi (offset
);
4534 if (!rtx_equal_p (dest
, src
))
4536 insn
= emit_insn (gen_rtx_SET (dest
, src
));
4537 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4542 /* Single instruction adjustment. */
4543 if (aarch64_uimm12_shift (moffset
))
4545 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
4546 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4550 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4553 a) the offset cannot be loaded by a 16-bit move or
4554 b) there is no spare register into which we can move it. */
4555 if (moffset
< 0x1000000
4556 && ((!temp1
&& !can_create_pseudo_p ())
4557 || !aarch64_move_imm (moffset
, mode
)))
4559 HOST_WIDE_INT low_off
= moffset
& 0xfff;
4561 low_off
= offset
< 0 ? -low_off
: low_off
;
4562 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
4563 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4564 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
4565 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4569 /* Emit a move immediate if required and an addition/subtraction. */
4572 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
4573 temp1
= aarch64_force_temporary (mode
, temp1
,
4574 gen_int_mode (moffset
, mode
));
4576 insn
= emit_insn (offset
< 0
4577 ? gen_sub3_insn (dest
, src
, temp1
)
4578 : gen_add3_insn (dest
, src
, temp1
));
4579 if (frame_related_p
)
4581 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4582 rtx adj
= plus_constant (mode
, src
, offset
);
4583 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
4587 /* Return the number of temporary registers that aarch64_add_offset
4588 would need to move OFFSET into a register or add OFFSET to a register;
4589 ADD_P is true if we want the latter rather than the former. */
4592 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
4594 /* This follows the same structure as aarch64_add_offset. */
4595 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
4598 unsigned int count
= 0;
4599 HOST_WIDE_INT factor
= offset
.coeffs
[1];
4600 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
4601 poly_int64
poly_offset (factor
, factor
);
4602 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
4603 /* Need one register for the ADDVL/ADDPL result. */
4605 else if (factor
!= 0)
4607 factor
/= (HOST_WIDE_INT
) least_bit_hwi (factor
);
4608 if (!IN_RANGE (factor
, -32, 31))
4609 /* Need one register for the CNT or RDVL result and one for the
4610 multiplication factor. If necessary, the second temporary
4611 can be reused for the constant part of the offset. */
4613 /* Need one register for the CNT or RDVL result (which might then
4617 return count
+ aarch64_add_offset_1_temporaries (constant
);
4620 /* If X can be represented as a poly_int64, return the number
4621 of temporaries that are required to add it to a register.
4622 Return -1 otherwise. */
4625 aarch64_add_offset_temporaries (rtx x
)
4628 if (!poly_int_rtx_p (x
, &offset
))
4630 return aarch64_offset_temporaries (true, offset
);
4633 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4634 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4635 be set and CFA adjustments added to the generated instructions.
4637 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4638 temporary if register allocation is already complete. This temporary
4639 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4640 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4641 false to avoid emitting the immediate again.
4643 TEMP2, if nonnull, is a second temporary register that doesn't
4644 overlap either DEST or REG.
4646 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
4647 is measured relative to the SME vector length instead of the current
4648 prevailing vector length. It is 0 otherwise.
4650 Since this function may be used to adjust the stack pointer, we must
4651 ensure that it cannot cause transient stack deallocation (for example
4652 by first incrementing SP and then decrementing when adjusting by a
4653 large immediate). */
4656 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4657 poly_int64 offset
, rtx temp1
, rtx temp2
,
4658 aarch64_feature_flags force_isa_mode
,
4659 bool frame_related_p
, bool emit_move_imm
= true)
4661 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
4662 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
4663 gcc_assert (temp1
== NULL_RTX
4665 || !reg_overlap_mentioned_p (temp1
, dest
));
4666 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
4668 /* Try using ADDVL or ADDPL to add the whole value. */
4669 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
4671 gcc_assert (offset
.coeffs
[0] == offset
.coeffs
[1]);
4673 if (force_isa_mode
== 0)
4674 offset_rtx
= gen_int_mode (offset
, mode
);
4676 offset_rtx
= aarch64_sme_vq_immediate (mode
, offset
.coeffs
[0], 0);
4677 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4678 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4679 if (frame_related_p
&& (force_isa_mode
& AARCH64_FL_SM_ON
))
4680 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4681 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4686 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4687 SVE vector register, over and above the minimum size of 128 bits.
4688 This is equivalent to half the value returned by CNTD with a
4689 vector shape of ALL. */
4690 HOST_WIDE_INT factor
= offset
.coeffs
[1];
4691 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
4693 /* Try using ADDVL or ADDPL to add the VG-based part. */
4694 poly_int64
poly_offset (factor
, factor
);
4695 if (src
!= const0_rtx
4696 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
4699 if (force_isa_mode
== 0)
4700 offset_rtx
= gen_int_mode (poly_offset
, mode
);
4702 offset_rtx
= aarch64_sme_vq_immediate (mode
, factor
, 0);
4703 if (frame_related_p
)
4705 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4706 RTX_FRAME_RELATED_P (insn
) = true;
4707 if (force_isa_mode
& AARCH64_FL_SM_ON
)
4708 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4709 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4715 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
4716 src
= aarch64_force_temporary (mode
, temp1
, addr
);
4721 /* Otherwise use a CNT-based sequence. */
4722 else if (factor
!= 0)
4724 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4725 with negative shifts indicating a shift right. */
4726 HOST_WIDE_INT low_bit
= least_bit_hwi (factor
);
4727 HOST_WIDE_INT rel_factor
= factor
/ low_bit
;
4728 int shift
= exact_log2 (low_bit
) - 4;
4729 gcc_assert (shift
>= -4 && (rel_factor
& 1) != 0);
4731 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4732 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4734 We can avoid a multiplication if REL_FACTOR is in the range
4735 of RDVL, although there are then various optimizations that
4736 we can try on top. */
4737 rtx_code code
= PLUS
;
4739 if (IN_RANGE (rel_factor
, -32, 31))
4741 if (force_isa_mode
& AARCH64_FL_SM_ON
)
4743 /* Try to use an unshifted RDSVL, otherwise fall back on
4744 a shifted RDSVL #1. */
4745 if (aarch64_sve_rdvl_addvl_factor_p (factor
))
4748 factor
= rel_factor
* 16;
4749 val
= aarch64_sme_vq_immediate (mode
, factor
, 0);
4751 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4752 else if (aarch64_sve_cnt_factor_p (factor
)
4753 || aarch64_sve_rdvl_addvl_factor_p (factor
))
4755 val
= gen_int_mode (poly_int64 (factor
, factor
), mode
);
4758 /* Try to subtract an unshifted CNT[BHWD]. */
4759 else if (aarch64_sve_cnt_factor_p (-factor
))
4762 val
= gen_int_mode (poly_int64 (-factor
, -factor
), mode
);
4765 /* If subtraction is free, prefer to load a positive constant.
4766 In the best case this will fit a shifted CNTB. */
4767 else if (src
!= const0_rtx
&& rel_factor
< 0)
4770 val
= gen_int_mode (-rel_factor
* BYTES_PER_SVE_VECTOR
, mode
);
4772 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4774 val
= gen_int_mode (rel_factor
* BYTES_PER_SVE_VECTOR
, mode
);
4778 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4779 since it should increase the chances of being able to use
4780 a shift and add sequence for the multiplication.
4781 If CNTB << SHIFT is out of range, stick with the current
4783 if (force_isa_mode
== 0
4784 && IN_RANGE (low_bit
, 2, 16 * 16))
4786 val
= gen_int_mode (poly_int64 (low_bit
, low_bit
), mode
);
4789 else if ((force_isa_mode
& AARCH64_FL_SM_ON
)
4790 && aarch64_sve_rdvl_addvl_factor_p (low_bit
))
4792 val
= aarch64_sme_vq_immediate (mode
, low_bit
, 0);
4796 val
= gen_int_mode (BYTES_PER_SVE_VECTOR
, mode
);
4798 val
= aarch64_force_temporary (mode
, temp1
, val
);
4800 /* Prefer to multiply by a positive factor and subtract rather
4801 than multiply by a negative factor and add, since positive
4802 values are usually easier to move. */
4803 if (rel_factor
< 0 && src
!= const0_rtx
)
4805 rel_factor
= -rel_factor
;
4809 if (can_create_pseudo_p ())
4811 rtx coeff1
= gen_int_mode (rel_factor
, mode
);
4812 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, true, true);
4816 rtx coeff1
= gen_int_mode (rel_factor
, mode
);
4817 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
4818 val
= gen_rtx_MULT (mode
, val
, coeff1
);
4822 /* Multiply by 2 ** SHIFT. */
4825 val
= aarch64_force_temporary (mode
, temp1
, val
);
4826 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
4830 val
= aarch64_force_temporary (mode
, temp1
, val
);
4831 val
= gen_rtx_ASHIFTRT (mode
, val
, GEN_INT (-shift
));
4834 /* Add the result to SRC or subtract the result from SRC. */
4835 if (src
!= const0_rtx
)
4837 val
= aarch64_force_temporary (mode
, temp1
, val
);
4838 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
4840 else if (code
== MINUS
)
4842 val
= aarch64_force_temporary (mode
, temp1
, val
);
4843 val
= gen_rtx_NEG (mode
, val
);
4846 if (constant
== 0 || frame_related_p
)
4848 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
4849 if (frame_related_p
)
4851 RTX_FRAME_RELATED_P (insn
) = true;
4852 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4853 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4862 src
= aarch64_force_temporary (mode
, temp1
, val
);
4867 emit_move_imm
= true;
4870 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
4871 frame_related_p
, emit_move_imm
);
4874 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4875 than a poly_int64. */
4878 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4879 rtx offset_rtx
, rtx temp1
, rtx temp2
)
4881 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
4882 temp1
, temp2
, 0, false);
4885 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4886 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4887 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4888 contains abs (DELTA). */
4891 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
,
4892 aarch64_feature_flags force_isa_mode
, bool emit_move_imm
)
4894 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
4895 temp1
, temp2
, force_isa_mode
, true, emit_move_imm
);
4898 /* Subtract DELTA from the stack pointer, marking the instructions
4899 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4900 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4903 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
,
4904 aarch64_feature_flags force_isa_mode
,
4905 bool frame_related_p
, bool emit_move_imm
= true)
4907 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
4908 temp1
, temp2
, force_isa_mode
, frame_related_p
,
4912 /* A streaming-compatible function needs to switch temporarily to the known
4913 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4914 the runtime state of PSTATE.SM in the streaming-compatible code, before
4915 the start of the switch to LOCAL_MODE.
4917 Emit instructions to branch around the mode switch if PSTATE.SM already
4918 matches LOCAL_MODE. Return the label that the branch jumps to. */
4921 aarch64_guard_switch_pstate_sm (rtx old_svcr
, aarch64_feature_flags local_mode
)
4923 local_mode
&= AARCH64_FL_SM_STATE
;
4924 gcc_assert (local_mode
!= 0);
4925 auto already_ok_cond
= (local_mode
& AARCH64_FL_SM_ON
? NE
: EQ
);
4926 auto *label
= gen_label_rtx ();
4927 auto branch
= aarch64_gen_test_and_branch (already_ok_cond
, old_svcr
, 0,
4929 auto *jump
= emit_jump_insn (branch
);
4930 JUMP_LABEL (jump
) = label
;
4934 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
4935 state in NEW_MODE. This is known to involve either an SMSTART SM or
4939 aarch64_switch_pstate_sm (aarch64_feature_flags old_mode
,
4940 aarch64_feature_flags new_mode
)
4942 old_mode
&= AARCH64_FL_SM_STATE
;
4943 new_mode
&= AARCH64_FL_SM_STATE
;
4944 gcc_assert (old_mode
!= new_mode
);
4946 if ((new_mode
& AARCH64_FL_SM_ON
)
4947 || (new_mode
== 0 && (old_mode
& AARCH64_FL_SM_OFF
)))
4948 emit_insn (gen_aarch64_smstart_sm ());
4950 emit_insn (gen_aarch64_smstop_sm ());
4953 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
4954 FP and predicate registers. This class emits code to preserve any
4955 necessary registers around the mode switch.
4957 The class uses four approaches to saving and restoring contents, enumerated
4960 - GPR: save and restore the contents of FP registers using GPRs.
4961 This is used if the FP register contains no more than 64 significant
4962 bits. The registers used are FIRST_GPR onwards.
4964 - MEM_128: save and restore 128-bit SIMD registers using memory.
4966 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
4968 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
4970 The save slots within each memory group are consecutive, with the
4971 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
4973 There will only be two mode switches for each use of SME, so they should
4974 not be particularly performance-sensitive. It's also rare for SIMD, SVE
4975 or predicate registers to be live across mode switches. We therefore
4976 don't preallocate the save slots but instead allocate them locally on
4977 demand. This makes the code emitted by the class self-contained. */
4979 class aarch64_sme_mode_switch_regs
4982 static const unsigned int FIRST_GPR
= R10_REGNUM
;
4984 void add_reg (machine_mode
, unsigned int);
4985 void add_call_args (rtx_call_insn
*);
4986 void add_call_result (rtx_call_insn
*);
4987 void add_call_preserved_reg (unsigned int);
4988 void add_call_preserved_regs (bitmap
);
4990 void emit_prologue ();
4991 void emit_epilogue ();
4993 /* The number of GPRs needed to save FP registers, starting from
4995 unsigned int num_gprs () { return m_group_count
[GPR
]; }
4998 enum sequence
{ PROLOGUE
, EPILOGUE
};
4999 enum group_type
{ GPR
, MEM_128
, MEM_SVE_PRED
, MEM_SVE_DATA
, NUM_GROUPS
};
5001 /* Information about the save location for one FP, SIMD, SVE data, or
5002 SVE predicate register. */
5003 struct save_location
{
5004 /* The register to be saved. */
5007 /* Which group the save location belongs to. */
5010 /* A zero-based index of the register within the group. */
5014 unsigned int sve_data_headroom ();
5015 rtx
get_slot_mem (machine_mode
, poly_int64
);
5016 void emit_stack_adjust (sequence
, poly_int64
);
5017 void emit_mem_move (sequence
, const save_location
&, poly_int64
);
5019 void emit_gpr_moves (sequence
);
5020 void emit_mem_128_moves (sequence
);
5021 void emit_sve_sp_adjust (sequence
);
5022 void emit_sve_pred_moves (sequence
);
5023 void emit_sve_data_moves (sequence
);
5025 /* All save locations, in no particular order. */
5026 auto_vec
<save_location
, 12> m_save_locations
;
5028 /* The number of registers in each group. */
5029 unsigned int m_group_count
[NUM_GROUPS
] = {};
5032 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5036 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode
, unsigned int regno
)
5038 if (!FP_REGNUM_P (regno
) && !PR_REGNUM_P (regno
))
5041 unsigned int end_regno
= end_hard_regno (mode
, regno
);
5042 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5043 gcc_assert ((vec_flags
& VEC_STRUCT
) || end_regno
== regno
+ 1);
5044 for (; regno
< end_regno
; regno
++)
5046 /* Force the mode of SVE saves and restores even for single registers.
5047 This is necessary because big-endian targets only allow LDR Z and
5048 STR Z to be used with byte modes. */
5049 machine_mode submode
= mode
;
5050 if (vec_flags
& VEC_SVE_PRED
)
5051 submode
= VNx16BImode
;
5052 else if (vec_flags
& VEC_SVE_DATA
)
5053 submode
= SVE_BYTE_MODE
;
5054 else if (vec_flags
& VEC_STRUCT
)
5056 if (vec_flags
& VEC_PARTIAL
)
5059 submode
= V16QImode
;
5062 loc
.reg
= gen_rtx_REG (submode
, regno
);
5063 if (vec_flags
& VEC_SVE_PRED
)
5065 gcc_assert (PR_REGNUM_P (regno
));
5066 loc
.group
= MEM_SVE_PRED
;
5070 gcc_assert (FP_REGNUM_P (regno
));
5071 if (known_le (GET_MODE_SIZE (submode
), 8))
5073 else if (known_eq (GET_MODE_SIZE (submode
), 16))
5074 loc
.group
= MEM_128
;
5076 loc
.group
= MEM_SVE_DATA
;
5078 loc
.index
= m_group_count
[loc
.group
]++;
5079 m_save_locations
.quick_push (loc
);
5083 /* Record that the arguments to CALL_INSN need to be preserved around
5087 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn
*call_insn
)
5089 for (rtx node
= CALL_INSN_FUNCTION_USAGE (call_insn
);
5090 node
; node
= XEXP (node
, 1))
5092 rtx item
= XEXP (node
, 0);
5093 if (GET_CODE (item
) != USE
)
5095 item
= XEXP (item
, 0);
5098 add_reg (GET_MODE (item
), REGNO (item
));
5102 /* Record that the return value from CALL_INSN (if any) needs to be
5103 preserved around the mode switch. */
5106 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn
*call_insn
)
5108 rtx pat
= PATTERN (call_insn
);
5109 gcc_assert (GET_CODE (pat
) == PARALLEL
);
5110 pat
= XVECEXP (pat
, 0, 0);
5111 if (GET_CODE (pat
) == CALL
)
5113 rtx dest
= SET_DEST (pat
);
5114 if (GET_CODE (dest
) == PARALLEL
)
5115 for (int i
= 0; i
< XVECLEN (dest
, 0); ++i
)
5117 rtx x
= XVECEXP (dest
, 0, i
);
5118 gcc_assert (GET_CODE (x
) == EXPR_LIST
);
5119 rtx reg
= XEXP (x
, 0);
5120 add_reg (GET_MODE (reg
), REGNO (reg
));
5123 add_reg (GET_MODE (dest
), REGNO (dest
));
5126 /* REGNO is a register that is call-preserved under the current function's ABI.
5127 Record that it must be preserved around the mode switch. */
5130 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno
)
5132 if (FP_REGNUM_P (regno
))
5133 switch (crtl
->abi
->id ())
5136 add_reg (VNx16QImode
, regno
);
5139 add_reg (V16QImode
, regno
);
5141 case ARM_PCS_AAPCS64
:
5142 add_reg (DImode
, regno
);
5147 else if (PR_REGNUM_P (regno
))
5148 add_reg (VNx16BImode
, regno
);
5151 /* The hard registers in REGS are call-preserved under the current function's
5152 ABI. Record that they must be preserved around the mode switch. */
5155 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs
)
5159 EXECUTE_IF_SET_IN_BITMAP (regs
, 0, regno
, bi
)
5160 if (HARD_REGISTER_NUM_P (regno
))
5161 add_call_preserved_reg (regno
);
5166 /* Emit code to save registers before the mode switch. */
5169 aarch64_sme_mode_switch_regs::emit_prologue ()
5171 emit_sve_sp_adjust (PROLOGUE
);
5172 emit_sve_pred_moves (PROLOGUE
);
5173 emit_sve_data_moves (PROLOGUE
);
5174 emit_mem_128_moves (PROLOGUE
);
5175 emit_gpr_moves (PROLOGUE
);
5178 /* Emit code to restore registers after the mode switch. */
5181 aarch64_sme_mode_switch_regs::emit_epilogue ()
5183 emit_gpr_moves (EPILOGUE
);
5184 emit_mem_128_moves (EPILOGUE
);
5185 emit_sve_pred_moves (EPILOGUE
);
5186 emit_sve_data_moves (EPILOGUE
);
5187 emit_sve_sp_adjust (EPILOGUE
);
5190 /* The SVE predicate registers are stored below the SVE data registers,
5191 with the predicate save area being padded to a data-register-sized
5192 boundary. Return the size of this padded area as a whole number
5193 of data register slots. */
5196 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5198 return CEIL (m_group_count
[MEM_SVE_PRED
], 8);
5201 /* Return a memory reference of mode MODE to OFFSET bytes from the
5205 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode
,
5208 rtx addr
= plus_constant (Pmode
, stack_pointer_rtx
, offset
);
5209 return gen_rtx_MEM (mode
, addr
);
5212 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5215 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq
,
5218 if (seq
== PROLOGUE
)
5220 emit_insn (gen_rtx_SET (stack_pointer_rtx
,
5221 plus_constant (Pmode
, stack_pointer_rtx
, size
)));
5224 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5225 the stack pointer. SEQ chooses between saving and restoring. */
5228 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq
,
5229 const save_location
&loc
,
5232 rtx mem
= get_slot_mem (GET_MODE (loc
.reg
), offset
);
5233 if (seq
== PROLOGUE
)
5234 emit_move_insn (mem
, loc
.reg
);
5236 emit_move_insn (loc
.reg
, mem
);
5239 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5240 saving and restoring. */
5243 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq
)
5245 for (auto &loc
: m_save_locations
)
5246 if (loc
.group
== GPR
)
5248 gcc_assert (loc
.index
< 8);
5249 rtx gpr
= gen_rtx_REG (GET_MODE (loc
.reg
), FIRST_GPR
+ loc
.index
);
5250 if (seq
== PROLOGUE
)
5251 emit_move_insn (gpr
, loc
.reg
);
5253 emit_move_insn (loc
.reg
, gpr
);
5257 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5258 between saving and restoring. */
5261 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq
)
5263 HOST_WIDE_INT count
= m_group_count
[MEM_128
];
5267 auto sp
= stack_pointer_rtx
;
5268 auto sp_adjust
= (seq
== PROLOGUE
? -count
: count
) * 16;
5270 /* Pick a common mode that supports LDR & STR with pre/post-modification
5271 and LDP & STP with pre/post-modification. */
5274 /* An instruction pattern that should be emitted at the end. */
5275 rtx last_pat
= NULL_RTX
;
5277 /* A previous MEM_128 location that hasn't been handled yet. */
5278 save_location
*prev_loc
= nullptr;
5280 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5281 for (auto &loc
: m_save_locations
)
5282 if (loc
.group
== MEM_128
)
5289 gcc_assert (loc
.index
== prev_loc
->index
+ 1);
5291 /* The offset of the base of the save area from the current
5293 HOST_WIDE_INT bias
= 0;
5294 if (prev_loc
->index
== 0 && seq
== PROLOGUE
)
5297 /* Get the two sets in the LDP/STP. */
5299 gen_rtx_REG (mode
, REGNO (prev_loc
->reg
)),
5300 get_slot_mem (mode
, prev_loc
->index
* 16 + bias
),
5301 gen_rtx_REG (mode
, REGNO (loc
.reg
)),
5302 get_slot_mem (mode
, loc
.index
* 16 + bias
)
5304 unsigned int lhs
= (seq
== PROLOGUE
);
5305 rtx set1
= gen_rtx_SET (ops
[lhs
], ops
[1 - lhs
]);
5306 rtx set2
= gen_rtx_SET (ops
[lhs
+ 2], ops
[3 - lhs
]);
5308 /* Combine the sets with any stack allocation/deallocation. */
5310 if (prev_loc
->index
== 0)
5312 rtx plus_sp
= plus_constant (Pmode
, sp
, sp_adjust
);
5313 rtvec vec
= gen_rtvec (3, gen_rtx_SET (sp
, plus_sp
), set1
, set2
);
5314 pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
5316 else if (seq
== PROLOGUE
)
5317 pat
= aarch64_gen_store_pair (ops
[1], ops
[0], ops
[2]);
5319 pat
= aarch64_gen_load_pair (ops
[0], ops
[2], ops
[1]);
5321 /* Queue a deallocation to the end, otherwise emit the
5323 if (seq
== EPILOGUE
&& prev_loc
->index
== 0)
5330 /* Handle any leftover LDR/STR. */
5333 rtx reg
= gen_rtx_REG (mode
, REGNO (prev_loc
->reg
));
5335 if (prev_loc
->index
!= 0)
5336 addr
= plus_constant (Pmode
, sp
, prev_loc
->index
* 16);
5337 else if (seq
== PROLOGUE
)
5339 rtx allocate
= plus_constant (Pmode
, sp
, -count
* 16);
5340 addr
= gen_rtx_PRE_MODIFY (Pmode
, sp
, allocate
);
5344 rtx deallocate
= plus_constant (Pmode
, sp
, count
* 16);
5345 addr
= gen_rtx_POST_MODIFY (Pmode
, sp
, deallocate
);
5347 rtx mem
= gen_rtx_MEM (mode
, addr
);
5348 if (seq
== PROLOGUE
)
5349 emit_move_insn (mem
, reg
);
5351 emit_move_insn (reg
, mem
);
5355 emit_insn (last_pat
);
5358 /* Allocate or deallocate the stack space needed by the SVE groups.
5359 SEQ chooses between allocating and deallocating. */
5362 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq
)
5364 if (unsigned int count
= m_group_count
[MEM_SVE_DATA
] + sve_data_headroom ())
5365 emit_stack_adjust (seq
, count
* BYTES_PER_SVE_VECTOR
);
5368 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5372 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq
)
5374 for (auto &loc
: m_save_locations
)
5375 if (loc
.group
== MEM_SVE_DATA
)
5377 auto index
= loc
.index
+ sve_data_headroom ();
5378 emit_mem_move (seq
, loc
, index
* BYTES_PER_SVE_VECTOR
);
5382 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5386 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq
)
5388 for (auto &loc
: m_save_locations
)
5389 if (loc
.group
== MEM_SVE_PRED
)
5390 emit_mem_move (seq
, loc
, loc
.index
* BYTES_PER_SVE_PRED
);
5393 /* Set DEST to (vec_series BASE STEP). */
5396 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
5398 machine_mode mode
= GET_MODE (dest
);
5399 scalar_mode inner
= GET_MODE_INNER (mode
);
5401 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5402 if (!aarch64_sve_index_immediate_p (base
))
5403 base
= force_reg (inner
, base
);
5404 if (!aarch64_sve_index_immediate_p (step
))
5405 step
= force_reg (inner
, step
);
5407 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
5410 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5411 register of mode MODE. Use TARGET for the result if it's nonnull
5414 The two vector modes must have the same element mode. The behavior
5415 is to duplicate architectural lane N of SRC into architectural lanes
5416 N + I * STEP of the result. On big-endian targets, architectural
5417 lane 0 of an Advanced SIMD vector is the last element of the vector
5418 in memory layout, so for big-endian targets this operation has the
5419 effect of reversing SRC before duplicating it. Callers need to
5420 account for this. */
5423 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
5425 machine_mode src_mode
= GET_MODE (src
);
5426 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
5427 insn_code icode
= (BYTES_BIG_ENDIAN
5428 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
5429 : code_for_aarch64_vec_duplicate_vq_le (mode
));
5432 expand_operand ops
[3];
5433 create_output_operand (&ops
[i
++], target
, mode
);
5434 create_output_operand (&ops
[i
++], src
, src_mode
);
5435 if (BYTES_BIG_ENDIAN
)
5437 /* Create a PARALLEL describing the reversal of SRC. */
5438 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
5439 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
5440 nelts_per_vq
- 1, -1);
5441 create_fixed_operand (&ops
[i
++], sel
);
5443 expand_insn (icode
, i
, ops
);
5444 return ops
[0].value
;
5447 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5448 the memory image into DEST. Return true on success. */
5451 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
5453 src
= force_const_mem (GET_MODE (src
), src
);
5457 /* Make sure that the address is legitimate. */
5458 if (!aarch64_sve_ld1rq_operand_p (src
))
5460 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
5461 src
= replace_equiv_address (src
, addr
);
5464 machine_mode mode
= GET_MODE (dest
);
5465 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5466 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
5467 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
5471 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5472 by N "background" values. Try to move it into TARGET using:
5474 PTRUE PRED.<T>, VL<N>
5475 MOV TRUE.<T>, #<foreground>
5476 MOV FALSE.<T>, #<background>
5477 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5479 The PTRUE is always a single instruction but the MOVs might need a
5480 longer sequence. If the background value is zero (as it often is),
5481 the sequence can sometimes collapse to a PTRUE followed by a
5482 zero-predicated move.
5484 Return the target on success, otherwise return null. */
5487 aarch64_expand_sve_const_vector_sel (rtx target
, rtx src
)
5489 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src
) == 2);
5491 /* Make sure that the PTRUE is valid. */
5492 machine_mode mode
= GET_MODE (src
);
5493 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5494 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5495 if (aarch64_svpattern_for_vl (pred_mode
, npatterns
)
5496 == AARCH64_NUM_SVPATTERNS
)
5499 rtx_vector_builder
pred_builder (pred_mode
, npatterns
, 2);
5500 rtx_vector_builder
true_builder (mode
, npatterns
, 1);
5501 rtx_vector_builder
false_builder (mode
, npatterns
, 1);
5502 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5504 true_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5505 pred_builder
.quick_push (CONST1_RTX (BImode
));
5507 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5509 false_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
+ npatterns
));
5510 pred_builder
.quick_push (CONST0_RTX (BImode
));
5512 expand_operand ops
[4];
5513 create_output_operand (&ops
[0], target
, mode
);
5514 create_input_operand (&ops
[1], true_builder
.build (), mode
);
5515 create_input_operand (&ops
[2], false_builder
.build (), mode
);
5516 create_input_operand (&ops
[3], pred_builder
.build (), pred_mode
);
5517 expand_insn (code_for_vcond_mask (mode
, mode
), 4, ops
);
5521 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5522 SVE data mode and isn't a legitimate constant. Use TARGET for the
5523 result if convenient.
5525 The returned register can have whatever mode seems most natural
5526 given the contents of SRC. */
5529 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
5531 machine_mode mode
= GET_MODE (src
);
5532 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5533 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
5534 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
5535 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
5536 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
5537 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
5539 if (nelts_per_pattern
== 1
5540 && encoded_bits
<= 128
5541 && container_bits
!= elt_bits
)
5543 /* We have a partial vector mode and a constant whose full-vector
5544 equivalent would occupy a repeating 128-bit sequence. Build that
5545 full-vector equivalent instead, so that we have the option of
5546 using LD1RQ and Advanced SIMD operations. */
5547 unsigned int repeat
= container_bits
/ elt_bits
;
5548 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
5549 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
5550 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5551 for (unsigned int j
= 0; j
< repeat
; ++j
)
5552 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5553 target
= aarch64_target_reg (target
, full_mode
);
5554 return aarch64_expand_sve_const_vector (target
, builder
.build ());
5557 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
5559 /* The constant is a duplicated quadword but can't be narrowed
5560 beyond a quadword. Get the memory image of the first quadword
5561 as a 128-bit vector and try using LD1RQ to load it from memory.
5563 The effect for both endiannesses is to load memory lane N into
5564 architectural lanes N + I * STEP of the result. On big-endian
5565 targets, the layout of the 128-bit vector in an Advanced SIMD
5566 register would be different from its layout in an SVE register,
5567 but this 128-bit vector is a memory value only. */
5568 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
5569 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
5570 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
5574 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
5576 /* The vector is a repeating sequence of 64 bits or fewer.
5577 See if we can load them using an Advanced SIMD move and then
5578 duplicate it to fill a vector. This is better than using a GPR
5579 move because it keeps everything in the same register file. */
5580 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
5581 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
5582 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5584 /* We want memory lane N to go into architectural lane N,
5585 so reverse for big-endian targets. The DUP .Q pattern
5586 has a compensating reverse built-in. */
5587 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
5588 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
5590 rtx vq_src
= builder
.build ();
5591 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
5593 vq_src
= force_reg (vq_mode
, vq_src
);
5594 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
5597 /* Get an integer representation of the repeating part of Advanced
5598 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5599 which for big-endian targets is lane-swapped wrt a normal
5600 Advanced SIMD vector. This means that for both endiannesses,
5601 memory lane N of SVE vector SRC corresponds to architectural
5602 lane N of a register holding VQ_SRC. This in turn means that
5603 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5604 as a single 128-bit value) and thus that memory lane 0 of SRC is
5605 in the lsb of the integer. Duplicating the integer therefore
5606 ensures that memory lane N of SRC goes into architectural lane
5607 N + I * INDEX of the SVE register. */
5608 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
5609 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
5612 /* Pretend that we had a vector of INT_MODE to start with. */
5613 elt_mode
= int_mode
;
5614 mode
= aarch64_full_sve_mode (int_mode
).require ();
5616 /* If the integer can be moved into a general register by a
5617 single instruction, do that and duplicate the result. */
5618 if (CONST_INT_P (elt_value
)
5619 && aarch64_move_imm (INTVAL (elt_value
),
5620 encoded_bits
<= 32 ? SImode
: DImode
))
5622 elt_value
= force_reg (elt_mode
, elt_value
);
5623 return expand_vector_broadcast (mode
, elt_value
);
5626 else if (npatterns
== 1)
5627 /* We're duplicating a single value, but can't do better than
5628 force it to memory and load from there. This handles things
5629 like symbolic constants. */
5630 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
5634 /* Load the element from memory if we can, otherwise move it into
5635 a register and use a DUP. */
5636 rtx op
= force_const_mem (elt_mode
, elt_value
);
5638 op
= force_reg (elt_mode
, elt_value
);
5639 return expand_vector_broadcast (mode
, op
);
5643 /* Try using INDEX. */
5645 if (const_vec_series_p (src
, &base
, &step
))
5647 aarch64_expand_vec_series (target
, base
, step
);
5651 /* From here on, it's better to force the whole constant to memory
5653 if (GET_MODE_NUNITS (mode
).is_constant ())
5656 if (nelts_per_pattern
== 2)
5657 if (rtx res
= aarch64_expand_sve_const_vector_sel (target
, src
))
5660 /* Expand each pattern individually. */
5661 gcc_assert (npatterns
> 1);
5662 rtx_vector_builder builder
;
5663 auto_vec
<rtx
, 16> vectors (npatterns
);
5664 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5666 builder
.new_vector (mode
, 1, nelts_per_pattern
);
5667 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
5668 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
5669 vectors
.quick_push (force_reg (mode
, builder
.build ()));
5672 /* Use permutes to interleave the separate vectors. */
5673 while (npatterns
> 1)
5676 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5678 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
5679 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
5680 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
5684 gcc_assert (vectors
[0] == target
);
5688 /* Use WHILE to set a predicate register of mode MODE in which the first
5689 VL bits are set and the rest are clear. Use TARGET for the register
5690 if it's nonnull and convenient. */
5693 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
5696 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
5697 target
= aarch64_target_reg (target
, mode
);
5698 emit_insn (gen_while (UNSPEC_WHILELO
, DImode
, mode
,
5699 target
, const0_rtx
, limit
));
5704 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
5706 /* BUILDER is a constant predicate in which the index of every set bit
5707 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5708 by inverting every element at a multiple of ELT_SIZE and EORing the
5709 result with an ELT_SIZE PTRUE.
5711 Return a register that contains the constant on success, otherwise
5712 return null. Use TARGET as the register if it is nonnull and
5716 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
5717 unsigned int elt_size
)
5719 /* Invert every element at a multiple of ELT_SIZE, keeping the
5721 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
5722 builder
.nelts_per_pattern ());
5723 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
5724 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
5725 inv_builder
.quick_push (const1_rtx
);
5727 inv_builder
.quick_push (const0_rtx
);
5728 inv_builder
.finalize ();
5730 /* See if we can load the constant cheaply. */
5731 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
5735 /* EOR the result with an ELT_SIZE PTRUE. */
5736 rtx mask
= aarch64_ptrue_all (elt_size
);
5737 mask
= force_reg (VNx16BImode
, mask
);
5738 inv
= gen_lowpart (VNx16BImode
, inv
);
5739 target
= aarch64_target_reg (target
, VNx16BImode
);
5740 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
5744 /* BUILDER is a constant predicate in which the index of every set bit
5745 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5746 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5747 register on success, otherwise return null. Use TARGET as the register
5748 if nonnull and convenient. */
5751 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
5752 unsigned int elt_size
,
5753 unsigned int permute_size
)
5755 /* We're going to split the constant into two new constants A and B,
5756 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5757 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5759 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5760 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5762 where _ indicates elements that will be discarded by the permute.
5764 First calculate the ELT_SIZEs for A and B. */
5765 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
5766 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
5767 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
5768 if (INTVAL (builder
.elt (i
)) != 0)
5770 if (i
& permute_size
)
5771 b_elt_size
|= i
- permute_size
;
5775 a_elt_size
&= -a_elt_size
;
5776 b_elt_size
&= -b_elt_size
;
5778 /* Now construct the vectors themselves. */
5779 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
5780 builder
.nelts_per_pattern ());
5781 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
5782 builder
.nelts_per_pattern ());
5783 unsigned int nelts
= builder
.encoded_nelts ();
5784 for (unsigned int i
= 0; i
< nelts
; ++i
)
5785 if (i
& (elt_size
- 1))
5787 a_builder
.quick_push (const0_rtx
);
5788 b_builder
.quick_push (const0_rtx
);
5790 else if ((i
& permute_size
) == 0)
5792 /* The A and B elements are significant. */
5793 a_builder
.quick_push (builder
.elt (i
));
5794 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
5798 /* The A and B elements are going to be discarded, so pick whatever
5799 is likely to give a nice constant. We are targeting element
5800 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5801 with the aim of each being a sequence of ones followed by
5802 a sequence of zeros. So:
5804 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5805 duplicate the last X_ELT_SIZE element, to extend the
5806 current sequence of ones or zeros.
5808 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5809 zero, so that the constant really does have X_ELT_SIZE and
5810 not a smaller size. */
5811 if (a_elt_size
> permute_size
)
5812 a_builder
.quick_push (const0_rtx
);
5814 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
5815 if (b_elt_size
> permute_size
)
5816 b_builder
.quick_push (const0_rtx
);
5818 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
5820 a_builder
.finalize ();
5821 b_builder
.finalize ();
5823 /* Try loading A into a register. */
5824 rtx_insn
*last
= get_last_insn ();
5825 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
5829 /* Try loading B into a register. */
5831 if (a_builder
!= b_builder
)
5833 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
5836 delete_insns_since (last
);
5841 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5842 operands but permutes them as though they had mode MODE. */
5843 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
5844 target
= aarch64_target_reg (target
, GET_MODE (a
));
5845 rtx type_reg
= CONST0_RTX (mode
);
5846 emit_insn (gen_aarch64_sve_trn1_conv (mode
, target
, a
, b
, type_reg
));
5850 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5851 constant in BUILDER into an SVE predicate register. Return the register
5852 on success, otherwise return null. Use TARGET for the register if
5853 nonnull and convenient.
5855 ALLOW_RECURSE_P is true if we can use methods that would call this
5856 function recursively. */
5859 aarch64_expand_sve_const_pred_1 (rtx target
, rtx_vector_builder
&builder
,
5860 bool allow_recurse_p
)
5862 if (builder
.encoded_nelts () == 1)
5863 /* A PFALSE or a PTRUE .B ALL. */
5864 return aarch64_emit_set_immediate (target
, builder
);
5866 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
5867 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
5869 /* If we can load the constant using PTRUE, use it as-is. */
5870 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
5871 if (aarch64_svpattern_for_vl (mode
, vl
) != AARCH64_NUM_SVPATTERNS
)
5872 return aarch64_emit_set_immediate (target
, builder
);
5874 /* Otherwise use WHILE to set the first VL bits. */
5875 return aarch64_sve_move_pred_via_while (target
, mode
, vl
);
5878 if (!allow_recurse_p
)
5881 /* Try inverting the vector in element size ELT_SIZE and then EORing
5882 the result with an ELT_SIZE PTRUE. */
5883 if (INTVAL (builder
.elt (0)) == 0)
5884 if (rtx res
= aarch64_expand_sve_const_pred_eor (target
, builder
,
5888 /* Try using TRN1 to permute two simpler constants. */
5889 for (unsigned int i
= elt_size
; i
<= 8; i
*= 2)
5890 if (rtx res
= aarch64_expand_sve_const_pred_trn (target
, builder
,
5897 /* Return an SVE predicate register that contains the VNx16BImode
5898 constant in BUILDER, without going through the move expanders.
5900 The returned register can have whatever mode seems most natural
5901 given the contents of BUILDER. Use TARGET for the result if
5905 aarch64_expand_sve_const_pred (rtx target
, rtx_vector_builder
&builder
)
5907 /* Try loading the constant using pure predicate operations. */
5908 if (rtx res
= aarch64_expand_sve_const_pred_1 (target
, builder
, true))
5911 /* Try forcing the constant to memory. */
5912 if (builder
.full_nelts ().is_constant ())
5913 if (rtx mem
= force_const_mem (VNx16BImode
, builder
.build ()))
5915 target
= aarch64_target_reg (target
, VNx16BImode
);
5916 emit_move_insn (target
, mem
);
5920 /* The last resort is to load the constant as an integer and then
5921 compare it against zero. Use -1 for set bits in order to increase
5922 the changes of using SVE DUPM or an Advanced SIMD byte mask. */
5923 rtx_vector_builder
int_builder (VNx16QImode
, builder
.npatterns (),
5924 builder
.nelts_per_pattern ());
5925 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
5926 int_builder
.quick_push (INTVAL (builder
.elt (i
))
5927 ? constm1_rtx
: const0_rtx
);
5928 return aarch64_convert_sve_data_to_pred (target
, VNx16BImode
,
5929 int_builder
.build ());
5932 /* Set DEST to immediate IMM. */
5935 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
5937 machine_mode mode
= GET_MODE (dest
);
5939 /* Check on what type of symbol it is. */
5940 scalar_int_mode int_mode
;
5941 if ((SYMBOL_REF_P (imm
)
5942 || LABEL_REF_P (imm
)
5943 || GET_CODE (imm
) == CONST
5944 || GET_CODE (imm
) == CONST_POLY_INT
)
5945 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
5949 HOST_WIDE_INT const_offset
;
5950 enum aarch64_symbol_type sty
;
5952 /* If we have (const (plus symbol offset)), separate out the offset
5953 before we start classifying the symbol. */
5954 rtx base
= strip_offset (imm
, &offset
);
5956 /* We must always add an offset involving VL separately, rather than
5957 folding it into the relocation. */
5958 if (!offset
.is_constant (&const_offset
))
5962 aarch64_report_sve_required ();
5965 if (base
== const0_rtx
5966 && (aarch64_sve_cnt_immediate_p (offset
)
5967 || aarch64_sve_rdvl_immediate_p (offset
)))
5968 emit_insn (gen_rtx_SET (dest
, imm
));
5971 /* Do arithmetic on 32-bit values if the result is smaller
5973 if (partial_subreg_p (int_mode
, SImode
))
5975 /* It is invalid to do symbol calculations in modes
5976 narrower than SImode. */
5977 gcc_assert (base
== const0_rtx
);
5978 dest
= gen_lowpart (SImode
, dest
);
5981 if (base
!= const0_rtx
)
5983 base
= aarch64_force_temporary (int_mode
, dest
, base
);
5984 aarch64_add_offset (int_mode
, dest
, base
, offset
,
5985 NULL_RTX
, NULL_RTX
, 0, false);
5988 aarch64_add_offset (int_mode
, dest
, base
, offset
,
5989 dest
, NULL_RTX
, 0, false);
5994 if (aarch64_rdsvl_immediate_p (base
))
5996 /* We could handle non-constant offsets if they are ever
5998 gcc_assert (const_offset
== 0);
5999 emit_insn (gen_rtx_SET (dest
, imm
));
6003 sty
= aarch64_classify_symbol (base
, const_offset
);
6006 case SYMBOL_FORCE_TO_MEM
:
6007 if (int_mode
!= ptr_mode
)
6008 imm
= convert_memory_address (ptr_mode
, imm
);
6010 if (const_offset
!= 0
6011 && targetm
.cannot_force_const_mem (ptr_mode
, imm
))
6013 gcc_assert (can_create_pseudo_p ());
6014 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6015 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
6016 NULL_RTX
, NULL_RTX
, 0, false);
6020 mem
= force_const_mem (ptr_mode
, imm
);
6023 /* If we aren't generating PC relative literals, then
6024 we need to expand the literal pool access carefully.
6025 This is something that needs to be done in a number
6026 of places, so could well live as a separate function. */
6027 if (!aarch64_pcrelative_literal_loads
)
6029 gcc_assert (can_create_pseudo_p ());
6030 base
= gen_reg_rtx (ptr_mode
);
6031 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
6032 if (ptr_mode
!= Pmode
)
6033 base
= convert_memory_address (Pmode
, base
);
6034 mem
= gen_rtx_MEM (ptr_mode
, base
);
6037 if (int_mode
!= ptr_mode
)
6038 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
6040 emit_insn (gen_rtx_SET (dest
, mem
));
6044 case SYMBOL_SMALL_TLSGD
:
6045 case SYMBOL_SMALL_TLSDESC
:
6046 case SYMBOL_SMALL_TLSIE
:
6047 case SYMBOL_SMALL_GOT_28K
:
6048 case SYMBOL_SMALL_GOT_4G
:
6049 case SYMBOL_TINY_GOT
:
6050 case SYMBOL_TINY_TLSIE
:
6051 if (const_offset
!= 0)
6053 gcc_assert(can_create_pseudo_p ());
6054 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6055 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
6056 NULL_RTX
, NULL_RTX
, 0, false);
6061 case SYMBOL_SMALL_ABSOLUTE
:
6062 case SYMBOL_TINY_ABSOLUTE
:
6063 case SYMBOL_TLSLE12
:
6064 case SYMBOL_TLSLE24
:
6065 case SYMBOL_TLSLE32
:
6066 case SYMBOL_TLSLE48
:
6067 aarch64_load_symref_appropriately (dest
, imm
, sty
);
6075 if (!CONST_INT_P (imm
))
6077 if (aarch64_sve_pred_mode_p (mode
))
6079 /* Only the low bit of each .H, .S and .D element is defined,
6080 so we can set the upper bits to whatever we like. If the
6081 predicate is all-true in MODE, prefer to set all the undefined
6082 bits as well, so that we can share a single .B predicate for
6084 if (imm
== CONSTM1_RTX (mode
))
6085 imm
= CONSTM1_RTX (VNx16BImode
);
6087 /* All methods for constructing predicate modes wider than VNx16BI
6088 will set the upper bits of each element to zero. Expose this
6089 by moving such constants as a VNx16BI, so that all bits are
6090 significant and so that constants for different modes can be
6091 shared. The wider constant will still be available as a
6093 rtx_vector_builder builder
;
6094 if (aarch64_get_sve_pred_bits (builder
, imm
))
6096 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
6098 emit_move_insn (dest
, gen_lowpart (mode
, res
));
6103 if (GET_CODE (imm
) == HIGH
6104 || aarch64_simd_valid_immediate (imm
, NULL
))
6106 emit_insn (gen_rtx_SET (dest
, imm
));
6110 if (CONST_VECTOR_P (imm
) && aarch64_sve_data_mode_p (mode
))
6111 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
6114 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
6118 rtx mem
= force_const_mem (mode
, imm
);
6120 emit_move_insn (dest
, mem
);
6124 aarch64_internal_mov_immediate (dest
, imm
, true, mode
);
6127 /* Return the MEM rtx that provides the canary value that should be used
6128 for stack-smashing protection. MODE is the mode of the memory.
6129 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6130 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6131 indicates whether the caller is performing a SET or a TEST operation. */
6134 aarch64_stack_protect_canary_mem (machine_mode mode
, rtx decl_rtl
,
6135 aarch64_salt_type salt_type
)
6138 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
6140 gcc_assert (MEM_P (decl_rtl
));
6141 addr
= XEXP (decl_rtl
, 0);
6143 rtx base
= strip_offset_and_salt (addr
, &offset
);
6144 if (!SYMBOL_REF_P (base
))
6147 rtvec v
= gen_rtvec (2, base
, GEN_INT (salt_type
));
6148 addr
= gen_rtx_UNSPEC (Pmode
, v
, UNSPEC_SALT_ADDR
);
6149 addr
= gen_rtx_CONST (Pmode
, addr
);
6150 addr
= plus_constant (Pmode
, addr
, offset
);
6154 /* Calculate the address from the system register. */
6155 rtx salt
= GEN_INT (salt_type
);
6156 addr
= gen_reg_rtx (mode
);
6158 emit_insn (gen_reg_stack_protect_address_di (addr
, salt
));
6161 emit_insn (gen_reg_stack_protect_address_si (addr
, salt
));
6162 addr
= convert_memory_address (Pmode
, addr
);
6164 addr
= plus_constant (Pmode
, addr
, aarch64_stack_protector_guard_offset
);
6166 return gen_rtx_MEM (mode
, force_reg (Pmode
, addr
));
6169 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6170 that is known to contain PTRUE. */
6173 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
6175 expand_operand ops
[3];
6176 machine_mode mode
= GET_MODE (dest
);
6177 create_output_operand (&ops
[0], dest
, mode
);
6178 create_input_operand (&ops
[1], pred
, GET_MODE(pred
));
6179 create_input_operand (&ops
[2], src
, mode
);
6180 temporary_volatile_ok
v (true);
6181 expand_insn (code_for_aarch64_pred_mov (mode
), 3, ops
);
6184 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6185 operand is in memory. In this case we need to use the predicated LD1
6186 and ST1 instead of LDR and STR, both for correctness on big-endian
6187 targets and because LD1 and ST1 support a wider range of addressing modes.
6188 PRED_MODE is the mode of the predicate.
6190 See the comment at the head of aarch64-sve.md for details about the
6191 big-endian handling. */
6194 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
6196 machine_mode mode
= GET_MODE (dest
);
6197 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
6198 if (!register_operand (src
, mode
)
6199 && !register_operand (dest
, mode
))
6201 rtx tmp
= gen_reg_rtx (mode
);
6203 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
6205 emit_move_insn (tmp
, src
);
6208 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
6211 /* Called only on big-endian targets. See whether an SVE vector move
6212 from SRC to DEST is effectively a REV[BHW] instruction, because at
6213 least one operand is a subreg of an SVE vector that has wider or
6214 narrower elements. Return true and emit the instruction if so.
6218 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6220 represents a VIEW_CONVERT between the following vectors, viewed
6223 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6224 R1: { [0], [1], [2], [3], ... }
6226 The high part of lane X in R2 should therefore correspond to lane X*2
6227 of R1, but the register representations are:
6230 R2: ...... [1].high [1].low [0].high [0].low
6231 R1: ...... [3] [2] [1] [0]
6233 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6234 We therefore need a reverse operation to swap the high and low values
6237 This is purely an optimization. Without it we would spill the
6238 subreg operand to the stack in one mode and reload it in the
6239 other mode, which has the same effect as the REV. */
6242 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
6244 gcc_assert (BYTES_BIG_ENDIAN
);
6246 /* Do not try to optimize subregs that LRA has created for matched
6247 reloads. These subregs only exist as a temporary measure to make
6248 the RTL well-formed, but they are exempt from the usual
6249 TARGET_CAN_CHANGE_MODE_CLASS rules.
6251 For example, if we have:
6253 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6255 and the constraints require R1 and R2 to be in the same register,
6256 LRA may need to create RTL such as:
6258 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6259 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6260 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6262 which forces both the input and output of the original instruction
6263 to use the same hard register. But for this to work, the normal
6264 rules have to be suppressed on the subreg input, otherwise LRA
6265 would need to reload that input too, meaning that the process
6266 would never terminate. To compensate for this, the normal rules
6267 are also suppressed for the subreg output of the first move.
6268 Ignoring the special case and handling the first move normally
6269 would therefore generate wrong code: we would reverse the elements
6270 for the first subreg but not reverse them back for the second subreg. */
6271 if (SUBREG_P (dest
) && !LRA_SUBREG_P (dest
))
6272 dest
= SUBREG_REG (dest
);
6273 if (SUBREG_P (src
) && !LRA_SUBREG_P (src
))
6274 src
= SUBREG_REG (src
);
6276 /* The optimization handles two single SVE REGs with different element
6280 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
6281 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
6282 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
6283 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
6286 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6287 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
6288 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
6290 emit_insn (gen_rtx_SET (dest
, unspec
));
6294 /* Return a copy of X with mode MODE, without changing its other
6295 attributes. Unlike gen_lowpart, this doesn't care whether the
6296 mode change is valid. */
6299 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
6301 if (GET_MODE (x
) == mode
)
6304 x
= shallow_copy_rtx (x
);
6305 set_mode_and_regno (x
, mode
, REGNO (x
));
6309 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
6310 stored in wider integer containers. */
6313 aarch64_sve_rev_unspec (machine_mode mode
)
6315 switch (GET_MODE_UNIT_SIZE (mode
))
6317 case 1: return UNSPEC_REVB
;
6318 case 2: return UNSPEC_REVH
;
6319 case 4: return UNSPEC_REVW
;
6324 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6328 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
6330 /* Decide which REV operation we need. The mode with wider elements
6331 determines the mode of the operands and the mode with the narrower
6332 elements determines the reverse width. */
6333 machine_mode mode_with_wider_elts
= aarch64_sve_int_mode (GET_MODE (dest
));
6334 machine_mode mode_with_narrower_elts
= aarch64_sve_int_mode (GET_MODE (src
));
6335 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
6336 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
6337 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
6339 unsigned int unspec
= aarch64_sve_rev_unspec (mode_with_narrower_elts
);
6340 machine_mode pred_mode
= aarch64_sve_pred_mode (mode_with_wider_elts
);
6342 /* Get the operands in the appropriate modes and emit the instruction. */
6343 ptrue
= gen_lowpart (pred_mode
, ptrue
);
6344 dest
= aarch64_replace_reg_mode (dest
, mode_with_wider_elts
);
6345 src
= aarch64_replace_reg_mode (src
, mode_with_wider_elts
);
6346 emit_insn (gen_aarch64_pred (unspec
, mode_with_wider_elts
,
6351 aarch64_function_ok_for_sibcall (tree
, tree exp
)
6353 if (crtl
->abi
->id () != expr_callee_abi (exp
).id ())
6356 tree fntype
= TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp
)));
6357 if (aarch64_fntype_pstate_sm (fntype
) & ~aarch64_cfun_incoming_pstate_sm ())
6359 for (auto state
: { "za", "zt0" })
6360 if (bool (aarch64_cfun_shared_flags (state
))
6361 != bool (aarch64_fntype_shared_flags (fntype
, state
)))
6366 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6367 passed in SVE registers. */
6370 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS
*pcum
,
6371 const function_arg_info
&arg
)
6374 machine_mode dummymode
;
6377 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6378 if (arg
.mode
== BLKmode
&& arg
.type
)
6379 size
= int_size_in_bytes (arg
.type
);
6381 /* No frontends can create types with variable-sized modes, so we
6382 shouldn't be asked to pass or return them. */
6383 size
= GET_MODE_SIZE (arg
.mode
).to_constant ();
6385 /* Aggregates are passed by reference based on their size. */
6386 if (arg
.aggregate_type_p ())
6387 size
= int_size_in_bytes (arg
.type
);
6389 /* Variable sized arguments are always returned by reference. */
6393 /* Can this be a candidate to be passed in fp/simd register(s)? */
6394 if (aarch64_vfp_is_call_or_return_candidate (arg
.mode
, arg
.type
,
6395 &dummymode
, &nregs
, NULL
,
6396 !pcum
|| pcum
->silent_p
))
6399 /* Arguments which are variable sized or larger than 2 registers are
6400 passed by reference unless they are a homogenous floating point
6402 return size
> 2 * UNITS_PER_WORD
;
6405 /* Implement TARGET_PASS_BY_REFERENCE. */
6408 aarch64_pass_by_reference (cumulative_args_t pcum_v
,
6409 const function_arg_info
&arg
)
6411 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
6414 return aarch64_pass_by_reference_1 (pcum
, arg
);
6416 pure_scalable_type_info pst_info
;
6417 switch (pst_info
.analyze (arg
.type
))
6419 case pure_scalable_type_info::IS_PST
:
6420 if (pcum
&& !pcum
->silent_p
&& !TARGET_SVE
)
6421 /* We can't gracefully recover at this point, so make this a
6423 fatal_error (input_location
, "arguments of type %qT require"
6424 " the SVE ISA extension", arg
.type
);
6426 /* Variadic SVE types are passed by reference. Normal non-variadic
6427 arguments are too if we've run out of registers. */
6429 || pcum
->aapcs_nvrn
+ pst_info
.num_zr () > NUM_FP_ARG_REGS
6430 || pcum
->aapcs_nprn
+ pst_info
.num_pr () > NUM_PR_ARG_REGS
);
6432 case pure_scalable_type_info::DOESNT_MATTER
:
6433 gcc_assert (aarch64_pass_by_reference_1 (pcum
, arg
));
6436 case pure_scalable_type_info::NO_ABI_IDENTITY
:
6437 case pure_scalable_type_info::ISNT_PST
:
6438 return aarch64_pass_by_reference_1 (pcum
, arg
);
6443 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6445 aarch64_return_in_msb (const_tree valtype
)
6447 machine_mode dummy_mode
;
6450 /* Never happens in little-endian mode. */
6451 if (!BYTES_BIG_ENDIAN
)
6454 /* Only composite types smaller than or equal to 16 bytes can
6455 be potentially returned in registers. */
6456 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
6457 || int_size_in_bytes (valtype
) <= 0
6458 || int_size_in_bytes (valtype
) > 16)
6461 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6462 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6463 is always passed/returned in the least significant bits of fp/simd
6465 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
6466 &dummy_mode
, &dummy_int
, NULL
,
6470 /* Likewise pure scalable types for SVE vector and predicate registers. */
6471 pure_scalable_type_info pst_info
;
6472 if (pst_info
.analyze_registers (valtype
))
6478 /* Implement TARGET_FUNCTION_VALUE.
6479 Define how to find the value returned by a function. */
6482 aarch64_function_value (const_tree type
, const_tree func
,
6483 bool outgoing ATTRIBUTE_UNUSED
)
6488 mode
= TYPE_MODE (type
);
6489 if (INTEGRAL_TYPE_P (type
))
6490 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
6492 pure_scalable_type_info pst_info
;
6493 if (type
&& pst_info
.analyze_registers (type
))
6494 return pst_info
.get_rtx (mode
, V0_REGNUM
, P0_REGNUM
);
6496 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6497 are returned in memory, not by value. */
6498 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6499 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
6501 if (aarch64_return_in_msb (type
))
6503 HOST_WIDE_INT size
= int_size_in_bytes (type
);
6505 if (size
% UNITS_PER_WORD
!= 0)
6507 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
6508 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
6513 machine_mode ag_mode
;
6514 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &count
,
6517 gcc_assert (!sve_p
);
6518 if (!aarch64_composite_type_p (type
, mode
))
6520 gcc_assert (count
== 1 && mode
== ag_mode
);
6521 return gen_rtx_REG (mode
, V0_REGNUM
);
6523 else if (aarch64_advsimd_full_struct_mode_p (mode
)
6524 && known_eq (GET_MODE_SIZE (ag_mode
), 16))
6525 return gen_rtx_REG (mode
, V0_REGNUM
);
6526 else if (aarch64_advsimd_partial_struct_mode_p (mode
)
6527 && known_eq (GET_MODE_SIZE (ag_mode
), 8))
6528 return gen_rtx_REG (mode
, V0_REGNUM
);
6534 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
6535 for (i
= 0; i
< count
; i
++)
6537 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
6538 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
6539 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
6540 XVECEXP (par
, 0, i
) = tmp
;
6549 /* Vector types can acquire a partial SVE mode using things like
6550 __attribute__((vector_size(N))), and this is potentially useful.
6551 However, the choice of mode doesn't affect the type's ABI
6552 identity, so we should treat the types as though they had
6553 the associated integer mode, just like they did before SVE
6556 We know that the vector must be 128 bits or smaller,
6557 otherwise we'd have returned it in memory instead. */
6559 && (aarch64_some_values_include_pst_objects_p (type
)
6560 || (vec_flags
& VEC_PARTIAL
)));
6562 scalar_int_mode int_mode
= int_mode_for_mode (mode
).require ();
6563 rtx reg
= gen_rtx_REG (int_mode
, R0_REGNUM
);
6564 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
6565 return gen_rtx_PARALLEL (mode
, gen_rtvec (1, pair
));
6567 return gen_rtx_REG (mode
, R0_REGNUM
);
6571 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6572 Return true if REGNO is the number of a hard register in which the values
6573 of called function may come back. */
6576 aarch64_function_value_regno_p (const unsigned int regno
)
6578 /* Maximum of 16 bytes can be returned in the general registers. Examples
6579 of 16-byte return values are: 128-bit integers and 16-byte small
6580 structures (excluding homogeneous floating-point aggregates). */
6581 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
6584 /* Up to four fp/simd registers can return a function value, e.g. a
6585 homogeneous floating-point aggregate having four members. */
6586 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
6587 return TARGET_FLOAT
;
6589 if (regno
>= P0_REGNUM
&& regno
< P0_REGNUM
+ HA_MAX_NUM_FLDS
)
6595 /* Subroutine for aarch64_return_in_memory for types that are not returned
6596 in SVE registers. */
6599 aarch64_return_in_memory_1 (const_tree type
)
6602 machine_mode ag_mode
;
6605 if (!AGGREGATE_TYPE_P (type
)
6606 && TREE_CODE (type
) != BITINT_TYPE
6607 && TREE_CODE (type
) != COMPLEX_TYPE
6608 && TREE_CODE (type
) != VECTOR_TYPE
)
6609 /* Simple scalar types always returned in registers. */
6612 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
6613 &ag_mode
, &count
, NULL
, false))
6616 /* Types larger than 2 registers returned in memory. */
6617 size
= int_size_in_bytes (type
);
6618 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
6621 /* Implement TARGET_RETURN_IN_MEMORY.
6623 If the type T of the result of a function is such that
6625 would require that arg be passed as a value in a register (or set of
6626 registers) according to the parameter passing rules, then the result
6627 is returned in the same registers as would be used for such an
6631 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
6633 pure_scalable_type_info pst_info
;
6634 switch (pst_info
.analyze (type
))
6636 case pure_scalable_type_info::IS_PST
:
6637 return (pst_info
.num_zr () > NUM_FP_ARG_REGS
6638 || pst_info
.num_pr () > NUM_PR_ARG_REGS
);
6640 case pure_scalable_type_info::DOESNT_MATTER
:
6641 gcc_assert (aarch64_return_in_memory_1 (type
));
6644 case pure_scalable_type_info::NO_ABI_IDENTITY
:
6645 case pure_scalable_type_info::ISNT_PST
:
6646 return aarch64_return_in_memory_1 (type
);
6652 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
6653 const_tree type
, int *nregs
)
6655 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
6656 return aarch64_vfp_is_call_or_return_candidate (mode
, type
,
6657 &pcum
->aapcs_vfp_rmode
,
6658 nregs
, NULL
, pcum
->silent_p
);
6661 /* Given MODE and TYPE of a function argument, return the alignment in
6662 bits. The idea is to suppress any stronger alignment requested by
6663 the user and opt for the natural alignment (specified in AAPCS64 \S
6664 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6665 was incorrectly calculated in versions of GCC prior to GCC 9.
6666 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6667 calculated in versions between GCC 9 and GCC 13. If the alignment
6668 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6669 is the old GCC 13 alignment, otherwise it is zero.
6671 This is a helper function for local use only. */
6674 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
6675 unsigned int *abi_break_gcc_9
,
6676 unsigned int *abi_break_gcc_13
,
6677 unsigned int *abi_break_gcc_14
)
6679 *abi_break_gcc_9
= 0;
6680 *abi_break_gcc_13
= 0;
6681 *abi_break_gcc_14
= 0;
6683 return GET_MODE_ALIGNMENT (mode
);
6685 if (integer_zerop (TYPE_SIZE (type
)))
6688 gcc_assert (TYPE_MODE (type
) == mode
);
6690 if (!AGGREGATE_TYPE_P (type
))
6692 /* The ABI alignment is the natural alignment of the type, without
6693 any attributes applied. Normally this is the alignment of the
6694 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6695 For now we just handle the known exceptions explicitly. */
6696 type
= TYPE_MAIN_VARIANT (type
);
6697 if (POINTER_TYPE_P (type
))
6699 gcc_assert (known_eq (POINTER_SIZE
, GET_MODE_BITSIZE (mode
)));
6700 return POINTER_SIZE
;
6702 if (TREE_CODE (type
) == ENUMERAL_TYPE
&& TREE_TYPE (type
))
6704 *abi_break_gcc_14
= TYPE_ALIGN (type
);
6705 type
= TYPE_MAIN_VARIANT (TREE_TYPE (type
));
6707 gcc_assert (!TYPE_USER_ALIGN (type
));
6708 return TYPE_ALIGN (type
);
6711 if (TREE_CODE (type
) == ARRAY_TYPE
)
6712 return TYPE_ALIGN (TREE_TYPE (type
));
6714 unsigned int alignment
= 0;
6715 unsigned int bitfield_alignment_with_packed
= 0;
6716 unsigned int bitfield_alignment
= 0;
6717 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
6718 if (TREE_CODE (field
) == FIELD_DECL
)
6720 /* Note that we explicitly consider zero-sized fields here,
6721 even though they don't map to AAPCS64 machine types.
6724 struct __attribute__((aligned(8))) empty {};
6727 [[no_unique_address]] empty e;
6731 "s" contains only one Fundamental Data Type (the int field)
6732 but gains 8-byte alignment and size thanks to "e". */
6733 alignment
= std::max (alignment
, DECL_ALIGN (field
));
6734 if (DECL_BIT_FIELD_TYPE (field
))
6736 /* Take the bit-field type's alignment into account only
6737 if the user didn't reduce this field's alignment with
6738 the packed attribute. */
6739 if (!DECL_PACKED (field
))
6741 = std::max (bitfield_alignment
,
6742 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
6744 /* Compute the alignment even if the bit-field is
6745 packed, so that we can emit a warning in case the
6746 alignment changed between GCC versions. */
6747 bitfield_alignment_with_packed
6748 = std::max (bitfield_alignment_with_packed
,
6749 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
6753 /* Emit a warning if the alignment is different when taking the
6754 'packed' attribute into account. */
6755 if (bitfield_alignment
!= bitfield_alignment_with_packed
6756 && bitfield_alignment_with_packed
> alignment
)
6757 *abi_break_gcc_13
= bitfield_alignment_with_packed
;
6759 if (bitfield_alignment
> alignment
)
6761 *abi_break_gcc_9
= alignment
;
6762 return bitfield_alignment
;
6768 /* Return true if TYPE describes a _BitInt(N) or an angreggate that uses the
6769 _BitInt(N) type. These include ARRAY_TYPE's with an element that is a
6770 _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
6771 with a field member that is a _BitInt(N) or an aggregate that uses it.
6772 Return false otherwise. */
6775 bitint_or_aggr_of_bitint_p (tree type
)
6780 if (TREE_CODE (type
) == BITINT_TYPE
)
6783 /* If ARRAY_TYPE, check it's element type. */
6784 if (TREE_CODE (type
) == ARRAY_TYPE
)
6785 return bitint_or_aggr_of_bitint_p (TREE_TYPE (type
));
6787 /* If RECORD_TYPE or UNION_TYPE, check the fields' types. */
6788 if (RECORD_OR_UNION_TYPE_P (type
))
6789 for (tree field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
6791 if (TREE_CODE (field
) != FIELD_DECL
)
6793 if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field
)))
6799 /* Layout a function argument according to the AAPCS64 rules. The rule
6800 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6801 mode that was originally given to us by the target hook, whereas the
6802 mode in ARG might be the result of replacing partial SVE modes with
6803 the equivalent integer mode. */
6806 aarch64_layout_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
6808 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
6809 tree type
= arg
.type
;
6810 machine_mode mode
= arg
.mode
;
6811 int ncrn
, nvrn
, nregs
;
6812 bool allocate_ncrn
, allocate_nvrn
;
6814 unsigned int abi_break_gcc_9
;
6815 unsigned int abi_break_gcc_13
;
6816 unsigned int abi_break_gcc_14
;
6818 /* We need to do this once per argument. */
6819 if (pcum
->aapcs_arg_processed
)
6822 bool warn_pcs_change
6825 && (currently_expanding_function_start
6826 || currently_expanding_gimple_stmt
));
6828 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6830 typedef struct foo {
6831 __Int8x16_t foo[2] __attribute__((aligned(32)));
6834 is still a HVA despite its larger-than-normal alignment.
6835 However, such over-aligned HFAs and HVAs are guaranteed to have
6838 If we exclude HFAs and HVAs from the discussion below, then there
6839 are several things to note:
6841 - Both the C and AAPCS64 interpretations of a type's alignment should
6842 give a value that is no greater than the type's size.
6844 - Types bigger than 16 bytes are passed indirectly.
6846 - If an argument of type T is passed indirectly, TYPE and MODE describe
6847 a pointer to T rather than T iself.
6849 It follows that the AAPCS64 alignment of TYPE must be no greater
6852 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6853 and so could calculate an alignment that was too small. If this
6854 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6856 Although GCC 9.1 fixed that bug, it introduced a different one:
6857 it would consider the alignment of a bitfield's underlying type even
6858 if the field was packed (which should have the effect of overriding
6859 the alignment of the underlying type). This was fixed in GCC 13.1.
6861 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6862 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6863 this older, too-big alignment.
6865 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6866 alignments meant they could calculate type alignments that were
6867 bigger than the type's size, contrary to the assumption above.
6868 The handling of register arguments was nevertheless (and justifiably)
6869 written to follow the assumption that the alignment can never be
6870 greater than the size. The same was not true for stack arguments;
6871 their alignment was instead handled by MIN bounds in
6872 aarch64_function_arg_boundary.
6874 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6875 an alignment of more than 16 bytes for TYPE then:
6877 - If the argument was passed in registers, these GCC versions
6878 would treat the alignment as though it was *less than* 16 bytes.
6880 - If the argument was passed on the stack, these GCC versions
6881 would treat the alignment as though it was *equal to* 16 bytes.
6883 Both behaviors were wrong, but in different cases. */
6885 pcum
->aapcs_arg_processed
= true;
6887 pure_scalable_type_info pst_info
;
6888 if (type
&& pst_info
.analyze_registers (type
))
6890 /* aarch64_function_arg_alignment has never had an effect on
6893 /* The PCS says that it is invalid to pass an SVE value to an
6894 unprototyped function. There is no ABI-defined location we
6895 can return in this case, so we have no real choice but to raise
6896 an error immediately, even though this is only a query function. */
6897 if (arg
.named
&& pcum
->pcs_variant
!= ARM_PCS_SVE
)
6899 gcc_assert (!pcum
->silent_p
);
6900 error ("SVE type %qT cannot be passed to an unprototyped function",
6902 /* Avoid repeating the message, and avoid tripping the assert
6904 pcum
->pcs_variant
= ARM_PCS_SVE
;
6907 /* We would have converted the argument into pass-by-reference
6908 form if it didn't fit in registers. */
6909 pcum
->aapcs_nextnvrn
= pcum
->aapcs_nvrn
+ pst_info
.num_zr ();
6910 pcum
->aapcs_nextnprn
= pcum
->aapcs_nprn
+ pst_info
.num_pr ();
6911 gcc_assert (arg
.named
6912 && pcum
->pcs_variant
== ARM_PCS_SVE
6913 && pcum
->aapcs_nextnvrn
<= NUM_FP_ARG_REGS
6914 && pcum
->aapcs_nextnprn
<= NUM_PR_ARG_REGS
);
6915 pcum
->aapcs_reg
= pst_info
.get_rtx (mode
, V0_REGNUM
+ pcum
->aapcs_nvrn
,
6916 P0_REGNUM
+ pcum
->aapcs_nprn
);
6920 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6921 are passed by reference, not by value. */
6922 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6923 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
6925 /* Vector types can acquire a partial SVE mode using things like
6926 __attribute__((vector_size(N))), and this is potentially useful.
6927 However, the choice of mode doesn't affect the type's ABI
6928 identity, so we should treat the types as though they had
6929 the associated integer mode, just like they did before SVE
6932 We know that the vector must be 128 bits or smaller,
6933 otherwise we'd have passed it in memory instead. */
6935 && (aarch64_some_values_include_pst_objects_p (type
)
6936 || (vec_flags
& VEC_PARTIAL
)));
6938 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
6940 size
= int_size_in_bytes (type
);
6942 /* No frontends can create types with variable-sized modes, so we
6943 shouldn't be asked to pass or return them. */
6944 size
= GET_MODE_SIZE (mode
).to_constant ();
6945 size
= ROUND_UP (size
, UNITS_PER_WORD
);
6947 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
6948 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
6952 gcc_assert (!sve_p
|| !allocate_nvrn
);
6954 unsigned int alignment
6955 = aarch64_function_arg_alignment (mode
, type
, &abi_break_gcc_9
,
6956 &abi_break_gcc_13
, &abi_break_gcc_14
);
6958 gcc_assert ((allocate_nvrn
|| alignment
<= 16 * BITS_PER_UNIT
)
6959 && (!alignment
|| abi_break_gcc_9
< alignment
)
6960 && (!abi_break_gcc_13
|| alignment
< abi_break_gcc_13
));
6962 /* _BitInt(N) was only added in GCC 14. */
6963 bool warn_pcs_change_le_gcc14
6964 = warn_pcs_change
&& !bitint_or_aggr_of_bitint_p (type
);
6966 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
6967 The following code thus handles passing by SIMD/FP registers first. */
6969 nvrn
= pcum
->aapcs_nvrn
;
6971 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
6972 and homogenous short-vector aggregates (HVA). */
6975 /* aarch64_function_arg_alignment has never had an effect on
6977 if (!pcum
->silent_p
&& !TARGET_FLOAT
)
6978 aarch64_err_no_fpadvsimd (mode
);
6980 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
6982 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
6983 if (!aarch64_composite_type_p (type
, mode
))
6985 gcc_assert (nregs
== 1);
6986 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
6988 else if (aarch64_advsimd_full_struct_mode_p (mode
)
6989 && known_eq (GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), 16))
6990 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
6991 else if (aarch64_advsimd_partial_struct_mode_p (mode
)
6992 && known_eq (GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), 8))
6993 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
6998 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
6999 for (i
= 0; i
< nregs
; i
++)
7001 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
7002 V0_REGNUM
+ nvrn
+ i
);
7003 rtx offset
= gen_int_mode
7004 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
7005 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
7006 XVECEXP (par
, 0, i
) = tmp
;
7008 pcum
->aapcs_reg
= par
;
7014 /* C.3 NSRN is set to 8. */
7015 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
7020 ncrn
= pcum
->aapcs_ncrn
;
7021 nregs
= size
/ UNITS_PER_WORD
;
7023 /* C6 - C9. though the sign and zero extension semantics are
7024 handled elsewhere. This is the case where the argument fits
7025 entirely general registers. */
7026 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
7028 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
7030 /* C.8 if the argument has an alignment of 16 then the NGRN is
7031 rounded up to the next even number. */
7035 /* Emit a warning if the alignment changed when taking the
7036 'packed' attribute into account. */
7037 if (warn_pcs_change_le_gcc14
7039 && ((abi_break_gcc_13
== 16 * BITS_PER_UNIT
)
7040 != (alignment
== 16 * BITS_PER_UNIT
)))
7041 inform (input_location
, "parameter passing for argument of type "
7042 "%qT changed in GCC 13.1", type
);
7044 if (warn_pcs_change_le_gcc14
7046 && ((abi_break_gcc_14
== 16 * BITS_PER_UNIT
)
7047 != (alignment
== 16 * BITS_PER_UNIT
)))
7048 inform (input_location
, "parameter passing for argument of type "
7049 "%qT changed in GCC 14.1", type
);
7051 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7052 comparison is there because for > 16 * BITS_PER_UNIT
7053 alignment nregs should be > 2 and therefore it should be
7054 passed by reference rather than value. */
7055 if (alignment
== 16 * BITS_PER_UNIT
)
7057 if (warn_pcs_change_le_gcc14
7059 inform (input_location
, "parameter passing for argument of type "
7060 "%qT changed in GCC 9.1", type
);
7062 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
7066 /* If an argument with an SVE mode needs to be shifted up to the
7067 high part of the register, treat it as though it had an integer mode.
7068 Using the normal (parallel [...]) would suppress the shifting. */
7071 && maybe_ne (GET_MODE_SIZE (mode
), nregs
* UNITS_PER_WORD
)
7072 && aarch64_pad_reg_upward (mode
, type
, false))
7074 mode
= int_mode_for_mode (mode
).require ();
7078 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7079 A reg is still generated for it, but the caller should be smart
7080 enough not to use it. */
7082 || (nregs
== 1 && !sve_p
)
7083 || GET_MODE_CLASS (mode
) == MODE_INT
)
7084 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
7090 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
7091 for (i
= 0; i
< nregs
; i
++)
7093 scalar_int_mode reg_mode
= word_mode
;
7095 reg_mode
= int_mode_for_mode (mode
).require ();
7096 rtx tmp
= gen_rtx_REG (reg_mode
, R0_REGNUM
+ ncrn
+ i
);
7097 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
7098 GEN_INT (i
* UNITS_PER_WORD
));
7099 XVECEXP (par
, 0, i
) = tmp
;
7101 pcum
->aapcs_reg
= par
;
7104 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
7109 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
7111 /* The argument is passed on stack; record the needed number of words for
7112 this argument and align the total size if necessary. */
7114 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
7116 if (warn_pcs_change_le_gcc14
7118 && ((abi_break_gcc_13
>= 16 * BITS_PER_UNIT
)
7119 != (alignment
>= 16 * BITS_PER_UNIT
)))
7120 inform (input_location
, "parameter passing for argument of type "
7121 "%qT changed in GCC 13.1", type
);
7123 if (warn_pcs_change_le_gcc14
7125 && ((abi_break_gcc_14
>= 16 * BITS_PER_UNIT
)
7126 != (alignment
>= 16 * BITS_PER_UNIT
)))
7127 inform (input_location
, "parameter passing for argument of type "
7128 "%qT changed in GCC 14.1", type
);
7130 if (alignment
== 16 * BITS_PER_UNIT
)
7132 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
7133 if (pcum
->aapcs_stack_size
!= new_size
)
7135 if (warn_pcs_change_le_gcc14
7137 inform (input_location
, "parameter passing for argument of type "
7138 "%qT changed in GCC 9.1", type
);
7139 pcum
->aapcs_stack_size
= new_size
;
7145 /* Add the current argument register to the set of those that need
7146 to be saved and restored around a change to PSTATE.SM. */
7149 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS
*pcum
)
7151 subrtx_var_iterator::array_type array
;
7152 FOR_EACH_SUBRTX_VAR (iter
, array
, pcum
->aapcs_reg
, NONCONST
)
7155 if (REG_P (x
) && (FP_REGNUM_P (REGNO (x
)) || PR_REGNUM_P (REGNO (x
))))
7157 unsigned int i
= pcum
->num_sme_mode_switch_args
++;
7158 gcc_assert (i
< ARRAY_SIZE (pcum
->sme_mode_switch_args
));
7159 pcum
->sme_mode_switch_args
[i
] = x
;
7164 /* Return a parallel that contains all the registers that need to be
7165 saved around a change to PSTATE.SM. Return const0_rtx if there is
7166 no such mode switch, or if no registers need to be saved. */
7169 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS
*pcum
)
7171 if (!pcum
->num_sme_mode_switch_args
)
7174 auto argvec
= gen_rtvec_v (pcum
->num_sme_mode_switch_args
,
7175 pcum
->sme_mode_switch_args
);
7176 return gen_rtx_PARALLEL (VOIDmode
, argvec
);
7179 /* Implement TARGET_FUNCTION_ARG. */
7182 aarch64_function_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
7184 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
7185 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
7186 || pcum
->pcs_variant
== ARM_PCS_SIMD
7187 || pcum
->pcs_variant
== ARM_PCS_SVE
);
7189 if (arg
.end_marker_p ())
7191 rtx abi_cookie
= aarch64_gen_callee_cookie (pcum
->isa_mode
,
7193 rtx sme_mode_switch_args
= aarch64_finish_sme_mode_switch_args (pcum
);
7194 rtx shared_za_flags
= gen_int_mode (pcum
->shared_za_flags
, SImode
);
7195 rtx shared_zt0_flags
= gen_int_mode (pcum
->shared_zt0_flags
, SImode
);
7196 return gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (4, abi_cookie
,
7197 sme_mode_switch_args
,
7202 aarch64_layout_arg (pcum_v
, arg
);
7203 return pcum
->aapcs_reg
;
7207 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
7209 rtx libname ATTRIBUTE_UNUSED
,
7211 unsigned n_named ATTRIBUTE_UNUSED
,
7214 pcum
->aapcs_ncrn
= 0;
7215 pcum
->aapcs_nvrn
= 0;
7216 pcum
->aapcs_nprn
= 0;
7217 pcum
->aapcs_nextncrn
= 0;
7218 pcum
->aapcs_nextnvrn
= 0;
7219 pcum
->aapcs_nextnprn
= 0;
7222 pcum
->pcs_variant
= (arm_pcs
) fntype_abi (fntype
).id ();
7223 pcum
->isa_mode
= aarch64_fntype_isa_mode (fntype
);
7227 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
7228 pcum
->isa_mode
= AARCH64_FL_DEFAULT_ISA_MODE
;
7230 pcum
->aapcs_reg
= NULL_RTX
;
7231 pcum
->aapcs_arg_processed
= false;
7232 pcum
->aapcs_stack_words
= 0;
7233 pcum
->aapcs_stack_size
= 0;
7234 pcum
->silent_p
= silent_p
;
7235 pcum
->shared_za_flags
7236 = (fntype
? aarch64_fntype_shared_flags (fntype
, "za") : 0U);
7237 pcum
->shared_zt0_flags
7238 = (fntype
? aarch64_fntype_shared_flags (fntype
, "zt0") : 0U);
7239 pcum
->num_sme_mode_switch_args
= 0;
7243 && fntype
&& fntype
!= error_mark_node
)
7245 const_tree type
= TREE_TYPE (fntype
);
7246 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
7247 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
7248 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
7249 &mode
, &nregs
, NULL
, false))
7250 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
7255 && pcum
->pcs_variant
== ARM_PCS_SVE
)
7257 /* We can't gracefully recover at this point, so make this a
7260 fatal_error (input_location
, "%qE requires the SVE ISA extension",
7263 fatal_error (input_location
, "calls to functions of type %qT require"
7264 " the SVE ISA extension", fntype
);
7269 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
7270 const function_arg_info
&arg
)
7272 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
7273 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
7274 || pcum
->pcs_variant
== ARM_PCS_SIMD
7275 || pcum
->pcs_variant
== ARM_PCS_SVE
)
7277 aarch64_layout_arg (pcum_v
, arg
);
7278 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
7279 != (pcum
->aapcs_stack_words
!= 0));
7281 && aarch64_call_switches_pstate_sm (pcum
->isa_mode
))
7282 aarch64_record_sme_mode_switch_args (pcum
);
7284 pcum
->aapcs_arg_processed
= false;
7285 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
7286 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
7287 pcum
->aapcs_nprn
= pcum
->aapcs_nextnprn
;
7288 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
7289 pcum
->aapcs_stack_words
= 0;
7290 pcum
->aapcs_reg
= NULL_RTX
;
7295 aarch64_function_arg_regno_p (unsigned regno
)
7297 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
7298 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
)
7299 || (PR_REGNUM_P (regno
) && regno
< P0_REGNUM
+ NUM_PR_ARG_REGS
));
7302 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7303 PARM_BOUNDARY bits of alignment, but will be given anything up
7304 to STACK_BOUNDARY bits if the type requires it. This makes sure
7305 that both before and after the layout of each argument, the Next
7306 Stacked Argument Address (NSAA) will have a minimum alignment of
7310 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
7312 unsigned int abi_break_gcc_9
;
7313 unsigned int abi_break_gcc_13
;
7314 unsigned int abi_break_gcc_14
;
7315 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
7319 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7320 to emit warnings about ABI incompatibility. */
7321 alignment
= MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
7325 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7327 static fixed_size_mode
7328 aarch64_get_reg_raw_mode (int regno
)
7330 /* Don't use any non GP registers for __builtin_apply and
7331 __builtin_return if general registers only mode is requested. */
7332 if (TARGET_GENERAL_REGS_ONLY
&& !GP_REGNUM_P (regno
))
7333 return as_a
<fixed_size_mode
> (VOIDmode
);
7334 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
7335 /* Don't use the SVE part of the register for __builtin_apply and
7336 __builtin_return. The SVE registers aren't used by the normal PCS,
7337 so using them there would be a waste of time. The PCS extensions
7338 for SVE types are fundamentally incompatible with the
7339 __builtin_return/__builtin_apply interface. */
7340 return as_a
<fixed_size_mode
> (V16QImode
);
7341 if (PR_REGNUM_P (regno
))
7342 /* For SVE PR regs, indicate that they should be ignored for
7343 __builtin_apply/__builtin_return. */
7344 return as_a
<fixed_size_mode
> (VOIDmode
);
7345 return default_get_reg_raw_mode (regno
);
7348 /* Implement TARGET_FUNCTION_ARG_PADDING.
7350 Small aggregate types are placed in the lowest memory address.
7352 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7354 static pad_direction
7355 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
7357 /* On little-endian targets, the least significant byte of every stack
7358 argument is passed at the lowest byte address of the stack slot. */
7359 if (!BYTES_BIG_ENDIAN
)
7362 /* Otherwise, integral, floating-point and pointer types are padded downward:
7363 the least significant byte of a stack argument is passed at the highest
7364 byte address of the stack slot. */
7366 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
7367 || POINTER_TYPE_P (type
))
7368 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
7369 return PAD_DOWNWARD
;
7371 /* Everything else padded upward, i.e. data in first byte of stack slot. */
7375 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7377 It specifies padding for the last (may also be the only)
7378 element of a block move between registers and memory. If
7379 assuming the block is in the memory, padding upward means that
7380 the last element is padded after its highest significant byte,
7381 while in downward padding, the last element is padded at the
7382 its least significant byte side.
7384 Small aggregates and small complex types are always padded
7387 We don't need to worry about homogeneous floating-point or
7388 short-vector aggregates; their move is not affected by the
7389 padding direction determined here. Regardless of endianness,
7390 each element of such an aggregate is put in the least
7391 significant bits of a fp/simd register.
7393 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7394 register has useful data, and return the opposite if the most
7395 significant byte does. */
7398 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
7399 bool first ATTRIBUTE_UNUSED
)
7402 /* Aside from pure scalable types, small composite types are always
7404 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
7408 size
= int_size_in_bytes (type
);
7410 /* No frontends can create types with variable-sized modes, so we
7411 shouldn't be asked to pass or return them. */
7412 size
= GET_MODE_SIZE (mode
).to_constant ();
7413 if (size
< 2 * UNITS_PER_WORD
)
7415 pure_scalable_type_info pst_info
;
7416 if (pst_info
.analyze_registers (type
))
7422 /* Otherwise, use the default padding. */
7423 return !BYTES_BIG_ENDIAN
;
7426 static scalar_int_mode
7427 aarch64_libgcc_cmp_return_mode (void)
7432 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7434 /* We use the 12-bit shifted immediate arithmetic instructions so values
7435 must be multiple of (1 << 12), i.e. 4096. */
7436 #define ARITH_FACTOR 4096
7438 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7439 #error Cannot use simple address calculation for stack probing
7442 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7443 inclusive. These are offsets from the current stack pointer. */
7446 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
7449 if (!poly_size
.is_constant (&size
))
7451 sorry ("stack probes for SVE frames");
7455 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REGNUM
);
7457 /* See the same assertion on PROBE_INTERVAL above. */
7458 gcc_assert ((first
% ARITH_FACTOR
) == 0);
7460 /* See if we have a constant small number of probes to generate. If so,
7461 that's the easy case. */
7462 if (size
<= PROBE_INTERVAL
)
7464 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
7466 emit_set_insn (reg1
,
7467 plus_constant (Pmode
,
7468 stack_pointer_rtx
, -(first
+ base
)));
7469 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
7472 /* The run-time loop is made up of 8 insns in the generic case while the
7473 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
7474 else if (size
<= 4 * PROBE_INTERVAL
)
7476 HOST_WIDE_INT i
, rem
;
7478 emit_set_insn (reg1
,
7479 plus_constant (Pmode
,
7481 -(first
+ PROBE_INTERVAL
)));
7482 emit_stack_probe (reg1
);
7484 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7485 it exceeds SIZE. If only two probes are needed, this will not
7486 generate any code. Then probe at FIRST + SIZE. */
7487 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
7489 emit_set_insn (reg1
,
7490 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
7491 emit_stack_probe (reg1
);
7494 rem
= size
- (i
- PROBE_INTERVAL
);
7497 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
7499 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
7500 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
7503 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
7506 /* Otherwise, do the same as above, but in a loop. Note that we must be
7507 extra careful with variables wrapping around because we might be at
7508 the very top (or the very bottom) of the address space and we have
7509 to be able to handle this case properly; in particular, we use an
7510 equality test for the loop condition. */
7513 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REGNUM
);
7515 /* Step 1: round SIZE to the previous multiple of the interval. */
7517 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
7520 /* Step 2: compute initial and final value of the loop counter. */
7522 /* TEST_ADDR = SP + FIRST. */
7523 emit_set_insn (reg1
,
7524 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
7526 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7527 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
7528 if (! aarch64_uimm12_shift (adjustment
))
7530 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
7532 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
7535 emit_set_insn (reg2
,
7536 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
7542 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7545 while (TEST_ADDR != LAST_ADDR)
7547 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7548 until it is equal to ROUNDED_SIZE. */
7550 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
7553 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7554 that SIZE is equal to ROUNDED_SIZE. */
7556 if (size
!= rounded_size
)
7558 HOST_WIDE_INT rem
= size
- rounded_size
;
7562 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
7564 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
7565 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
7568 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
7572 /* Make sure nothing is scheduled before we are done. */
7573 emit_insn (gen_blockage ());
7576 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7577 absolute addresses. */
7580 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
7582 static int labelno
= 0;
7586 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
7589 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
7591 HOST_WIDE_INT stack_clash_probe_interval
7592 = 1 << param_stack_clash_protection_guard_size
;
7594 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7596 HOST_WIDE_INT interval
;
7597 if (flag_stack_clash_protection
)
7598 interval
= stack_clash_probe_interval
;
7600 interval
= PROBE_INTERVAL
;
7602 gcc_assert (aarch64_uimm12_shift (interval
));
7603 xops
[1] = GEN_INT (interval
);
7605 output_asm_insn ("sub\t%0, %0, %1", xops
);
7607 /* If doing stack clash protection then we probe up by the ABI specified
7608 amount. We do this because we're dropping full pages at a time in the
7609 loop. But if we're doing non-stack clash probing, probe at SP 0. */
7610 if (flag_stack_clash_protection
)
7611 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
7613 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
7615 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7616 by this amount for each iteration. */
7617 output_asm_insn ("str\txzr, [%0, %1]", xops
);
7619 /* Test if TEST_ADDR == LAST_ADDR. */
7621 output_asm_insn ("cmp\t%0, %1", xops
);
7624 fputs ("\tb.ne\t", asm_out_file
);
7625 assemble_name_raw (asm_out_file
, loop_lab
);
7626 fputc ('\n', asm_out_file
);
7631 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7632 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7633 of GUARD_SIZE. When a probe is emitted it is done at most
7634 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7635 at most MIN_PROBE_THRESHOLD. By the end of this function
7636 BASE = BASE - ADJUSTMENT. */
7639 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
7640 rtx min_probe_threshold
, rtx guard_size
)
7642 /* This function is not allowed to use any instruction generation function
7643 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7644 so instead emit the code you want using output_asm_insn. */
7645 gcc_assert (flag_stack_clash_protection
);
7646 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
7647 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
7649 /* The minimum required allocation before the residual requires probing. */
7650 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
7652 /* Clamp the value down to the nearest value that can be used with a cmp. */
7653 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
7654 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
7656 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
7657 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
7659 static int labelno
= 0;
7660 char loop_start_lab
[32];
7661 char loop_end_lab
[32];
7664 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
7665 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
7667 /* Emit loop start label. */
7668 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
7670 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7671 xops
[0] = adjustment
;
7672 xops
[1] = probe_offset_value_rtx
;
7673 output_asm_insn ("cmp\t%0, %1", xops
);
7675 /* Branch to end if not enough adjustment to probe. */
7676 fputs ("\tb.lt\t", asm_out_file
);
7677 assemble_name_raw (asm_out_file
, loop_end_lab
);
7678 fputc ('\n', asm_out_file
);
7680 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7682 xops
[1] = probe_offset_value_rtx
;
7683 output_asm_insn ("sub\t%0, %0, %1", xops
);
7685 /* Probe at BASE. */
7686 xops
[1] = const0_rtx
;
7687 output_asm_insn ("str\txzr, [%0, %1]", xops
);
7689 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7690 xops
[0] = adjustment
;
7691 xops
[1] = probe_offset_value_rtx
;
7692 output_asm_insn ("sub\t%0, %0, %1", xops
);
7694 /* Branch to start if still more bytes to allocate. */
7695 fputs ("\tb\t", asm_out_file
);
7696 assemble_name_raw (asm_out_file
, loop_start_lab
);
7697 fputc ('\n', asm_out_file
);
7699 /* No probe leave. */
7700 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
7702 /* BASE = BASE - ADJUSTMENT. */
7704 xops
[1] = adjustment
;
7705 output_asm_insn ("sub\t%0, %0, %1", xops
);
7709 /* Determine whether a frame chain needs to be generated. */
7711 aarch64_needs_frame_chain (void)
7713 if (frame_pointer_needed
)
7716 /* A leaf function cannot have calls or write LR. */
7717 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
7719 /* Don't use a frame chain in leaf functions if leaf frame pointers
7721 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
7724 return aarch64_use_frame_pointer
;
7727 /* Return true if the current function should save registers above
7728 the locals area, rather than below it. */
7731 aarch64_save_regs_above_locals_p ()
7733 /* When using stack smash protection, make sure that the canary slot
7734 comes between the locals and the saved registers. Otherwise,
7735 it would be possible for a carefully sized smash attack to change
7736 the saved registers (particularly LR and FP) without reaching the
7738 return crtl
->stack_protect_guard
;
7741 /* Return true if the current function needs to record the incoming
7742 value of PSTATE.SM. */
7744 aarch64_need_old_pstate_sm ()
7746 /* Exit early if the incoming value of PSTATE.SM is known at
7748 if (aarch64_cfun_incoming_pstate_sm () != 0)
7751 if (aarch64_cfun_enables_pstate_sm ())
7754 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
7755 but the function needs to return with PSTATE.SM unchanged. */
7756 if (nonlocal_goto_handler_labels
)
7759 /* Likewise for exception handlers. */
7761 for (unsigned int i
= 1; vec_safe_iterate (cfun
->eh
->lp_array
, i
, &lp
); ++i
)
7762 if (lp
&& lp
->post_landing_pad
)
7765 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
7766 streaming-compatible functions without SME being available, so PSTATE.SM
7767 should only be changed if it is currently set to one. */
7768 if (crtl
->has_nonlocal_goto
)
7771 if (cfun
->machine
->call_switches_pstate_sm
)
7772 for (auto insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
7773 if (auto *call
= dyn_cast
<rtx_call_insn
*> (insn
))
7774 if (!SIBLING_CALL_P (call
))
7776 /* Return true if there is a call to a non-streaming-compatible
7778 auto callee_isa_mode
= aarch64_insn_callee_isa_mode (call
);
7779 if (aarch64_call_switches_pstate_sm (callee_isa_mode
))
7785 /* Mark the registers that need to be saved by the callee and calculate
7786 the size of the callee-saved registers area and frame record (both FP
7787 and LR may be omitted). */
7789 aarch64_layout_frame (void)
7791 unsigned regno
, last_fp_reg
= INVALID_REGNUM
;
7792 machine_mode vector_save_mode
= aarch64_reg_save_mode (V8_REGNUM
);
7793 poly_int64 vector_save_size
= GET_MODE_SIZE (vector_save_mode
);
7794 bool frame_related_fp_reg_p
= false;
7795 aarch64_frame
&frame
= cfun
->machine
->frame
;
7796 poly_int64 top_of_locals
= -1;
7797 bool enables_pstate_sm
= aarch64_cfun_enables_pstate_sm ();
7799 vec_safe_truncate (frame
.saved_gprs
, 0);
7800 vec_safe_truncate (frame
.saved_fprs
, 0);
7801 vec_safe_truncate (frame
.saved_prs
, 0);
7803 frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
7805 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7806 the mid-end is doing. */
7807 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
7809 #define SLOT_NOT_REQUIRED (-2)
7810 #define SLOT_REQUIRED (-1)
7812 frame
.wb_push_candidate1
= INVALID_REGNUM
;
7813 frame
.wb_push_candidate2
= INVALID_REGNUM
;
7814 frame
.spare_pred_reg
= INVALID_REGNUM
;
7816 /* First mark all the registers that really need to be saved... */
7817 for (regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
7818 frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
7819 frame
.old_svcr_offset
= SLOT_NOT_REQUIRED
;
7821 /* ... that includes the eh data registers (if needed)... */
7822 if (crtl
->calls_eh_return
)
7823 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
7824 frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)] = SLOT_REQUIRED
;
7826 /* ... and any callee saved register that dataflow says is live. */
7827 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
7828 if (df_regs_ever_live_p (regno
)
7829 && !fixed_regs
[regno
]
7830 && (regno
== R30_REGNUM
7831 || !crtl
->abi
->clobbers_full_reg_p (regno
)))
7832 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7834 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7835 if ((enables_pstate_sm
|| df_regs_ever_live_p (regno
))
7836 && !fixed_regs
[regno
]
7837 && !crtl
->abi
->clobbers_full_reg_p (regno
))
7839 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7840 last_fp_reg
= regno
;
7841 if (aarch64_emit_cfi_for_reg_p (regno
))
7842 frame_related_fp_reg_p
= true;
7845 /* Big-endian SVE frames need a spare predicate register in order
7846 to save Z8-Z15. Decide which register they should use. Prefer
7847 an unused argument register if possible, so that we don't force P4
7848 to be saved unnecessarily. */
7849 if (frame_related_fp_reg_p
7850 && crtl
->abi
->id () == ARM_PCS_SVE
7851 && BYTES_BIG_ENDIAN
)
7853 bitmap live1
= df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
7854 bitmap live2
= df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun
));
7855 for (regno
= P0_REGNUM
; regno
<= P7_REGNUM
; regno
++)
7856 if (!bitmap_bit_p (live1
, regno
) && !bitmap_bit_p (live2
, regno
))
7858 gcc_assert (regno
<= P7_REGNUM
);
7859 frame
.spare_pred_reg
= regno
;
7860 df_set_regs_ever_live (regno
, true);
7863 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
7864 if ((enables_pstate_sm
|| df_regs_ever_live_p (regno
))
7865 && !fixed_regs
[regno
]
7866 && !crtl
->abi
->clobbers_full_reg_p (regno
))
7867 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7869 bool regs_at_top_p
= aarch64_save_regs_above_locals_p ();
7871 poly_int64 offset
= crtl
->outgoing_args_size
;
7872 gcc_assert (multiple_p (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
));
7875 offset
+= get_frame_size ();
7876 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7877 top_of_locals
= offset
;
7879 frame
.bytes_below_saved_regs
= offset
;
7880 frame
.sve_save_and_probe
= INVALID_REGNUM
;
7882 /* Now assign stack slots for the registers. Start with the predicate
7883 registers, since predicate LDR and STR have a relatively small
7884 offset range. These saves happen below the hard frame pointer. */
7885 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
7886 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7888 vec_safe_push (frame
.saved_prs
, regno
);
7889 if (frame
.sve_save_and_probe
== INVALID_REGNUM
)
7890 frame
.sve_save_and_probe
= regno
;
7891 frame
.reg_offset
[regno
] = offset
;
7892 offset
+= BYTES_PER_SVE_PRED
;
7895 poly_int64 saved_prs_size
= offset
- frame
.bytes_below_saved_regs
;
7896 if (maybe_ne (saved_prs_size
, 0))
7898 /* If we have any vector registers to save above the predicate registers,
7899 the offset of the vector register save slots need to be a multiple
7900 of the vector size. This lets us use the immediate forms of LDR/STR
7901 (or LD1/ST1 for big-endian).
7903 A vector register is 8 times the size of a predicate register,
7904 and we need to save a maximum of 12 predicate registers, so the
7905 first vector register will be at either #1, MUL VL or #2, MUL VL.
7907 If we don't have any vector registers to save, and we know how
7908 big the predicate save area is, we can just round it up to the
7909 next 16-byte boundary. */
7910 if (last_fp_reg
== INVALID_REGNUM
&& offset
.is_constant ())
7911 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7914 if (known_le (saved_prs_size
, vector_save_size
))
7915 offset
= frame
.bytes_below_saved_regs
+ vector_save_size
;
7916 else if (known_le (saved_prs_size
, vector_save_size
* 2))
7917 offset
= frame
.bytes_below_saved_regs
+ vector_save_size
* 2;
7923 /* If we need to save any SVE vector registers, add them next. */
7924 if (last_fp_reg
!= INVALID_REGNUM
&& crtl
->abi
->id () == ARM_PCS_SVE
)
7925 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7926 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7928 vec_safe_push (frame
.saved_fprs
, regno
);
7929 if (frame
.sve_save_and_probe
== INVALID_REGNUM
)
7930 frame
.sve_save_and_probe
= regno
;
7931 frame
.reg_offset
[regno
] = offset
;
7932 offset
+= vector_save_size
;
7935 /* OFFSET is now the offset of the hard frame pointer from the bottom
7936 of the callee save area. */
7937 auto below_hard_fp_saved_regs_size
= offset
- frame
.bytes_below_saved_regs
;
7938 bool saves_below_hard_fp_p
= maybe_ne (below_hard_fp_saved_regs_size
, 0);
7939 gcc_assert (!saves_below_hard_fp_p
7940 || (frame
.sve_save_and_probe
!= INVALID_REGNUM
7941 && known_eq (frame
.reg_offset
[frame
.sve_save_and_probe
],
7942 frame
.bytes_below_saved_regs
)));
7944 frame
.bytes_below_hard_fp
= offset
;
7945 frame
.hard_fp_save_and_probe
= INVALID_REGNUM
;
7947 auto allocate_gpr_slot
= [&](unsigned int regno
)
7949 vec_safe_push (frame
.saved_gprs
, regno
);
7950 frame
.reg_offset
[regno
] = offset
;
7951 offset
+= UNITS_PER_WORD
;
7954 if (frame
.emit_frame_chain
)
7956 /* FP and LR are placed in the linkage record. */
7957 allocate_gpr_slot (R29_REGNUM
);
7958 allocate_gpr_slot (R30_REGNUM
);
7960 else if ((flag_stack_clash_protection
|| !frame
.is_scs_enabled
)
7961 && known_eq (frame
.reg_offset
[R30_REGNUM
], SLOT_REQUIRED
))
7962 /* Put the LR save slot first, since it makes a good choice of probe
7963 for stack clash purposes. The idea is that the link register usually
7964 has to be saved before a call anyway, and so we lose little by
7965 stopping it from being individually shrink-wrapped. */
7966 allocate_gpr_slot (R30_REGNUM
);
7968 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
7969 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7970 allocate_gpr_slot (regno
);
7972 if (aarch64_need_old_pstate_sm ())
7974 frame
.old_svcr_offset
= offset
;
7975 offset
+= UNITS_PER_WORD
;
7978 /* If the current function changes the SVE vector length, ensure that the
7979 old value of the DWARF VG register is saved and available in the CFI,
7980 so that outer frames with VL-sized offsets can be processed correctly. */
7981 if (cfun
->machine
->call_switches_pstate_sm
7982 || aarch64_cfun_enables_pstate_sm ())
7984 frame
.reg_offset
[VG_REGNUM
] = offset
;
7985 offset
+= UNITS_PER_WORD
;
7988 poly_int64 max_int_offset
= offset
;
7989 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7990 bool has_align_gap
= maybe_ne (offset
, max_int_offset
);
7992 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7993 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7995 vec_safe_push (frame
.saved_fprs
, regno
);
7996 /* If there is an alignment gap between integer and fp callee-saves,
7997 allocate the last fp register to it if possible. */
7998 if (regno
== last_fp_reg
8000 && known_eq (vector_save_size
, 8)
8001 && multiple_p (offset
, 16))
8003 frame
.reg_offset
[regno
] = max_int_offset
;
8007 frame
.reg_offset
[regno
] = offset
;
8008 offset
+= vector_save_size
;
8011 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
8012 auto saved_regs_size
= offset
- frame
.bytes_below_saved_regs
;
8014 array_slice
<unsigned int> push_regs
= (!vec_safe_is_empty (frame
.saved_gprs
)
8016 : frame
.saved_fprs
);
8017 if (!push_regs
.empty ()
8018 && known_eq (frame
.reg_offset
[push_regs
[0]], frame
.bytes_below_hard_fp
))
8020 frame
.hard_fp_save_and_probe
= push_regs
[0];
8021 frame
.wb_push_candidate1
= push_regs
[0];
8022 if (push_regs
.size () > 1)
8023 frame
.wb_push_candidate2
= push_regs
[1];
8026 /* With stack-clash, a register must be saved in non-leaf functions.
8027 The saving of the bottommost register counts as an implicit probe,
8028 which allows us to maintain the invariant described in the comment
8029 at expand_prologue. */
8030 gcc_assert (crtl
->is_leaf
|| maybe_ne (saved_regs_size
, 0));
8034 offset
+= get_frame_size ();
8035 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
8036 top_of_locals
= offset
;
8038 offset
+= frame
.saved_varargs_size
;
8039 gcc_assert (multiple_p (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
));
8040 frame
.frame_size
= offset
;
8042 frame
.bytes_above_hard_fp
= frame
.frame_size
- frame
.bytes_below_hard_fp
;
8043 gcc_assert (known_ge (top_of_locals
, 0));
8044 frame
.bytes_above_locals
= frame
.frame_size
- top_of_locals
;
8046 frame
.initial_adjust
= 0;
8047 frame
.final_adjust
= 0;
8048 frame
.callee_adjust
= 0;
8049 frame
.sve_callee_adjust
= 0;
8051 frame
.wb_pop_candidate1
= frame
.wb_push_candidate1
;
8052 frame
.wb_pop_candidate2
= frame
.wb_push_candidate2
;
8054 /* Shadow call stack only deals with functions where the LR is pushed
8055 onto the stack and without specifying the "no_sanitize" attribute
8056 with the argument "shadow-call-stack". */
8057 frame
.is_scs_enabled
8058 = (!crtl
->calls_eh_return
8059 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK
)
8060 && known_ge (frame
.reg_offset
[LR_REGNUM
], 0));
8062 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8063 restore x30, and we don't need to pop x30 again in the traditional
8064 way. Pop candidates record the registers that need to be popped
8066 if (frame
.is_scs_enabled
)
8068 if (frame
.wb_pop_candidate2
== R30_REGNUM
)
8069 frame
.wb_pop_candidate2
= INVALID_REGNUM
;
8070 else if (frame
.wb_pop_candidate1
== R30_REGNUM
)
8071 frame
.wb_pop_candidate1
= INVALID_REGNUM
;
8074 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8075 256 to ensure that the offset meets the requirements of emit_move_insn.
8076 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8077 max_push_offset to 0, because no registers are popped at this time,
8078 so callee_adjust cannot be adjusted. */
8079 HOST_WIDE_INT max_push_offset
= 0;
8080 if (frame
.wb_pop_candidate1
!= INVALID_REGNUM
)
8082 if (frame
.wb_pop_candidate2
!= INVALID_REGNUM
)
8083 max_push_offset
= 512;
8085 max_push_offset
= 256;
8088 HOST_WIDE_INT const_size
, const_below_saved_regs
, const_above_fp
;
8089 HOST_WIDE_INT const_saved_regs_size
;
8090 if (known_eq (saved_regs_size
, 0))
8091 frame
.initial_adjust
= frame
.frame_size
;
8092 else if (frame
.frame_size
.is_constant (&const_size
)
8093 && const_size
< max_push_offset
8094 && known_eq (frame
.bytes_above_hard_fp
, const_size
))
8096 /* Simple, small frame with no data below the saved registers.
8098 stp reg1, reg2, [sp, -frame_size]!
8099 stp reg3, reg4, [sp, 16] */
8100 frame
.callee_adjust
= const_size
;
8102 else if (frame
.bytes_below_saved_regs
.is_constant (&const_below_saved_regs
)
8103 && saved_regs_size
.is_constant (&const_saved_regs_size
)
8104 && const_below_saved_regs
+ const_saved_regs_size
< 512
8105 /* We could handle this case even with data below the saved
8106 registers, provided that that data left us with valid offsets
8107 for all predicate and vector save slots. It's such a rare
8108 case that it hardly seems worth the effort though. */
8109 && (!saves_below_hard_fp_p
|| const_below_saved_regs
== 0)
8110 && !(cfun
->calls_alloca
8111 && frame
.bytes_above_hard_fp
.is_constant (&const_above_fp
)
8112 && const_above_fp
< max_push_offset
))
8114 /* Frame with small area below the saved registers:
8116 sub sp, sp, frame_size
8117 stp reg1, reg2, [sp, bytes_below_saved_regs]
8118 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8119 frame
.initial_adjust
= frame
.frame_size
;
8121 else if (saves_below_hard_fp_p
8122 && known_eq (saved_regs_size
, below_hard_fp_saved_regs_size
))
8124 /* Frame in which all saves are SVE saves:
8126 sub sp, sp, frame_size - bytes_below_saved_regs
8127 save SVE registers relative to SP
8128 sub sp, sp, bytes_below_saved_regs */
8129 frame
.initial_adjust
= frame
.frame_size
- frame
.bytes_below_saved_regs
;
8130 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8132 else if (frame
.wb_push_candidate1
!= INVALID_REGNUM
8133 && frame
.bytes_above_hard_fp
.is_constant (&const_above_fp
)
8134 && const_above_fp
< max_push_offset
)
8136 /* Frame with large area below the saved registers, or with SVE saves,
8137 but with a small area above:
8139 stp reg1, reg2, [sp, -hard_fp_offset]!
8140 stp reg3, reg4, [sp, 16]
8141 [sub sp, sp, below_hard_fp_saved_regs_size]
8142 [save SVE registers relative to SP]
8143 sub sp, sp, bytes_below_saved_regs */
8144 frame
.callee_adjust
= const_above_fp
;
8145 frame
.sve_callee_adjust
= below_hard_fp_saved_regs_size
;
8146 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8152 sub sp, sp, hard_fp_offset
8153 stp x29, x30, [sp, 0]
8155 stp reg3, reg4, [sp, 16]
8156 [sub sp, sp, below_hard_fp_saved_regs_size]
8157 [save SVE registers relative to SP]
8158 sub sp, sp, bytes_below_saved_regs */
8159 frame
.initial_adjust
= frame
.bytes_above_hard_fp
;
8160 frame
.sve_callee_adjust
= below_hard_fp_saved_regs_size
;
8161 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8164 /* The frame is allocated in pieces, with each non-final piece
8165 including a register save at offset 0 that acts as a probe for
8166 the following piece. In addition, the save of the bottommost register
8167 acts as a probe for callees and allocas. Roll back any probes that
8170 A probe isn't needed if it is associated with the final allocation
8171 (including callees and allocas) that happens before the epilogue is
8174 && !cfun
->calls_alloca
8175 && known_eq (frame
.final_adjust
, 0))
8177 if (maybe_ne (frame
.sve_callee_adjust
, 0))
8178 frame
.sve_save_and_probe
= INVALID_REGNUM
;
8180 frame
.hard_fp_save_and_probe
= INVALID_REGNUM
;
8183 /* Make sure the individual adjustments add up to the full frame size. */
8184 gcc_assert (known_eq (frame
.initial_adjust
8185 + frame
.callee_adjust
8186 + frame
.sve_callee_adjust
8187 + frame
.final_adjust
, frame
.frame_size
));
8189 if (frame
.callee_adjust
== 0)
8191 /* We've decided not to do a "real" push and pop. However,
8192 setting up the frame chain is treated as being essentially
8193 a multi-instruction push. */
8194 frame
.wb_pop_candidate1
= frame
.wb_pop_candidate2
= INVALID_REGNUM
;
8195 if (!frame
.emit_frame_chain
)
8196 frame
.wb_push_candidate1
= frame
.wb_push_candidate2
= INVALID_REGNUM
;
8199 frame
.laid_out
= true;
8202 /* Return true if the register REGNO is saved on entry to
8203 the current function. */
8206 aarch64_register_saved_on_entry (int regno
)
8208 return known_ge (cfun
->machine
->frame
.reg_offset
[regno
], 0);
8211 /* Push the register number REGNO of mode MODE to the stack with write-back
8212 adjusting the stack by ADJUSTMENT. */
8215 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
8216 HOST_WIDE_INT adjustment
)
8218 rtx base_rtx
= stack_pointer_rtx
;
8221 reg
= gen_rtx_REG (mode
, regno
);
8222 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
8223 plus_constant (Pmode
, base_rtx
, -adjustment
));
8224 mem
= gen_frame_mem (mode
, mem
);
8226 insn
= emit_move_insn (mem
, reg
);
8227 RTX_FRAME_RELATED_P (insn
) = 1;
8230 /* Generate and return an instruction to store the pair of registers
8231 REG and REG2 of mode MODE to location BASE with write-back adjusting
8232 the stack location BASE by ADJUSTMENT. */
8235 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
8236 HOST_WIDE_INT adjustment
)
8238 rtx new_base
= plus_constant (Pmode
, base
, -adjustment
);
8239 rtx mem
= gen_frame_mem (mode
, new_base
);
8240 rtx mem2
= adjust_address_nv (mem
, mode
, GET_MODE_SIZE (mode
));
8242 return gen_rtx_PARALLEL (VOIDmode
,
8244 gen_rtx_SET (base
, new_base
),
8245 gen_rtx_SET (mem
, reg
),
8246 gen_rtx_SET (mem2
, reg2
)));
8249 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8250 stack pointer by ADJUSTMENT. */
8253 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
8256 machine_mode mode
= aarch64_reg_save_mode (regno1
);
8258 if (regno2
== INVALID_REGNUM
)
8259 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
8261 rtx reg1
= gen_rtx_REG (mode
, regno1
);
8262 rtx reg2
= gen_rtx_REG (mode
, regno2
);
8264 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
8266 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
8267 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
8268 RTX_FRAME_RELATED_P (insn
) = 1;
8271 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
8272 adjusting it by ADJUSTMENT afterwards. */
8275 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
8276 HOST_WIDE_INT adjustment
)
8278 rtx mem
= gen_frame_mem (mode
, base
);
8279 rtx mem2
= adjust_address_nv (mem
, mode
, GET_MODE_SIZE (mode
));
8280 rtx new_base
= plus_constant (Pmode
, base
, adjustment
);
8282 return gen_rtx_PARALLEL (VOIDmode
,
8284 gen_rtx_SET (base
, new_base
),
8285 gen_rtx_SET (reg
, mem
),
8286 gen_rtx_SET (reg2
, mem2
)));
8289 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8290 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8294 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
8297 machine_mode mode
= aarch64_reg_save_mode (regno1
);
8298 rtx reg1
= gen_rtx_REG (mode
, regno1
);
8300 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
8302 if (regno2
== INVALID_REGNUM
)
8304 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
8305 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
8306 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
8310 rtx reg2
= gen_rtx_REG (mode
, regno2
);
8311 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
8312 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
8317 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8318 for a mem rtx representing the entire pair. */
8321 aarch64_pair_mode_for_mode (machine_mode mode
)
8323 if (known_eq (GET_MODE_SIZE (mode
), 4))
8325 else if (known_eq (GET_MODE_SIZE (mode
), 8))
8327 else if (known_eq (GET_MODE_SIZE (mode
), 16))
8333 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8334 operand, return an rtx like MEM which instead represents the entire pair. */
8337 aarch64_pair_mem_from_base (rtx mem
)
8339 auto pair_mode
= aarch64_pair_mode_for_mode (GET_MODE (mem
));
8340 mem
= adjust_bitfield_address_nv (mem
, pair_mode
, 0);
8341 gcc_assert (aarch64_mem_pair_lanes_operand (mem
, pair_mode
));
8345 /* Generate and return a store pair instruction to store REG1 and REG2
8346 into memory starting at BASE_MEM. All three rtxes should have modes of the
8350 aarch64_gen_store_pair (rtx base_mem
, rtx reg1
, rtx reg2
)
8352 rtx pair_mem
= aarch64_pair_mem_from_base (base_mem
);
8354 return gen_rtx_SET (pair_mem
,
8355 gen_rtx_UNSPEC (GET_MODE (pair_mem
),
8356 gen_rtvec (2, reg1
, reg2
),
8360 /* Generate and return a load pair instruction to load a pair of
8361 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8362 UNKNOWN, all three rtxes should have modes of the same size.
8363 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8364 and REG{1,2} should be in DImode. */
8367 aarch64_gen_load_pair (rtx reg1
, rtx reg2
, rtx base_mem
, enum rtx_code code
)
8369 rtx pair_mem
= aarch64_pair_mem_from_base (base_mem
);
8371 const bool any_extend_p
= (code
== ZERO_EXTEND
|| code
== SIGN_EXTEND
);
8373 gcc_checking_assert (GET_MODE (base_mem
) == SImode
8374 && GET_MODE (reg1
) == DImode
8375 && GET_MODE (reg2
) == DImode
);
8377 gcc_assert (code
== UNKNOWN
);
8380 gen_rtx_UNSPEC (any_extend_p
? SImode
: GET_MODE (reg1
),
8381 gen_rtvec (1, pair_mem
),
8383 gen_rtx_UNSPEC (any_extend_p
? SImode
: GET_MODE (reg2
),
8384 gen_rtvec (1, copy_rtx (pair_mem
)),
8389 for (int i
= 0; i
< 2; i
++)
8390 unspecs
[i
] = gen_rtx_fmt_e (code
, DImode
, unspecs
[i
]);
8392 return gen_rtx_PARALLEL (VOIDmode
,
8394 gen_rtx_SET (reg1
, unspecs
[0]),
8395 gen_rtx_SET (reg2
, unspecs
[1])));
8398 /* Return TRUE if return address signing should be enabled for the current
8399 function, otherwise return FALSE. */
8402 aarch64_return_address_signing_enabled (void)
8404 /* This function should only be called after frame laid out. */
8405 gcc_assert (cfun
->machine
->frame
.laid_out
);
8407 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8408 if its LR is pushed onto stack. */
8409 return (aarch_ra_sign_scope
== AARCH_FUNCTION_ALL
8410 || (aarch_ra_sign_scope
== AARCH_FUNCTION_NON_LEAF
8411 && known_ge (cfun
->machine
->frame
.reg_offset
[LR_REGNUM
], 0)));
/* Only used by the arm backend; the aarch64 implementation is a no-op.  */
void aarch_bti_arch_check (void)
{}
8418 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8420 aarch_bti_enabled (void)
8422 return (aarch_enable_bti
== 1);
8425 /* Check if INSN is a BTI J insn. */
8427 aarch_bti_j_insn_p (rtx_insn
*insn
)
8429 if (!insn
|| !INSN_P (insn
))
8432 rtx pat
= PATTERN (insn
);
8433 return GET_CODE (pat
) == UNSPEC_VOLATILE
&& XINT (pat
, 1) == UNSPECV_BTI_J
;
8436 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8438 aarch_pac_insn_p (rtx x
)
8443 subrtx_var_iterator::array_type array
;
8444 FOR_EACH_SUBRTX_VAR (iter
, array
, PATTERN (x
), ALL
)
8447 if (sub
&& GET_CODE (sub
) == UNSPEC
)
8449 int unspec_val
= XINT (sub
, 1);
8452 case UNSPEC_PACIASP
:
8453 case UNSPEC_PACIBSP
:
8459 iter
.skip_subrtxes ();
8465 rtx
aarch_gen_bti_c (void)
8467 return gen_bti_c ();
8470 rtx
aarch_gen_bti_j (void)
8472 return gen_bti_j ();
8475 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8476 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8477 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8479 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8482 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
8483 if the variable isn't already nonnull
8485 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8486 Handle this case using a temporary base register that is suitable for
8487 all offsets in that range. Use ANCHOR_REG as this base register if it
8488 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8491 aarch64_adjust_sve_callee_save_base (machine_mode mode
, rtx
&base_rtx
,
8492 rtx
&anchor_reg
, poly_int64
&offset
,
8495 if (maybe_ge (offset
, 8 * GET_MODE_SIZE (mode
)))
8497 /* This is the maximum valid offset of the anchor from the base.
8498 Lower values would be valid too. */
8499 poly_int64 anchor_offset
= 16 * GET_MODE_SIZE (mode
);
8502 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
8503 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
8504 gen_int_mode (anchor_offset
, Pmode
)));
8506 base_rtx
= anchor_reg
;
8507 offset
-= anchor_offset
;
8511 int pred_reg
= cfun
->machine
->frame
.spare_pred_reg
;
8512 emit_move_insn (gen_rtx_REG (VNx16BImode
, pred_reg
),
8513 CONSTM1_RTX (VNx16BImode
));
8514 ptrue
= gen_rtx_REG (VNx2BImode
, pred_reg
);
8518 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8519 is saved at BASE + OFFSET. */
8522 aarch64_add_cfa_expression (rtx_insn
*insn
, rtx reg
,
8523 rtx base
, poly_int64 offset
)
8525 rtx mem
= gen_frame_mem (GET_MODE (reg
),
8526 plus_constant (Pmode
, base
, offset
));
8527 add_reg_note (insn
, REG_CFA_EXPRESSION
, gen_rtx_SET (mem
, reg
));
8530 /* Emit code to save the callee-saved registers in REGS. Skip any
8531 write-back candidates if SKIP_WB is true, otherwise consider only
8532 write-back candidates.
8534 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8535 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8539 aarch64_save_callee_saves (poly_int64 bytes_below_sp
,
8540 array_slice
<unsigned int> regs
, bool skip_wb
,
8541 bool hard_fp_valid_p
)
8543 aarch64_frame
&frame
= cfun
->machine
->frame
;
/* ANCHOR_REG/PTRUE are created lazily on the first save that needs
   them (big-endian SVE or out-of-range offsets).  */
8545 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
/* True if REGNO's save is handled elsewhere: shrink-wrapped separately,
   or its write-back status does not match what this call handles.  */
8547 auto skip_save_p
= [&](unsigned int regno
)
8549 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
8552 if (skip_wb
== (regno
== frame
.wb_push_candidate1
8553 || regno
== frame
.wb_push_candidate2
))
8559 for (unsigned int i
= 0; i
< regs
.size (); ++i
)
8561 unsigned int regno
= regs
[i
];
8563 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
8565 if (skip_save_p (regno
))
8568 machine_mode mode
= aarch64_reg_save_mode (regno
);
8569 rtx reg
= gen_rtx_REG (mode
, regno
);
8571 offset
= frame
.reg_offset
[regno
] - bytes_below_sp
;
/* VG has no directly storable register; materialize the current
   vector-granule count in IP0 and store that instead.  */
8572 if (regno
== VG_REGNUM
)
8574 move_src
= gen_rtx_REG (DImode
, IP0_REGNUM
)
;
8575 emit_move_insn (move_src
, gen_int_mode (aarch64_sve_vg
, DImode
));
8577 rtx base_rtx
= stack_pointer_rtx
;
/* Remember the SP-relative offset for the CFI expression, since
   OFFSET itself may be rebased on an anchor below.  */
8578 poly_int64 sp_offset
= offset
;
8580 HOST_WIDE_INT const_offset
;
8581 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8582 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
/* GP stores only reach offsets below 512 (or constant offsets);
   otherwise go via the hard FP or an anchor register.  */
8584 else if (GP_REGNUM_P (REGNO (reg
))
8585 && (!offset
.is_constant (&const_offset
) || const_offset
>= 512))
8587 poly_int64 fp_offset
= frame
.bytes_below_hard_fp
- bytes_below_sp
;
8588 if (hard_fp_valid_p
)
8589 base_rtx
= hard_frame_pointer_rtx
;
8594 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
8595 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
8596 gen_int_mode (fp_offset
, Pmode
)));
8598 base_rtx
= anchor_reg
;
8600 offset
-= fp_offset
;
8602 rtx mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
/* CFI_MEM describes the slot for unwind info, independently of any
   anchor rebasing used for the actual store.  */
8603 rtx cfi_mem
= gen_frame_mem (mode
, plus_constant (Pmode
,
8606 rtx cfi_set
= gen_rtx_SET (cfi_mem
, reg
);
8607 bool need_cfi_note_p
= (base_rtx
!= stack_pointer_rtx
);
/* Try to pair this save with the next register when the sizes match
   and the slots are adjacent (STP).  */
8609 unsigned int regno2
;
8610 if (!aarch64_sve_mode_p (mode
)
8612 && i
+ 1 < regs
.size ()
8613 && (regno2
= regs
[i
+ 1], !skip_save_p (regno2
))
8614 && known_eq (GET_MODE_SIZE (mode
),
8615 frame
.reg_offset
[regno2
] - frame
.reg_offset
[regno
]))
8617 rtx reg2
= gen_rtx_REG (mode
, regno2
);
8619 offset
+= GET_MODE_SIZE (mode
);
8620 insn
= emit_insn (aarch64_gen_store_pair (mem
, reg
, reg2
));
8623 = gen_frame_mem (mode
,
8624 plus_constant (Pmode
,
8626 sp_offset
+ GET_MODE_SIZE (mode
)));
8627 rtx cfi_set2
= gen_rtx_SET (cfi_mem2
, reg2
);
8629 /* The first part of a frame-related parallel insn is always
8630 assumed to be relevant to the frame calculations;
8631 subsequent parts, are only frame-related if
8632 explicitly marked. */
8633 if (aarch64_emit_cfi_for_reg_p (regno2
))
8634 RTX_FRAME_RELATED_P (cfi_set2
) = 1;
8636 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8637 representation of stp cannot be understood directly by
8639 rtx par
= gen_rtx_PARALLEL (VOIDmode
,
8640 gen_rtvec (2, cfi_set
, cfi_set2
));
8641 add_reg_note (insn
, REG_FRAME_RELATED_EXPR
, par
);
/* Single-register save: predicated move for big-endian SVE,
   plain SET for other SVE modes, ordinary move otherwise.  */
8648 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8650 insn
= emit_insn (gen_aarch64_pred_mov (mode
, mem
,
8652 need_cfi_note_p
= true;
8654 else if (aarch64_sve_mode_p (mode
))
8655 insn
= emit_insn (gen_rtx_SET (mem
, move_src
));
8657 insn
= emit_move_insn (mem
, move_src
);
8659 if (frame_related_p
&& (need_cfi_note_p
|| move_src
!= reg
))
8660 add_reg_note (insn
, REG_FRAME_RELATED_EXPR
, cfi_set
);
8663 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
8665 /* Emit a fake instruction to indicate that the VG save slot has
8666 been initialized. */
8667 if (regno
== VG_REGNUM
)
8668 emit_insn (gen_aarch64_old_vg_saved (move_src
, mem
));
8672 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8673 and any other registers that are handled separately. Write the appropriate
8674 REG_CFA_RESTORE notes into CFI_OPS.
8676 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8677 of the static frame. */
8680 aarch64_restore_callee_saves (poly_int64 bytes_below_sp
,
8681 array_slice
<unsigned int> regs
, rtx
*cfi_ops
)
8683 aarch64_frame
&frame
= cfun
->machine
->frame
;
/* Lazily-created anchor register and governing predicate, mirroring
   aarch64_save_callee_saves.  */
8685 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
/* True if REGNO is restored elsewhere: shrink-wrapped separately,
   a pop candidate, or LR under shadow call stack.  */
8687 auto skip_restore_p
= [&](unsigned int regno
)
8689 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
8692 if (regno
== frame
.wb_pop_candidate1
8693 || regno
== frame
.wb_pop_candidate2
)
8696 /* The shadow call stack code restores LR separately. */
8697 if (frame
.is_scs_enabled
&& regno
== LR_REGNUM
)
8703 for (unsigned int i
= 0; i
< regs
.size (); ++i
)
8705 unsigned int regno
= regs
[i
];
8706 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
8707 if (skip_restore_p (regno
))
8710 machine_mode mode
= aarch64_reg_save_mode (regno
);
8711 rtx reg
= gen_rtx_REG (mode
, regno
);
8712 offset
= frame
.reg_offset
[regno
] - bytes_below_sp
;
8713 rtx base_rtx
= stack_pointer_rtx
;
8714 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8715 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
8717 rtx mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
/* Pair with the next register (LDP) when sizes match and the save
   slots are adjacent.  */
8719 unsigned int regno2
;
8720 if (!aarch64_sve_mode_p (mode
)
8721 && i
+ 1 < regs
.size ()
8722 && (regno2
= regs
[i
+ 1], !skip_restore_p (regno2
))
8723 && known_eq (GET_MODE_SIZE (mode
),
8724 frame
.reg_offset
[regno2
] - frame
.reg_offset
[regno
]))
8726 rtx reg2
= gen_rtx_REG (mode
, regno2
);
8728 offset
+= GET_MODE_SIZE (mode
);
8729 emit_insn (aarch64_gen_load_pair (reg
, reg2
, mem
));
8731 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
/* Single-register restore: predicated move for big-endian SVE,
   plain SET for other SVE modes, ordinary move otherwise.  */
8735 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8736 emit_insn (gen_aarch64_pred_mov (mode
, reg
, ptrue
, mem
));
8737 else if (aarch64_sve_mode_p (mode
))
8738 emit_insn (gen_rtx_SET (reg
, mem
));
8740 emit_move_insn (reg
, mem
);
8741 if (frame_related_p
)
8742 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
8746 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8750 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
8752 HOST_WIDE_INT multiple
;
8753 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
8754 && IN_RANGE (multiple
, -8, 7));
8757 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8761 offset_6bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
8763 HOST_WIDE_INT multiple
;
8764 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
8765 && IN_RANGE (multiple
, -32, 31));
8768 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8772 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
8774 HOST_WIDE_INT multiple
;
8775 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
8776 && IN_RANGE (multiple
, 0, 63));
8779 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8783 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
8785 HOST_WIDE_INT multiple
;
8786 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
8787 && IN_RANGE (multiple
, -64, 63));
8790 /* Return true if OFFSET is a signed 9-bit value. */
8793 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
8796 HOST_WIDE_INT const_offset
;
8797 return (offset
.is_constant (&const_offset
)
8798 && IN_RANGE (const_offset
, -256, 255));
8801 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8805 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
8807 HOST_WIDE_INT multiple
;
8808 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
8809 && IN_RANGE (multiple
, -256, 255));
8812 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8816 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
8818 HOST_WIDE_INT multiple
;
8819 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
8820 && IN_RANGE (multiple
, 0, 4095));
8823 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8826 aarch64_get_separate_components (void)
8828 aarch64_frame
&frame
= cfun
->machine
->frame
;
/* One component bit per potentially saved register.  */
8829 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
8830 bitmap_clear (components
);
8832 /* The registers we need saved to the frame. */
8833 bool enables_pstate_sm
= aarch64_cfun_enables_pstate_sm ();
8834 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
8835 if (aarch64_register_saved_on_entry (regno
))
8837 /* Disallow shrink wrapping for registers that will be clobbered
8838 by an SMSTART SM in the prologue. */
8839 if (enables_pstate_sm
8840 && (FP_REGNUM_P (regno
) || PR_REGNUM_P (regno
)))
8843 /* Punt on saves and restores that use ST1D and LD1D. We could
8844 try to be smarter, but it would involve making sure that the
8845 spare predicate register itself is safe to use at the save
8846 and restore points. Also, when a frame pointer is being used,
8847 the slots are often out of reach of ST1D and LD1D anyway. */
8848 machine_mode mode
= aarch64_reg_save_mode (regno
);
8849 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8852 poly_int64 offset
= frame
.reg_offset
[regno
];
8854 /* Get the offset relative to the register we'll use. */
8855 if (frame_pointer_needed
)
8856 offset
-= frame
.bytes_below_hard_fp
;
8858 /* Check that we can access the stack slot of the register with one
8859 direct load with no adjustments needed. */
8860 if (aarch64_sve_mode_p (mode
)
8861 ? offset_9bit_signed_scaled_p (mode
, offset
)
8862 : offset_12bit_unsigned_scaled_p (mode
, offset
)
8863 bitmap_set_bit (components
, regno
);
8866 /* Don't mess with the hard frame pointer. */
8867 if (frame_pointer_needed
)
8868 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
8870 /* If the spare predicate register used by big-endian SVE code
8871 is call-preserved, it must be saved in the main prologue
8872 before any saves that use it. */
8873 if (frame
.spare_pred_reg
!= INVALID_REGNUM
)
8874 bitmap_clear_bit (components
, frame
.spare_pred_reg
);
8876 unsigned reg1
= frame
.wb_push_candidate1
;
8877 unsigned reg2
= frame
.wb_push_candidate2
;
8878 /* If registers have been chosen to be stored/restored with
8879 writeback don't interfere with them to avoid having to output explicit
8880 stack adjustment instructions. */
8881 if (reg2
!= INVALID_REGNUM
)
8882 bitmap_clear_bit (components
, reg2
);
8883 if (reg1
!= INVALID_REGNUM
)
8884 bitmap_clear_bit (components
, reg1
);
/* LR and SP are always handled by the main prologue/epilogue.  */
8886 bitmap_clear_bit (components
, LR_REGNUM
);
8887 bitmap_clear_bit (components
, SP_REGNUM
);
/* Stack-clash probing relies on specific saves happening in the main
   prologue; keep those out of the shrink-wrapped set.  */
8888 if (flag_stack_clash_protection
)
8890 if (frame
.sve_save_and_probe
!= INVALID_REGNUM
)
8891 bitmap_clear_bit (components
, frame
.sve_save_and_probe
);
8892 if (frame
.hard_fp_save_and_probe
!= INVALID_REGNUM
)
8893 bitmap_clear_bit (components
, frame
.hard_fp_save_and_probe
);
8896 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8898 bitmap_clear_bit (components
, VG_REGNUM
);
8903 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8906 aarch64_components_for_bb (basic_block bb
)
/* Dataflow sets for BB: live-in, generated, and killed registers.  */
8908 bitmap in
= DF_LIVE_IN (bb
);
8909 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
8910 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
8912 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
8913 bitmap_clear (components
);
8915 /* Clobbered registers don't generate values in any meaningful sense,
8916 since nothing after the clobber can rely on their value. And we can't
8917 say that partially-clobbered registers are unconditionally killed,
8918 because whether they're killed or not depends on the mode of the
8919 value they're holding. Thus partially call-clobbered registers
8920 appear in neither the kill set nor the gen set.
8922 Check manually for any calls that clobber more of a register than the
8923 current function can. */
8924 function_abi_aggregator callee_abis
;
8926 FOR_BB_INSNS (bb
, insn
)
8928 callee_abis
.note_callee_abi (insn_callee_abi (insn
));
8929 HARD_REG_SET extra_caller_saves
= callee_abis
.caller_save_regs (*crtl
->abi
);
8931 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8932 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
8933 if (!fixed_regs
[regno
]
8934 && !crtl
->abi
->clobbers_full_reg_p (regno
)
8935 && (TEST_HARD_REG_BIT (extra_caller_saves
, regno
)
8936 || bitmap_bit_p (in
, regno
)
8937 || bitmap_bit_p (gen
, regno
)
8938 || bitmap_bit_p (kill
, regno
)))
8940 bitmap_set_bit (components
, regno
);
8942 /* If there is a callee-save at an adjacent offset, add it too
8943 to increase the use of LDP/STP. */
8944 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
/* Pick the neighbour that would share a 16-byte-aligned pair slot.  */
8945 unsigned regno2
= multiple_p (offset
, 16) ? regno
+ 1 : regno
- 1;
8947 if (regno2
<= LAST_SAVED_REGNUM
)
8949 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
8951 ? known_eq (offset
+ 8, offset2
)
8952 : multiple_p (offset2
, 16) && known_eq (offset2
+ 8, offset
)
8953 bitmap_set_bit (components
, regno2
);
8960 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8961 Nothing to do for aarch64. */
/* The hook must exist for the shrink-wrap machinery, but this target
   never disqualifies components, so all parameters are unnamed.  */
8964 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
8968 /* Return the next set bit in BMP from START onwards. Return the total number
8969 of bits in BMP if no set bit is found at or after START. */
8972 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
8974 unsigned int nbits
= SBITMAP_SIZE (bmp
);
8978 gcc_assert (start
< nbits
);
8979 for (unsigned int i
= start
; i
< nbits
; i
++)
8980 if (bitmap_bit_p (bmp
, i
))
8986 /* Do the work for aarch64_emit_prologue_components and
8987 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8988 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
8989 for these components or the epilogue sequence. That is, it determines
8990 whether we should emit stores or loads and what kind of CFA notes to attach
8991 to the insns. Otherwise the logic for the two sequences is very
8995 aarch64_process_components (sbitmap components
, bool prologue_p
)
8997 aarch64_frame
&frame
= cfun
->machine
->frame
;
/* Address saves relative to the hard FP when one exists, else SP.  */
8998 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
8999 ? HARD_FRAME_POINTER_REGNUM
9000 : STACK_POINTER_REGNUM
);
9002 unsigned last_regno
= SBITMAP_SIZE (components
);
9003 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
9004 rtx_insn
*insn
= NULL
;
9006 while (regno
!= last_regno
)
9008 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
9009 machine_mode mode
= aarch64_reg_save_mode (regno
);
9011 rtx reg
= gen_rtx_REG (mode
, regno
);
9012 poly_int64 offset
= frame
.reg_offset
[regno
];
9013 if (frame_pointer_needed
)
9014 offset
-= frame
.bytes_below_hard_fp
;
9016 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
9017 rtx mem
= gen_frame_mem (mode
, addr
);
/* Prologue stores to memory; epilogue loads from it.  */
9019 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
9020 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
9021 /* No more registers to handle after REGNO.
9022 Emit a single save/restore and exit. */
9023 if (regno2
== last_regno
)
9025 insn
= emit_insn (set
);
9026 if (frame_related_p
)
9028 RTX_FRAME_RELATED_P (insn
) = 1;
9030 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
9032 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9037 poly_int64 offset2
= frame
.reg_offset
[regno2
];
9038 /* The next register is not of the same class or its offset is not
9039 mergeable with the current one into a pair. */
9040 if (aarch64_sve_mode_p (mode
)
9041 || !satisfies_constraint_Ump (mem
)
9042 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
9043 || (crtl
->abi
->id () == ARM_PCS_SIMD
&& FP_REGNUM_P (regno
))
9044 || maybe_ne ((offset2
- frame
.reg_offset
[regno
]),
9045 GET_MODE_SIZE (mode
)))
9047 insn
= emit_insn (set
);
9048 if (frame_related_p
)
9050 RTX_FRAME_RELATED_P (insn
) = 1;
9052 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
9054 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9061 bool frame_related2_p
= aarch64_emit_cfi_for_reg_p (regno2
);
9063 /* REGNO2 can be saved/restored in a pair with REGNO. */
9064 rtx reg2
= gen_rtx_REG (mode
, regno2
);
9065 if (frame_pointer_needed
)
9066 offset2
-= frame
.bytes_below_hard_fp
;
9067 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
9068 rtx mem2
= gen_frame_mem (mode
, addr2
);
9069 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
9070 : gen_rtx_SET (reg2
, mem2
);
/* Emit the pair as a single STP/LDP.  */
9073 insn
= emit_insn (aarch64_gen_store_pair (mem
, reg
, reg2
));
9075 insn
= emit_insn (aarch64_gen_load_pair (reg
, reg2
, mem
));
9077 if (frame_related_p
|| frame_related2_p
)
9079 RTX_FRAME_RELATED_P (insn
) = 1;
/* Prologue: describe each slot with a CFA offset note;
   epilogue: mark each register restored.  */
9082 if (frame_related_p
)
9083 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
9084 if (frame_related2_p
)
9085 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
9089 if (frame_related_p
)
9090 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9091 if (frame_related2_p
)
9092 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
/* Continue scanning after the second register of the pair.  */
9096 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
9100 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9103 aarch64_emit_prologue_components (sbitmap components
)
/* PROLOGUE_P == true: emit stores for the component registers.  */
9105 aarch64_process_components (components
, true);
9108 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9111 aarch64_emit_epilogue_components (sbitmap components
)
/* PROLOGUE_P == false: emit loads for the component registers.  */
9113 aarch64_process_components (components
, false);
9116 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9119 aarch64_set_handled_components (sbitmap components
)
9121 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
9122 if (bitmap_bit_p (components
, regno
))
9123 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
9126 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9127 determining the probe offset for alloca. */
9129 static HOST_WIDE_INT
9130 aarch64_stack_clash_protection_alloca_probe_range (void)
/* The caller-guard buffer size is the distance below SP that is
   guaranteed safe, so alloca probes at this offset.  */
9132 return STACK_CLASH_CALLER_GUARD
;
9135 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9136 subsequent memory accesses and that requires the stack pointer and REG
9137 to have their current values. REG can be stack_pointer_rtx if no
9138 other register's value needs to be fixed. */
9141 aarch64_emit_stack_tie (rtx reg
)
9143 emit_insn (gen_stack_tie (reg
, gen_int_mode (REGNO (reg
), DImode
)));
9146 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9147 registers. If POLY_SIZE is not large enough to require a probe this function
9148 will only adjust the stack. When allocating the stack space
9149 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9150 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9151 the saved registers. If we are then we ensure that any allocation
9152 larger than the ABI defined buffer needs a probe so that the
9153 invariant of having a 1KB buffer is maintained.
9155 We emit barriers after each stack adjustment to prevent optimizations from
9156 breaking the invariant that we never drop the stack more than a page. This
9157 invariant is needed to make it easier to correctly handle asynchronous
9158 events, e.g. if we were to allow the stack to be dropped by more than a page
9159 and then have multiple probes up and we take a signal somewhere in between
9160 then the signal handler doesn't know the state of the stack and can make no
9161 assumptions about which pages have been probed.
9163 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
9164 is measured relative to the SME vector length instead of the current
9165 prevailing vector length. It is 0 otherwise. */
9168 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
9169 poly_int64 poly_size
,
9170 aarch64_feature_flags force_isa_mode
,
9171 bool frame_related_p
,
9172 bool final_adjustment_p
)
9174 aarch64_frame
&frame
= cfun
->machine
->frame
;
/* Guard page size is configurable; the caller-guard buffer is fixed.  */
9175 HOST_WIDE_INT guard_size
9176 = 1 << param_stack_clash_protection_guard_size
;
9177 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
9178 HOST_WIDE_INT byte_sp_alignment
= STACK_BOUNDARY
/ BITS_PER_UNIT
;
9179 gcc_assert (multiple_p (poly_size
, byte_sp_alignment
));
/* Below the saved registers we only need to probe past the caller
   guard; for the initial allocation we can skip the part of the page
   the caller is allowed to use.  */
9180 HOST_WIDE_INT min_probe_threshold
9181 = (final_adjustment_p
9182 ? guard_used_by_caller
+ byte_sp_alignment
9183 : guard_size
- guard_used_by_caller
)
;
9184 poly_int64 frame_size
= frame
.frame_size
;
9186 /* We should always have a positive probe threshold. */
9187 gcc_assert (min_probe_threshold
> 0);
/* Dump diagnostics for the whole-frame probing decision once, on the
   initial (non-final) adjustment.  */
9189 if (flag_stack_clash_protection
&& !final_adjustment_p
)
9191 poly_int64 initial_adjust
= frame
.initial_adjust
;
9192 poly_int64 sve_callee_adjust
= frame
.sve_callee_adjust
;
9193 poly_int64 final_adjust
= frame
.final_adjust
;
9195 if (known_eq (frame_size
, 0))
9197 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
9199 else if (known_lt (initial_adjust
+ sve_callee_adjust
,
9200 guard_size
- guard_used_by_caller
)
9201 && known_lt (final_adjust
, guard_used_by_caller
))
9203 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
9207 /* If SIZE is not large enough to require probing, just adjust the stack and
9209 if (known_lt (poly_size
, min_probe_threshold
)
9210 || !flag_stack_clash_protection
)
9212 aarch64_sub_sp (temp1
, temp2
, poly_size
, force_isa_mode
,
9218 /* Handle the SVE non-constant case first. */
9219 if (!poly_size
.is_constant (&size
))
9223 fprintf (dump_file
, "Stack clash SVE prologue: ");
9224 print_dec (poly_size
, dump_file
);
9225 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
9228 /* First calculate the amount of bytes we're actually spilling. */
9229 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
9230 poly_size
, temp1
, temp2
, force_isa_mode
,
9233 rtx_insn
*insn
= get_last_insn ();
9235 if (frame_related_p
)
9237 /* This is done to provide unwinding information for the stack
9238 adjustments we're about to do, however to prevent the optimizers
9239 from removing the R11 move and leaving the CFA note (which would be
9240 very wrong) we tie the old and new stack pointer together.
9241 The tie will expand to nothing but the optimizers will not touch
9243 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
9244 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
9245 aarch64_emit_stack_tie (stack_ptr_copy
);
9247 /* We want the CFA independent of the stack pointer for the
9248 duration of the loop. */
9249 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
9250 RTX_FRAME_RELATED_P (insn
) = 1;
/* Emit the allocate-and-probe loop for the variable (SVE) amount.  */
9253 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
9254 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
9256 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
9257 stack_pointer_rtx
, temp1
,
9258 probe_const
, guard_const
));
9260 /* Now reset the CFA register if needed. */
9261 if (frame_related_p
)
9263 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9264 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
9265 gen_int_mode (poly_size
, Pmode
)));
9266 RTX_FRAME_RELATED_P (insn
) = 1;
9274 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9275 " bytes, probing will be required.\n", size
);
9277 /* Round size to the nearest multiple of guard_size, and calculate the
9278 residual as the difference between the original size and the rounded
9280 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
9281 HOST_WIDE_INT residual
= size
- rounded_size
;
9283 /* We can handle a small number of allocations/probes inline. Otherwise
9285 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
/* Unrolled allocate-then-probe, one guard page at a time.  */
9287 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
9289 aarch64_sub_sp (NULL
, temp2
, guard_size
, force_isa_mode
, true);
9290 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
9291 guard_used_by_caller
));
9292 emit_insn (gen_blockage ());
9294 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
9298 /* Compute the ending address. */
9299 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
9300 temp1
, NULL
, force_isa_mode
, false, true);
9301 rtx_insn
*insn
= get_last_insn ();
9303 /* For the initial allocation, we don't have a frame pointer
9304 set up, so we always need CFI notes. If we're doing the
9305 final allocation, then we may have a frame pointer, in which
9306 case it is the CFA, otherwise we need CFI notes.
9308 We can determine which allocation we are doing by looking at
9309 the value of FRAME_RELATED_P since the final allocations are not
9311 if (frame_related_p
)
9313 /* We want the CFA independent of the stack pointer for the
9314 duration of the loop. */
9315 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9316 plus_constant (Pmode
, temp1
, rounded_size
));
9317 RTX_FRAME_RELATED_P (insn
) = 1;
9320 /* This allocates and probes the stack. Note that this re-uses some of
9321 the existing Ada stack protection code. However we are guaranteed not
9322 to enter the non loop or residual branches of that code.
9324 The non-loop part won't be entered because if our allocation amount
9325 doesn't require a loop, the case above would handle it.
9327 The residual amount won't be entered because TEMP1 is a mutliple of
9328 the allocation size. The residual will always be 0. As such, the only
9329 part we are actually using from that code is the loop setup. The
9330 actual probing is done in aarch64_output_probe_stack_range. */
9331 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
9332 stack_pointer_rtx
, temp1
));
9334 /* Now reset the CFA register if needed. */
9335 if (frame_related_p
)
9337 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9338 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
9339 RTX_FRAME_RELATED_P (insn
) = 1;
9342 emit_insn (gen_blockage ());
9343 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
9346 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9347 be probed. This maintains the requirement that each page is probed at
9348 least once. For initial probing we probe only if the allocation is
9349 more than GUARD_SIZE - buffer, and below the saved registers we probe
9350 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9351 GUARD_SIZE. This works that for any allocation that is large enough to
9352 trigger a probe here, we'll have at least one, and if they're not large
9353 enough for this code to emit anything for them, The page would have been
9354 probed by the saving of FP/LR either by this function or any callees. If
9355 we don't have any callees then we won't have more stack adjustments and so
9359 gcc_assert (guard_used_by_caller
+ byte_sp_alignment
<= size
);
9361 /* If we're doing final adjustments, and we've done any full page
9362 allocations then any residual needs to be probed. */
9363 if (final_adjustment_p
&& rounded_size
!= 0)
9364 min_probe_threshold
= 0;
9366 aarch64_sub_sp (temp1
, temp2
, residual
, force_isa_mode
, frame_related_p
);
9367 if (residual
>= min_probe_threshold
)
9371 "Stack clash AArch64 prologue residuals: "
9372 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
9375 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
9376 guard_used_by_caller
));
9377 emit_insn (gen_blockage ());
9382 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9385 aarch64_extra_live_on_entry (bitmap regs
)
/* The SME/ZA bookkeeping pseudo registers are always live on entry.
   NOTE(review): a TARGET_ZA guard around these appears to have been
   dropped from this extract — confirm against the full source.  */
9389 bitmap_set_bit (regs
, LOWERING_REGNUM
);
9390 bitmap_set_bit (regs
, SME_STATE_REGNUM
);
9391 bitmap_set_bit (regs
, TPIDR2_SETUP_REGNUM
);
9392 bitmap_set_bit (regs
, ZA_FREE_REGNUM
);
9393 bitmap_set_bit (regs
, ZA_SAVED_REGNUM
);
9395 /* The only time ZA can't have live contents on entry is when
9396 the function explicitly treats it as a pure output. */
9397 auto za_flags
= aarch64_cfun_shared_flags ("za");
9398 if (za_flags
!= (AARCH64_STATE_SHARED
| AARCH64_STATE_OUT
))
9399 bitmap_set_bit (regs
, ZA_REGNUM
);
9401 /* Since ZT0 is call-clobbered, it is only live on input if
9402 it is explicitly shared, and is not a pure output. */
9403 auto zt0_flags
= aarch64_cfun_shared_flags ("zt0");
9405 && zt0_flags
!= (AARCH64_STATE_SHARED
| AARCH64_STATE_OUT
))
9406 bitmap_set_bit (regs
, ZT0_REGNUM
);
9410 /* Return 1 if the register is used by the epilogue. We need to say the
9411 return register is used, but only after epilogue generation is complete.
9412 Note that in the case of sibcalls, the values "used by the epilogue" are
9413 considered live at the start of the called function. */
9416 aarch64_epilogue_uses (int regno
)
/* LR is consumed by the return, but only once the epilogue exists.  */
9418 if (epilogue_completed
)
9420 if (regno
== LR_REGNUM
)
/* The SME/ZA bookkeeping registers are read by the epilogue whenever
   the function has ZA state.  */
9423 if (regno
== LOWERING_REGNUM
&& TARGET_ZA
)
9425 if (regno
== SME_STATE_REGNUM
&& TARGET_ZA
)
9427 if (regno
== TPIDR2_SETUP_REGNUM
&& TARGET_ZA
)
9429 /* If the function shares SME state with its caller, ensure that that
9430 data is not in the lazy save buffer on exit. */
9431 if (regno
== ZA_SAVED_REGNUM
&& aarch64_cfun_incoming_pstate_za () != 0)
9433 if (regno
== ZA_REGNUM
&& aarch64_cfun_shared_flags ("za") != 0)
9435 if (regno
== ZT0_REGNUM
&& aarch64_cfun_shared_flags ("zt0") != 0)
9440 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9443 aarch64_use_late_prologue_epilogue ()
/* Late prologue/epilogue generation is only needed when the function
   switches PSTATE.SM on entry.  */
9445 return aarch64_cfun_enables_pstate_sm ();
9448 /* The current function's frame has a save slot for the incoming state
9449 of SVCR. Return a legitimate memory for the slot, based on the hard
9453 aarch64_old_svcr_mem ()
9455 gcc_assert (frame_pointer_needed
9456 && known_ge (cfun
->machine
->frame
.old_svcr_offset
, 0));
9457 rtx base
= hard_frame_pointer_rtx
;
9458 poly_int64 offset
= (0
9459 /* hard fp -> bottom of frame. */
9460 - cfun
->machine
->frame
.bytes_below_hard_fp
9461 /* bottom of frame -> save slot. */
9462 + cfun
->machine
->frame
.old_svcr_offset
);
9463 return gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
9466 /* The current function's frame has a save slot for the incoming state
9467 of SVCR. Load the slot into register REGNO and return the register. */
9470 aarch64_read_old_svcr (unsigned int regno
)
9472 rtx svcr
= gen_rtx_REG (DImode
, regno
);
9473 emit_move_insn (svcr
, aarch64_old_svcr_mem ());
9477 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9478 load the incoming value of SVCR from its save slot into temporary
9482 aarch64_guard_switch_pstate_sm (unsigned int regno
,
9483 aarch64_feature_flags local_mode
)
9485 rtx old_svcr
= aarch64_read_old_svcr (regno
);
9486 return aarch64_guard_switch_pstate_sm (old_svcr
, local_mode
);
9489 /* AArch64 stack frames generated by this compiler look like:
9491 +-------------------------------+
9493 | incoming stack arguments |
9495 +-------------------------------+
9496 | | <-- incoming stack pointer (aligned)
9497 | callee-allocated save area |
9498 | for register varargs |
9500 +-------------------------------+
9501 | local variables (1) | <-- frame_pointer_rtx
9503 +-------------------------------+
9505 +-------------------------------+
9506 | callee-saved registers |
9507 +-------------------------------+
9509 +-------------------------------+
9511 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9512 | SVE vector registers |
9513 +-------------------------------+
9514 | SVE predicate registers |
9515 +-------------------------------+
9516 | local variables (2) |
9517 +-------------------------------+
9519 +-------------------------------+
9520 | dynamic allocation |
9521 +-------------------------------+
9523 +-------------------------------+
9524 | outgoing stack arguments | <-- arg_pointer
9526 +-------------------------------+
9527 | | <-- stack_pointer_rtx (aligned)
9529 The regions marked (1) and (2) are mutually exclusive. (2) is used
9530 when aarch64_save_regs_above_locals_p is true.
9532 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9533 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9536 By default for stack-clash we assume the guard is at least 64KB, but this
9537 value is configurable to either 4KB or 64KB. We also force the guard size to
9538 be the same as the probing interval and both values are kept in sync.
9540 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9541 on the guard size) of stack space without probing.
9543 When probing is needed, we emit a probe at the start of the prologue
9544 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9546 We can also use register saves as probes. These are stored in
9547 sve_save_and_probe and hard_fp_save_and_probe.
9549 For outgoing arguments we probe if the size is larger than 1KB, such that
9550 the ABI specified buffer is maintained for the next callee.
9552 The following registers are reserved during frame layout and should not be
9553 used for any other purpose:
9555 - r11: Used by stack clash protection when SVE is enabled, and also
9556 as an anchor register when saving and restoring registers
9557 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9558 - r14 and r15: Used for speculation tracking.
9559 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9560 - r30(LR), r29(FP): Used by standard frame layout.
9562 These registers must be avoided in frame layout related code unless the
9563 explicit intention is to interact with one of the features listed above. */
9565 /* Generate the prologue instructions for entry into a function.
9566 Establish the stack frame by decreasing the stack pointer with a
9567 properly calculated size and, if necessary, create a frame record
9568 filled with the values of LR and previous frame pointer. The
9569 current FP is also set up if it is in use. */
9572 aarch64_expand_prologue (void)
9574 aarch64_frame
&frame
= cfun
->machine
->frame
;
9575 poly_int64 frame_size
= frame
.frame_size
;
9576 poly_int64 initial_adjust
= frame
.initial_adjust
;
9577 HOST_WIDE_INT callee_adjust
= frame
.callee_adjust
;
9578 poly_int64 final_adjust
= frame
.final_adjust
;
9579 poly_int64 sve_callee_adjust
= frame
.sve_callee_adjust
;
9580 unsigned reg1
= frame
.wb_push_candidate1
;
9581 unsigned reg2
= frame
.wb_push_candidate2
;
9582 bool emit_frame_chain
= frame
.emit_frame_chain
;
9584 aarch64_feature_flags force_isa_mode
= 0;
9585 if (aarch64_cfun_enables_pstate_sm ())
9586 force_isa_mode
= AARCH64_FL_SM_ON
;
9588 if (flag_stack_clash_protection
9589 && known_eq (callee_adjust
, 0)
9590 && known_lt (frame
.reg_offset
[VG_REGNUM
], 0))
9592 /* Fold the SVE allocation into the initial allocation.
9593 We don't do this in aarch64_layout_arg to avoid pessimizing
9594 the epilogue code. */
9595 initial_adjust
+= sve_callee_adjust
;
9596 sve_callee_adjust
= 0;
9599 /* Sign return address for functions. */
9600 if (aarch64_return_address_signing_enabled ())
9602 switch (aarch64_ra_sign_key
)
9605 insn
= emit_insn (gen_paciasp ());
9608 insn
= emit_insn (gen_pacibsp ());
9613 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
9614 RTX_FRAME_RELATED_P (insn
) = 1;
9617 /* Push return address to shadow call stack. */
9618 if (frame
.is_scs_enabled
)
9619 emit_insn (gen_scs_push ());
9621 if (flag_stack_usage_info
)
9622 current_function_static_stack_size
= constant_lower_bound (frame_size
);
9624 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
9626 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
9628 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
9629 && maybe_gt (frame_size
, get_stack_check_protect ()))
9630 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9632 - get_stack_check_protect ()));
9634 else if (maybe_gt (frame_size
, 0))
9635 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
9638 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
9639 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
9641 /* In theory we should never have both an initial adjustment
9642 and a callee save adjustment. Verify that is the case since the
9643 code below does not handle it for -fstack-clash-protection. */
9644 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
9646 /* Will only probe if the initial adjustment is larger than the guard
9647 less the amount of the guard reserved for use by the caller's
9649 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
9650 force_isa_mode
, true, false);
9652 if (callee_adjust
!= 0)
9653 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
9655 /* The offset of the current SP from the bottom of the static frame. */
9656 poly_int64 bytes_below_sp
= frame_size
- initial_adjust
- callee_adjust
;
9658 if (emit_frame_chain
)
9660 /* The offset of the frame chain record (if any) from the current SP. */
9661 poly_int64 chain_offset
= (initial_adjust
+ callee_adjust
9662 - frame
.bytes_above_hard_fp
);
9663 gcc_assert (known_ge (chain_offset
, 0));
9665 gcc_assert (reg1
== R29_REGNUM
&& reg2
== R30_REGNUM
);
9666 if (callee_adjust
== 0)
9667 aarch64_save_callee_saves (bytes_below_sp
, frame
.saved_gprs
,
9670 gcc_assert (known_eq (chain_offset
, 0));
9671 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
9672 stack_pointer_rtx
, chain_offset
,
9673 tmp1_rtx
, tmp0_rtx
, force_isa_mode
,
9674 frame_pointer_needed
);
9675 if (frame_pointer_needed
&& !frame_size
.is_constant ())
9677 /* Variable-sized frames need to describe the save slot
9678 address using DW_CFA_expression rather than DW_CFA_offset.
9679 This means that, without taking further action, the
9680 locations of the registers that we've already saved would
9681 remain based on the stack pointer even after we redefine
9682 the CFA based on the frame pointer. We therefore need new
9683 DW_CFA_expressions to re-express the save slots with addresses
9684 based on the frame pointer. */
9685 rtx_insn
*insn
= get_last_insn ();
9686 gcc_assert (RTX_FRAME_RELATED_P (insn
));
9688 /* Add an explicit CFA definition if this was previously
9690 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
9692 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
, chain_offset
);
9693 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
9694 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
9697 /* Change the save slot expressions for the registers that
9698 we've already saved. */
9699 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg2
],
9700 hard_frame_pointer_rtx
, UNITS_PER_WORD
);
9701 aarch64_add_cfa_expression (insn
, regno_reg_rtx
[reg1
],
9702 hard_frame_pointer_rtx
, 0);
9704 aarch64_emit_stack_tie (hard_frame_pointer_rtx
);
9707 aarch64_save_callee_saves (bytes_below_sp
, frame
.saved_gprs
, true,
9709 if (maybe_ge (frame
.reg_offset
[VG_REGNUM
], 0))
9711 unsigned int saved_regs
[] = { VG_REGNUM
};
9712 aarch64_save_callee_saves (bytes_below_sp
, saved_regs
, true,
9715 if (maybe_ne (sve_callee_adjust
, 0))
9717 gcc_assert (!flag_stack_clash_protection
9718 || known_eq (initial_adjust
, 0)
9719 /* The VG save isn't shrink-wrapped and so serves as
9720 a probe of the initial allocation. */
9721 || known_eq (frame
.reg_offset
[VG_REGNUM
], bytes_below_sp
));
9722 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
,
9725 !frame_pointer_needed
, false);
9726 bytes_below_sp
-= sve_callee_adjust
;
9728 aarch64_save_callee_saves (bytes_below_sp
, frame
.saved_prs
, true,
9730 aarch64_save_callee_saves (bytes_below_sp
, frame
.saved_fprs
, true,
9733 /* We may need to probe the final adjustment if it is larger than the guard
9734 that is assumed by the called. */
9735 gcc_assert (known_eq (bytes_below_sp
, final_adjust
));
9736 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
9738 !frame_pointer_needed
, true);
9739 if (emit_frame_chain
&& maybe_ne (final_adjust
, 0))
9740 aarch64_emit_stack_tie (hard_frame_pointer_rtx
);
9742 /* Save the incoming value of PSTATE.SM, if required. Code further
9743 down does this for locally-streaming functions. */
9744 if (known_ge (frame
.old_svcr_offset
, 0)
9745 && !aarch64_cfun_enables_pstate_sm ())
9747 rtx mem
= aarch64_old_svcr_mem ();
9748 MEM_VOLATILE_P (mem
) = 1;
9751 rtx reg
= gen_rtx_REG (DImode
, IP0_REGNUM
);
9752 emit_insn (gen_aarch64_read_svcr (reg
));
9753 emit_move_insn (mem
, reg
);
9757 rtx old_r0
= NULL_RTX
, old_r1
= NULL_RTX
;
9758 auto &args
= crtl
->args
.info
;
9759 if (args
.aapcs_ncrn
> 0)
9761 old_r0
= gen_rtx_REG (DImode
, PROBE_STACK_FIRST_REGNUM
);
9762 emit_move_insn (old_r0
, gen_rtx_REG (DImode
, R0_REGNUM
));
9764 if (args
.aapcs_ncrn
> 1)
9766 old_r1
= gen_rtx_REG (DImode
, PROBE_STACK_SECOND_REGNUM
);
9767 emit_move_insn (old_r1
, gen_rtx_REG (DImode
, R1_REGNUM
));
9769 emit_insn (gen_aarch64_get_sme_state ());
9770 emit_move_insn (mem
, gen_rtx_REG (DImode
, R0_REGNUM
));
9772 emit_move_insn (gen_rtx_REG (DImode
, R0_REGNUM
), old_r0
);
9774 emit_move_insn (gen_rtx_REG (DImode
, R1_REGNUM
), old_r1
);
9778 /* Enable PSTATE.SM, if required. */
9779 if (aarch64_cfun_enables_pstate_sm ())
9781 rtx_insn
*guard_label
= nullptr;
9782 if (known_ge (cfun
->machine
->frame
.old_svcr_offset
, 0))
9784 /* The current function is streaming-compatible. Save the
9785 original state of PSTATE.SM. */
9786 rtx svcr
= gen_rtx_REG (DImode
, IP0_REGNUM
);
9787 emit_insn (gen_aarch64_read_svcr (svcr
));
9788 emit_move_insn (aarch64_old_svcr_mem (), svcr
);
9789 guard_label
= aarch64_guard_switch_pstate_sm (svcr
,
9792 aarch64_sme_mode_switch_regs args_switch
;
9793 auto &args
= crtl
->args
.info
;
9794 for (unsigned int i
= 0; i
< args
.num_sme_mode_switch_args
; ++i
)
9796 rtx x
= args
.sme_mode_switch_args
[i
];
9797 args_switch
.add_reg (GET_MODE (x
), REGNO (x
));
9799 args_switch
.emit_prologue ();
9800 emit_insn (gen_aarch64_smstart_sm ());
9801 args_switch
.emit_epilogue ();
9803 emit_label (guard_label
);
9807 /* Return TRUE if we can use a simple_return insn.
9809 This function checks whether the callee saved stack is empty, which
9810 means no restore actions are need. The pro_and_epilogue will use
9811 this to check whether shrink-wrapping opt is feasible. */
9814 aarch64_use_return_insn_p (void)
9816 if (!reload_completed
)
9822 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
9825 /* Generate the epilogue instructions for returning from a function.
9826 This is almost exactly the reverse of the prolog sequence, except
9827 that we need to insert barriers to avoid scheduling loads that read
9828 from a deallocated stack, and we optimize the unwind records by
9829 emitting them all together if possible. */
9831 aarch64_expand_epilogue (rtx_call_insn
*sibcall
)
9833 aarch64_frame
&frame
= cfun
->machine
->frame
;
9834 poly_int64 initial_adjust
= frame
.initial_adjust
;
9835 HOST_WIDE_INT callee_adjust
= frame
.callee_adjust
;
9836 poly_int64 final_adjust
= frame
.final_adjust
;
9837 poly_int64 sve_callee_adjust
= frame
.sve_callee_adjust
;
9838 poly_int64 bytes_below_hard_fp
= frame
.bytes_below_hard_fp
;
9839 unsigned reg1
= frame
.wb_pop_candidate1
;
9840 unsigned reg2
= frame
.wb_pop_candidate2
;
9843 /* A stack clash protection prologue may not have left EP0_REGNUM or
9844 EP1_REGNUM in a usable state. The same is true for allocations
9845 with an SVE component, since we then need both temporary registers
9846 for each allocation. For stack clash we are in a usable state if
9847 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9848 HOST_WIDE_INT guard_size
9849 = 1 << param_stack_clash_protection_guard_size
;
9850 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
9851 aarch64_feature_flags force_isa_mode
= 0;
9852 if (aarch64_cfun_enables_pstate_sm ())
9853 force_isa_mode
= AARCH64_FL_SM_ON
;
9855 /* We can re-use the registers when:
9857 (a) the deallocation amount is the same as the corresponding
9858 allocation amount (which is false if we combine the initial
9859 and SVE callee save allocations in the prologue); and
9861 (b) the allocation amount doesn't need a probe (which is false
9862 if the amount is guard_size - guard_used_by_caller or greater).
9864 In such situations the register should remain live with the correct
9866 bool can_inherit_p
= (initial_adjust
.is_constant ()
9867 && final_adjust
.is_constant ()
9868 && (!flag_stack_clash_protection
9869 || (known_lt (initial_adjust
,
9870 guard_size
- guard_used_by_caller
)
9871 && known_eq (sve_callee_adjust
, 0))));
9873 /* We need to add memory barrier to prevent read from deallocated stack. */
9875 = maybe_ne (get_frame_size ()
9876 + frame
.saved_varargs_size
, 0);
9878 /* Reset PSTATE.SM, if required. */
9879 if (aarch64_cfun_enables_pstate_sm ())
9881 rtx_insn
*guard_label
= nullptr;
9882 if (known_ge (cfun
->machine
->frame
.old_svcr_offset
, 0))
9883 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
9885 aarch64_sme_mode_switch_regs return_switch
;
9887 return_switch
.add_call_args (sibcall
);
9888 else if (crtl
->return_rtx
&& REG_P (crtl
->return_rtx
))
9889 return_switch
.add_reg (GET_MODE (crtl
->return_rtx
),
9890 REGNO (crtl
->return_rtx
));
9891 return_switch
.emit_prologue ();
9892 emit_insn (gen_aarch64_smstop_sm ());
9893 return_switch
.emit_epilogue ();
9895 emit_label (guard_label
);
9898 /* Emit a barrier to prevent loads from a deallocated stack. */
9899 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
9900 || cfun
->calls_alloca
9901 || crtl
->calls_eh_return
)
9903 aarch64_emit_stack_tie (stack_pointer_rtx
);
9904 need_barrier_p
= false;
9907 /* Restore the stack pointer from the frame pointer if it may not
9908 be the same as the stack pointer. */
9909 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
9910 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
9911 if (frame_pointer_needed
9912 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
9913 /* If writeback is used when restoring callee-saves, the CFA
9914 is restored on the instruction doing the writeback. */
9915 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
9916 hard_frame_pointer_rtx
,
9917 -bytes_below_hard_fp
+ final_adjust
,
9918 tmp1_rtx
, tmp0_rtx
, force_isa_mode
,
9919 callee_adjust
== 0);
9921 /* The case where we need to re-use the register here is very rare, so
9922 avoid the complicated condition and just always emit a move if the
9923 immediate doesn't fit. */
9924 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, force_isa_mode
, true);
9926 /* Restore the vector registers before the predicate registers,
9927 so that we can use P4 as a temporary for big-endian SVE frames. */
9928 aarch64_restore_callee_saves (final_adjust
, frame
.saved_fprs
, &cfi_ops
);
9929 aarch64_restore_callee_saves (final_adjust
, frame
.saved_prs
, &cfi_ops
);
9930 if (maybe_ne (sve_callee_adjust
, 0))
9931 aarch64_add_sp (NULL_RTX
, NULL_RTX
, sve_callee_adjust
,
9932 force_isa_mode
, true);
9934 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9935 restore x30, we don't need to restore x30 again in the traditional
9937 aarch64_restore_callee_saves (final_adjust
+ sve_callee_adjust
,
9938 frame
.saved_gprs
, &cfi_ops
);
9941 aarch64_emit_stack_tie (stack_pointer_rtx
);
9943 if (callee_adjust
!= 0)
9944 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
9946 /* If we have no register restore information, the CFA must have been
9947 defined in terms of the stack pointer since the end of the prologue. */
9948 gcc_assert (cfi_ops
|| !frame_pointer_needed
);
9950 if (cfi_ops
&& (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536)))
9952 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9953 insn
= get_last_insn ();
9954 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
9955 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
9956 RTX_FRAME_RELATED_P (insn
) = 1;
9960 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
9961 add restriction on emit_move optimization to leaf functions. */
9962 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
, force_isa_mode
,
9963 (!can_inherit_p
|| !crtl
->is_leaf
9964 || df_regs_ever_live_p (EP0_REGNUM
)));
9968 /* Emit delayed restores and reset the CFA to be SP. */
9969 insn
= get_last_insn ();
9970 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
9971 REG_NOTES (insn
) = cfi_ops
;
9972 RTX_FRAME_RELATED_P (insn
) = 1;
9975 /* Pop return address from shadow call stack. */
9976 if (frame
.is_scs_enabled
)
9978 machine_mode mode
= aarch64_reg_save_mode (R30_REGNUM
);
9979 rtx reg
= gen_rtx_REG (mode
, R30_REGNUM
);
9981 insn
= emit_insn (gen_scs_pop ());
9982 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9983 RTX_FRAME_RELATED_P (insn
) = 1;
9986 /* Stack adjustment for exception handler. */
9987 if (crtl
->calls_eh_return
&& !sibcall
)
9989 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
9990 to unwind the stack and jump to the handler, otherwise
9991 skip this eh_return logic and continue with normal
9992 return after the label. We have already reset the CFA
9993 to be SP; letting the CFA move during this adjustment
9994 is just as correct as retaining the CFA from the body
9995 of the function. Therefore, do nothing special. */
9996 rtx_code_label
*label
= gen_label_rtx ();
9997 rtx x
= aarch64_gen_compare_zero_and_branch (EQ
, EH_RETURN_TAKEN_RTX
,
9999 rtx jump
= emit_jump_insn (x
);
10000 JUMP_LABEL (jump
) = label
;
10001 LABEL_NUSES (label
)++;
10002 emit_insn (gen_add2_insn (stack_pointer_rtx
,
10003 EH_RETURN_STACKADJ_RTX
));
10004 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX
));
10006 emit_label (label
);
10009 /* We prefer to emit the combined return/authenticate instruction RETAA,
10010 however there are three cases in which we must instead emit an explicit
10011 authentication instruction.
10013 1) Sibcalls don't return in a normal way, so if we're about to call one
10014 we must authenticate.
10016 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10017 generating code for !TARGET_ARMV8_3 we can't use it and must
10018 explicitly authenticate.
10020 if (aarch64_return_address_signing_enabled ()
10021 && (sibcall
|| !TARGET_ARMV8_3
))
10023 switch (aarch64_ra_sign_key
)
10025 case AARCH64_KEY_A
:
10026 insn
= emit_insn (gen_autiasp ());
10028 case AARCH64_KEY_B
:
10029 insn
= emit_insn (gen_autibsp ());
10032 gcc_unreachable ();
10034 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
10035 RTX_FRAME_RELATED_P (insn
) = 1;
10038 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
10040 emit_jump_insn (ret_rtx
);
10043 /* Output code to add DELTA to the first argument, and then jump
10044 to FUNCTION. Used for C++ multiple inheritance. */
10046 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
10047 HOST_WIDE_INT delta
,
10048 HOST_WIDE_INT vcall_offset
,
10051 /* The this pointer is always in x0. Note that this differs from
10052 Arm where the this pointer maybe bumped to r1 if r0 is required
10053 to return a pointer to an aggregate. On AArch64 a result value
10054 pointer will be in x8. */
10055 int this_regno
= R0_REGNUM
;
10056 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
10058 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
10060 if (aarch_bti_enabled ())
10061 emit_insn (gen_bti_c());
10063 reload_completed
= 1;
10064 emit_note (NOTE_INSN_PROLOGUE_END
);
10066 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
10067 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
10068 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
10070 if (vcall_offset
== 0)
10071 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
,
10075 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
10080 if (delta
>= -256 && delta
< 256)
10081 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
10082 plus_constant (Pmode
, this_rtx
, delta
));
10084 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
10085 temp1
, temp0
, 0, false);
10088 if (Pmode
== ptr_mode
)
10089 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
10091 aarch64_emit_move (temp0
,
10092 gen_rtx_ZERO_EXTEND (Pmode
,
10093 gen_rtx_MEM (ptr_mode
, addr
)));
10095 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
10096 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
10099 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
10101 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
10104 if (Pmode
== ptr_mode
)
10105 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
10107 aarch64_emit_move (temp1
,
10108 gen_rtx_SIGN_EXTEND (Pmode
,
10109 gen_rtx_MEM (ptr_mode
, addr
)));
10111 emit_insn (gen_add2_insn (this_rtx
, temp1
));
10114 /* Generate a tail call to the target function. */
10115 if (!TREE_USED (function
))
10117 assemble_external (function
);
10118 TREE_USED (function
) = 1;
10120 funexp
= XEXP (DECL_RTL (function
), 0);
10121 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
10122 auto isa_mode
= aarch64_fntype_isa_mode (TREE_TYPE (function
));
10123 auto pcs_variant
= arm_pcs (fndecl_abi (function
).id ());
10124 rtx callee_abi
= aarch64_gen_callee_cookie (isa_mode
, pcs_variant
);
10125 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, callee_abi
));
10126 SIBLING_CALL_P (insn
) = 1;
10128 insn
= get_insns ();
10129 shorten_branches (insn
);
10131 assemble_start_function (thunk
, fnname
);
10132 final_start_function (insn
, file
, 1);
10133 final (insn
, file
, 1);
10134 final_end_function ();
10135 assemble_end_function (thunk
, fnname
);
10137 /* Stop pretending to be a post-reload pass. */
10138 reload_completed
= 0;
10142 aarch64_tls_referenced_p (rtx x
)
10144 if (!TARGET_HAVE_TLS
)
10146 subrtx_iterator::array_type array
;
10147 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
10149 const_rtx x
= *iter
;
10150 if (SYMBOL_REF_P (x
) && SYMBOL_REF_TLS_MODEL (x
) != 0)
10152 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10153 TLS offsets, not real symbol references. */
10154 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
10155 iter
.skip_subrtxes ();
10162 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
10164 if (GET_CODE (x
) == HIGH
)
10167 /* There's no way to calculate VL-based values using relocations. */
10168 subrtx_iterator::array_type array
;
10169 HOST_WIDE_INT factor
;
10170 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
10171 if (GET_CODE (*iter
) == CONST_POLY_INT
10172 || aarch64_sme_vq_unspec_p (x
, &factor
))
10176 rtx base
= strip_offset_and_salt (x
, &offset
);
10177 if (SYMBOL_REF_P (base
) || LABEL_REF_P (base
))
10179 /* We checked for POLY_INT_CST offsets above. */
10180 if (aarch64_classify_symbol (base
, offset
.to_constant ())
10181 != SYMBOL_FORCE_TO_MEM
)
10184 /* Avoid generating a 64-bit relocation in ILP32; leave
10185 to aarch64_expand_mov_immediate to handle it properly. */
10186 return mode
!= ptr_mode
;
10189 return aarch64_tls_referenced_p (x
);
10192 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10193 The expansion for a table switch is quite expensive due to the number
10194 of instructions, the table lookup and hard to predict indirect jump.
10195 When optimizing for speed, and -O3 enabled, use the per-core tuning if
10196 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10197 performance. When optimizing for size, use 8 for smallest codesize. */
10199 static unsigned int
10200 aarch64_case_values_threshold (void)
10202 /* Use the specified limit for the number of cases before using jump
10203 tables at higher optimization levels. */
10205 && aarch64_tune_params
.max_case_values
!= 0)
10206 return aarch64_tune_params
.max_case_values
;
10208 return optimize_size
? 8 : 11;
10211 /* Return true if register REGNO is a valid index register.
10212 STRICT_P is true if REG_OK_STRICT is in effect. */
10215 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
10217 if (!HARD_REGISTER_NUM_P (regno
))
10225 regno
= reg_renumber
[regno
];
10227 return GP_REGNUM_P (regno
);
10230 /* Return true if register REGNO is a valid base register for mode MODE.
10231 STRICT_P is true if REG_OK_STRICT is in effect. */
10234 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
10236 if (!HARD_REGISTER_NUM_P (regno
))
10244 regno
= reg_renumber
[regno
];
10247 /* The fake registers will be eliminated to either the stack or
10248 hard frame pointer, both of which are usually valid base registers.
10249 Reload deals with the cases where the eliminated form isn't valid. */
10250 return (GP_REGNUM_P (regno
)
10251 || regno
== SP_REGNUM
10252 || regno
== FRAME_POINTER_REGNUM
10253 || regno
== ARG_POINTER_REGNUM
);
10256 /* Return true if X is a valid base register for mode MODE.
10257 STRICT_P is true if REG_OK_STRICT is in effect. */
10260 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
10264 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
10265 x
= SUBREG_REG (x
);
10267 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
10270 /* Return true if address offset is a valid index. If it is, fill in INFO
10271 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10274 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
10275 machine_mode mode
, bool strict_p
)
10277 enum aarch64_address_type type
;
10282 if ((REG_P (x
) || SUBREG_P (x
))
10283 && GET_MODE (x
) == Pmode
)
10285 type
= ADDRESS_REG_REG
;
10289 /* (sign_extend:DI (reg:SI)) */
10290 else if ((GET_CODE (x
) == SIGN_EXTEND
10291 || GET_CODE (x
) == ZERO_EXTEND
)
10292 && GET_MODE (x
) == DImode
10293 && GET_MODE (XEXP (x
, 0)) == SImode
)
10295 type
= (GET_CODE (x
) == SIGN_EXTEND
)
10296 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10297 index
= XEXP (x
, 0);
10300 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10301 else if (GET_CODE (x
) == MULT
10302 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
10303 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
10304 && GET_MODE (XEXP (x
, 0)) == DImode
10305 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
10306 && CONST_INT_P (XEXP (x
, 1)))
10308 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
10309 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10310 index
= XEXP (XEXP (x
, 0), 0);
10311 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
10313 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10314 else if (GET_CODE (x
) == ASHIFT
10315 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
10316 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
10317 && GET_MODE (XEXP (x
, 0)) == DImode
10318 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
10319 && CONST_INT_P (XEXP (x
, 1)))
10321 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
10322 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
10323 index
= XEXP (XEXP (x
, 0), 0);
10324 shift
= INTVAL (XEXP (x
, 1));
10326 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10327 (const_int 0xffffffff<<shift)) */
10328 else if (GET_CODE (x
) == AND
10329 && GET_MODE (x
) == DImode
10330 && GET_CODE (XEXP (x
, 0)) == MULT
10331 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
10332 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10333 && CONST_INT_P (XEXP (x
, 1)))
10335 type
= ADDRESS_REG_UXTW
;
10336 index
= XEXP (XEXP (x
, 0), 0);
10337 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
10338 /* Avoid undefined code dealing with shift being -1. */
10340 && INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
10343 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10344 (const_int 0xffffffff<<shift)) */
10345 else if (GET_CODE (x
) == AND
10346 && GET_MODE (x
) == DImode
10347 && GET_CODE (XEXP (x
, 0)) == ASHIFT
10348 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
10349 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
10350 && CONST_INT_P (XEXP (x
, 1)))
10352 type
= ADDRESS_REG_UXTW
;
10353 index
= XEXP (XEXP (x
, 0), 0);
10354 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
10355 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
10358 /* (mult:P (reg:P) (const_int scale)) */
10359 else if (GET_CODE (x
) == MULT
10360 && GET_MODE (x
) == Pmode
10361 && GET_MODE (XEXP (x
, 0)) == Pmode
10362 && CONST_INT_P (XEXP (x
, 1)))
10364 type
= ADDRESS_REG_REG
;
10365 index
= XEXP (x
, 0);
10366 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
10368 /* (ashift:P (reg:P) (const_int shift)) */
10369 else if (GET_CODE (x
) == ASHIFT
10370 && GET_MODE (x
) == Pmode
10371 && GET_MODE (XEXP (x
, 0)) == Pmode
10372 && CONST_INT_P (XEXP (x
, 1)))
10374 type
= ADDRESS_REG_REG
;
10375 index
= XEXP (x
, 0);
10376 shift
= INTVAL (XEXP (x
, 1));
10382 && SUBREG_P (index
)
10383 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
10384 index
= SUBREG_REG (index
);
10386 if (aarch64_sve_data_mode_p (mode
) || mode
== VNx1TImode
)
10388 if (type
!= ADDRESS_REG_REG
10389 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
10395 && !(IN_RANGE (shift
, 1, 3)
10396 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
10401 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
10404 info
->offset
= index
;
10405 info
->shift
= shift
;
10412 /* Return true if MODE is one of the modes for which we
10413 support LDP/STP operations. */
10416 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
10418 return mode
== SImode
|| mode
== DImode
10419 || mode
== SFmode
|| mode
== DFmode
10420 || mode
== SDmode
|| mode
== DDmode
10421 || (aarch64_vector_mode_supported_p (mode
)
10422 && (known_eq (GET_MODE_SIZE (mode
), 8)
10423 || known_eq (GET_MODE_SIZE (mode
), 16)));
10426 /* Return true if REGNO is a virtual pointer register, or an eliminable
10427 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10428 include stack_pointer or hard_frame_pointer. */
10430 virt_or_elim_regno_p (unsigned regno
)
10432 return ((regno
>= FIRST_VIRTUAL_REGISTER
10433 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
10434 || regno
== FRAME_POINTER_REGNUM
10435 || regno
== ARG_POINTER_REGNUM
);
10438 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10439 If it is, fill in INFO appropriately. STRICT_P is true if
10440 REG_OK_STRICT is in effect. */
10443 aarch64_classify_address (struct aarch64_address_info
*info
,
10444 rtx x
, machine_mode mode
, bool strict_p
,
10445 aarch64_addr_query_type type
)
10447 enum rtx_code code
= GET_CODE (x
);
10451 HOST_WIDE_INT const_size
;
10453 /* Whether a vector mode is partial doesn't affect address legitimacy.
10454 Partial vectors like VNx8QImode allow the same indexed addressing
10455 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10456 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10457 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
10458 vec_flags
&= ~VEC_PARTIAL
;
10460 /* On BE, we use load/store pair for all large int mode load/stores.
10461 TI/TF/TDmode may also use a load/store pair. */
10462 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
10463 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
10464 || type
== ADDR_QUERY_LDP_STP_N
10468 || ((!TARGET_SIMD
|| BYTES_BIG_ENDIAN
)
10469 && advsimd_struct_p
));
10470 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
10471 corresponds to the actual size of the memory being loaded/stored and the
10472 mode of the corresponding addressing mode is half of that. */
10473 if (type
== ADDR_QUERY_LDP_STP_N
)
10475 if (known_eq (GET_MODE_SIZE (mode
), 32))
10477 else if (known_eq (GET_MODE_SIZE (mode
), 16))
10479 else if (known_eq (GET_MODE_SIZE (mode
), 8))
10484 /* This isn't really an Advanced SIMD struct mode, but a mode
10485 used to represent the complete mem in a load/store pair. */
10486 advsimd_struct_p
= false;
10489 bool allow_reg_index_p
= (!load_store_pair_p
10490 && ((vec_flags
== 0
10491 && known_lt (GET_MODE_SIZE (mode
), 16))
10492 || vec_flags
== VEC_ADVSIMD
10493 || vec_flags
& VEC_SVE_DATA
10494 || mode
== VNx1TImode
));
10496 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10497 The latter is not valid for SVE predicates, and that's rejected through
10498 allow_reg_index_p above. */
10499 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
10500 && (code
!= REG
&& code
!= PLUS
))
10503 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10505 if (advsimd_struct_p
10507 && !BYTES_BIG_ENDIAN
10508 && (code
!= POST_INC
&& code
!= REG
))
10511 gcc_checking_assert (GET_MODE (x
) == VOIDmode
10512 || SCALAR_INT_MODE_P (GET_MODE (x
)));
10518 info
->type
= ADDRESS_REG_IMM
;
10520 info
->offset
= const0_rtx
;
10521 info
->const_offset
= 0;
10522 return aarch64_base_register_rtx_p (x
, strict_p
);
10530 && virt_or_elim_regno_p (REGNO (op0
))
10531 && poly_int_rtx_p (op1
, &offset
))
10533 info
->type
= ADDRESS_REG_IMM
;
10535 info
->offset
= op1
;
10536 info
->const_offset
= offset
;
10541 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
10542 && aarch64_base_register_rtx_p (op0
, strict_p
)
10543 && poly_int_rtx_p (op1
, &offset
))
10545 info
->type
= ADDRESS_REG_IMM
;
10547 info
->offset
= op1
;
10548 info
->const_offset
= offset
;
10550 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10551 registers and individual Q registers. The available
10553 X,X: 7-bit signed scaled offset
10554 Q: 9-bit signed offset
10555 We conservatively require an offset representable in either mode.
10556 When performing the check for pairs of X registers i.e. LDP/STP
10557 pass down DImode since that is the natural size of the LDP/STP
10558 instruction memory accesses. */
10559 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
10560 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10561 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
10562 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
10564 if (mode
== V8DImode
)
10565 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10566 && aarch64_offset_7bit_signed_scaled_p (DImode
, offset
+ 48));
10568 /* A 7bit offset check because OImode will emit a ldp/stp
10569 instruction (only !TARGET_SIMD or big endian will get here).
10570 For ldp/stp instructions, the offset is scaled for the size of a
10571 single element of the pair. */
10572 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10573 && known_eq (GET_MODE_SIZE (mode
), 16))
10574 return aarch64_offset_7bit_signed_scaled_p (DImode
, offset
);
10575 if (aarch64_advsimd_full_struct_mode_p (mode
)
10576 && known_eq (GET_MODE_SIZE (mode
), 32))
10577 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
10579 /* Three 9/12 bit offsets checks because CImode will emit three
10580 ldr/str instructions (only !TARGET_SIMD or big endian will
10582 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10583 && known_eq (GET_MODE_SIZE (mode
), 24))
10584 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10585 && (aarch64_offset_9bit_signed_unscaled_p (DImode
,
10587 || offset_12bit_unsigned_scaled_p (DImode
,
10589 if (aarch64_advsimd_full_struct_mode_p (mode
)
10590 && known_eq (GET_MODE_SIZE (mode
), 48))
10591 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
10592 && (aarch64_offset_9bit_signed_unscaled_p (TImode
,
10594 || offset_12bit_unsigned_scaled_p (TImode
,
10597 /* Two 7bit offsets checks because XImode will emit two ldp/stp
10598 instructions (only big endian will get here). */
10599 if (aarch64_advsimd_partial_struct_mode_p (mode
)
10600 && known_eq (GET_MODE_SIZE (mode
), 32))
10601 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
10602 && aarch64_offset_7bit_signed_scaled_p (DImode
,
10604 if (aarch64_advsimd_full_struct_mode_p (mode
)
10605 && known_eq (GET_MODE_SIZE (mode
), 64))
10606 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
10607 && aarch64_offset_7bit_signed_scaled_p (TImode
,
10610 /* Make "m" use the LD1 offset range for SVE data modes, so
10611 that pre-RTL optimizers like ivopts will work to that
10612 instead of the wider LDR/STR range. */
10613 if (vec_flags
== VEC_SVE_DATA
|| mode
== VNx1TImode
)
10614 return (type
== ADDR_QUERY_M
10615 ? offset_4bit_signed_scaled_p (mode
, offset
)
10616 : offset_9bit_signed_scaled_p (mode
, offset
));
10618 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
10620 poly_int64 end_offset
= (offset
10621 + GET_MODE_SIZE (mode
)
10622 - BYTES_PER_SVE_VECTOR
);
10623 return (type
== ADDR_QUERY_M
10624 ? offset_4bit_signed_scaled_p (mode
, offset
)
10625 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
10626 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
10630 if (vec_flags
== VEC_SVE_PRED
)
10631 return offset_9bit_signed_scaled_p (mode
, offset
);
10633 if (vec_flags
== (VEC_SVE_PRED
| VEC_STRUCT
))
10635 poly_int64 end_offset
= (offset
10636 + GET_MODE_SIZE (mode
)
10637 - BYTES_PER_SVE_PRED
);
10638 return (offset_9bit_signed_scaled_p (VNx16BImode
, end_offset
)
10639 && offset_9bit_signed_scaled_p (VNx16BImode
, offset
));
10642 if (load_store_pair_p
)
10643 return ((known_eq (GET_MODE_SIZE (mode
), 4)
10644 || known_eq (GET_MODE_SIZE (mode
), 8)
10645 || known_eq (GET_MODE_SIZE (mode
), 16))
10646 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
10648 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
10649 || offset_12bit_unsigned_scaled_p (mode
, offset
));
10652 if (allow_reg_index_p
)
10654 /* Look for base + (scaled/extended) index register. */
10655 if (aarch64_base_register_rtx_p (op0
, strict_p
)
10656 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
10661 if (aarch64_base_register_rtx_p (op1
, strict_p
)
10662 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
10675 info
->type
= ADDRESS_REG_WB
;
10676 info
->base
= XEXP (x
, 0);
10677 info
->offset
= NULL_RTX
;
10678 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
10682 info
->type
= ADDRESS_REG_WB
;
10683 info
->base
= XEXP (x
, 0);
10684 if (GET_CODE (XEXP (x
, 1)) == PLUS
10685 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
10686 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
10687 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
10689 info
->offset
= XEXP (XEXP (x
, 1), 1);
10690 info
->const_offset
= offset
;
10692 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10693 registers and individual Q registers. The available
10695 X,X: 7-bit signed scaled offset
10696 Q: 9-bit signed offset
10697 We conservatively require an offset representable in either mode.
10699 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
10700 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
10701 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
10703 if (load_store_pair_p
)
10704 return ((known_eq (GET_MODE_SIZE (mode
), 4)
10705 || known_eq (GET_MODE_SIZE (mode
), 8)
10706 || known_eq (GET_MODE_SIZE (mode
), 16))
10707 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
10709 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
10716 /* load literal: pc-relative constant pool entry. Only supported
10717 for SI mode or larger. */
10718 info
->type
= ADDRESS_SYMBOLIC
;
10720 if (!load_store_pair_p
10721 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
10722 && const_size
>= 4)
10725 rtx sym
= strip_offset_and_salt (x
, &offset
);
10726 return ((LABEL_REF_P (sym
)
10727 || (SYMBOL_REF_P (sym
)
10728 && CONSTANT_POOL_ADDRESS_P (sym
)
10729 && aarch64_pcrelative_literal_loads
)));
10734 info
->type
= ADDRESS_LO_SUM
;
10735 info
->base
= XEXP (x
, 0);
10736 info
->offset
= XEXP (x
, 1);
10737 if (allow_reg_index_p
10738 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
10741 HOST_WIDE_INT const_offset
;
10742 rtx sym
= strip_offset_and_salt (info
->offset
, &offset
);
10743 if (SYMBOL_REF_P (sym
)
10744 && offset
.is_constant (&const_offset
)
10745 && (aarch64_classify_symbol (sym
, const_offset
)
10746 == SYMBOL_SMALL_ABSOLUTE
))
10748 /* The symbol and offset must be aligned to the access size. */
10749 unsigned int align
;
10751 if (CONSTANT_POOL_ADDRESS_P (sym
))
10752 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
10753 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
10755 tree exp
= SYMBOL_REF_DECL (sym
);
10756 align
= TYPE_ALIGN (TREE_TYPE (exp
));
10757 align
= aarch64_constant_alignment (exp
, align
);
10759 else if (SYMBOL_REF_DECL (sym
))
10760 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
10761 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
10762 && SYMBOL_REF_BLOCK (sym
) != NULL
)
10763 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
10765 align
= BITS_PER_UNIT
;
10767 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
10768 if (known_eq (ref_size
, 0))
10769 ref_size
= GET_MODE_SIZE (DImode
);
10771 return (multiple_p (const_offset
, ref_size
)
10772 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
10782 /* Return true if the address X is valid for a PRFM instruction.
10783 STRICT_P is true if we should do strict checking with
10784 aarch64_classify_address. */
10787 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
10789 struct aarch64_address_info addr
;
10791 /* PRFM accepts the same addresses as DImode... */
10792 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
10796 /* ... except writeback forms. */
10797 return addr
.type
!= ADDRESS_REG_WB
;
10801 aarch64_symbolic_address_p (rtx x
)
10804 x
= strip_offset_and_salt (x
, &offset
);
10805 return SYMBOL_REF_P (x
) || LABEL_REF_P (x
);
10808 /* Classify the base of symbolic expression X. */
10810 enum aarch64_symbol_type
10811 aarch64_classify_symbolic_expression (rtx x
)
10815 split_const (x
, &x
, &offset
);
10816 return aarch64_classify_symbol (x
, INTVAL (offset
));
10820 /* Return TRUE if X is a legitimate address for accessing memory in
10823 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
,
10824 code_helper
= ERROR_MARK
)
10826 struct aarch64_address_info addr
;
10828 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
10831 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10832 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10834 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
10835 aarch64_addr_query_type type
)
10837 struct aarch64_address_info addr
;
10839 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
10842 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10845 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
10846 poly_int64 orig_offset
,
10849 HOST_WIDE_INT size
;
10850 if (GET_MODE_SIZE (mode
).is_constant (&size
))
10852 HOST_WIDE_INT const_offset
, second_offset
;
10854 /* A general SVE offset is A * VQ + B. Remove the A component from
10855 coefficient 0 in order to get the constant B. */
10856 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
10858 /* Split an out-of-range address displacement into a base and
10859 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10860 range otherwise to increase opportunities for sharing the base
10861 address of different sizes. Unaligned accesses use the signed
10862 9-bit range, TImode/TFmode/TDmode use the intersection of signed
10863 scaled 7-bit and signed 9-bit offset. */
10864 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
10865 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
10866 else if ((const_offset
& (size
- 1)) != 0)
10867 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
10869 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
10871 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
10874 /* Split the offset into second_offset and the rest. */
10875 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
10876 *offset2
= gen_int_mode (second_offset
, Pmode
);
10881 /* Get the mode we should use as the basis of the range. For structure
10882 modes this is the mode of one vector. */
10883 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
10884 machine_mode step_mode
10885 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
10887 /* Get the "mul vl" multiplier we'd like to use. */
10888 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
10889 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
10890 if (vec_flags
& VEC_SVE_DATA
)
10891 /* LDR supports a 9-bit range, but the move patterns for
10892 structure modes require all vectors to be in range of the
10893 same base. The simplest way of accomodating that while still
10894 promoting reuse of anchor points between different modes is
10895 to use an 8-bit range unconditionally. */
10896 vnum
= ((vnum
+ 128) & 255) - 128;
10898 /* Predicates are only handled singly, so we might as well use
10900 vnum
= ((vnum
+ 256) & 511) - 256;
10904 /* Convert the "mul vl" multiplier into a byte offset. */
10905 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
10906 if (known_eq (second_offset
, orig_offset
))
10909 /* Split the offset into second_offset and the rest. */
10910 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
10911 *offset2
= gen_int_mode (second_offset
, Pmode
);
10916 /* Return the binary representation of floating point constant VALUE in INTVAL.
10917 If the value cannot be converted, return false without setting INTVAL.
10918 The conversion is done in the given MODE. */
10920 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
10923 /* We make a general exception for 0. */
10924 if (aarch64_float_const_zero_rtx_p (value
))
10930 scalar_float_mode mode
;
10931 if (!CONST_DOUBLE_P (value
)
10932 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
10933 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
10934 /* Only support up to DF mode. */
10935 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
10938 unsigned HOST_WIDE_INT ival
= 0;
10941 real_to_target (res
,
10942 CONST_DOUBLE_REAL_VALUE (value
),
10943 REAL_MODE_FORMAT (mode
));
10945 if (mode
== DFmode
|| mode
== DDmode
)
10947 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
10948 ival
= zext_hwi (res
[order
], 32);
10949 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
10952 ival
= zext_hwi (res
[0], 32);
10958 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10959 single MOV(+MOVK) followed by an FMOV. */
10961 aarch64_float_const_rtx_p (rtx x
)
10963 machine_mode mode
= GET_MODE (x
);
10964 if (mode
== VOIDmode
)
10967 /* Determine whether it's cheaper to write float constants as
10968 mov/movk pairs over ldr/adrp pairs. */
10969 unsigned HOST_WIDE_INT ival
;
10971 if (CONST_DOUBLE_P (x
)
10972 && SCALAR_FLOAT_MODE_P (mode
)
10973 && aarch64_reinterpret_float_as_int (x
, &ival
))
10975 machine_mode imode
= known_eq (GET_MODE_SIZE (mode
), 8) ? DImode
: SImode
;
10976 int num_instr
= aarch64_internal_mov_immediate
10977 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
10978 return num_instr
< 3;
10984 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
10985 Floating Point). */
10987 aarch64_float_const_zero_rtx_p (rtx x
)
10989 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
10990 zr as our callers expect, so no need to check the actual
10991 value if X is of Decimal Floating Point type. */
10992 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_DECIMAL_FLOAT
)
10995 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
10996 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
10997 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
11000 /* Return true if X is any kind of constant zero rtx. */
11003 aarch64_const_zero_rtx_p (rtx x
)
11005 return (x
== CONST0_RTX (GET_MODE (x
))
11006 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)));
11009 /* Return TRUE if rtx X is immediate constant that fits in a single
11010 MOVI immediate operation. */
11012 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
11017 machine_mode vmode
;
11018 scalar_int_mode imode
;
11019 unsigned HOST_WIDE_INT ival
;
11021 if (CONST_DOUBLE_P (x
)
11022 && SCALAR_FLOAT_MODE_P (mode
))
11024 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
11027 /* We make a general exception for 0. */
11028 if (aarch64_float_const_zero_rtx_p (x
))
11031 imode
= int_mode_for_mode (mode
).require ();
11033 else if (CONST_INT_P (x
)
11034 && is_a
<scalar_int_mode
> (mode
, &imode
))
11039 /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
11040 a 128 bit vector mode. */
11041 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
11043 vmode
= aarch64_simd_container_mode (imode
, width
);
11044 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
11046 return aarch64_simd_valid_immediate (v_op
, NULL
);
11050 /* Return the fixed registers used for condition codes. */
11053 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
11056 *p2
= INVALID_REGNUM
;
11060 /* Return a fresh memory reference to the current function's TPIDR2 block,
11061 creating a block if necessary. */
11064 aarch64_get_tpidr2_block ()
11066 if (!cfun
->machine
->tpidr2_block
)
11067 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
11069 cfun
->machine
->tpidr2_block
= assign_stack_local (V16QImode
, 16, 128);
11070 return copy_rtx (cfun
->machine
->tpidr2_block
);
11073 /* Return a fresh register that points to the current function's
11074 TPIDR2 block, creating a block if necessary. */
11077 aarch64_get_tpidr2_ptr ()
11079 rtx block
= aarch64_get_tpidr2_block ();
11080 return force_reg (Pmode
, XEXP (block
, 0));
11083 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
11084 current function's TPIDR2 block. */
11087 aarch64_init_tpidr2_block ()
11089 rtx block
= aarch64_get_tpidr2_block ();
11091 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
11092 rtx svl_bytes
= aarch64_sme_vq_immediate (Pmode
, 16, AARCH64_ISA_MODE
);
11093 rtx svl_bytes_reg
= force_reg (DImode
, svl_bytes
);
11094 rtx za_size
= expand_simple_binop (Pmode
, MULT
, svl_bytes_reg
,
11095 svl_bytes_reg
, NULL
, 0, OPTAB_LIB_WIDEN
);
11096 rtx za_save_buffer
= allocate_dynamic_stack_space (za_size
, 128,
11097 BITS_PER_UNIT
, -1, true);
11098 za_save_buffer
= force_reg (Pmode
, za_save_buffer
);
11099 cfun
->machine
->za_save_buffer
= za_save_buffer
;
11101 /* The first word of the block points to the save buffer and the second
11102 word is the number of ZA slices to save. */
11103 rtx block_0
= adjust_address (block
, DImode
, 0);
11104 emit_insn (aarch64_gen_store_pair (block_0
, za_save_buffer
, svl_bytes_reg
));
11106 if (!memory_operand (block
, V16QImode
))
11107 block
= replace_equiv_address (block
, force_reg (Pmode
, XEXP (block
, 0)));
11108 emit_insn (gen_aarch64_setup_local_tpidr2 (block
));
11111 /* Restore the contents of ZA from the lazy save buffer, given that
11112 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
11113 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11116 aarch64_restore_za (rtx tpidr2_block
)
11118 emit_insn (gen_aarch64_smstart_za ());
11119 if (REGNO (tpidr2_block
) != R0_REGNUM
)
11120 emit_move_insn (gen_rtx_REG (Pmode
, R0_REGNUM
), tpidr2_block
);
11121 emit_insn (gen_aarch64_tpidr2_restore ());
11124 /* Return the ZT0 save buffer, creating one if necessary. */
11127 aarch64_get_zt0_save_buffer ()
11129 if (!cfun
->machine
->zt0_save_buffer
)
11130 cfun
->machine
->zt0_save_buffer
= assign_stack_local (V8DImode
, 64, 128);
11131 return cfun
->machine
->zt0_save_buffer
;
11134 /* Save ZT0 to the current function's save buffer. */
11137 aarch64_save_zt0 ()
11139 rtx mem
= aarch64_get_zt0_save_buffer ();
11140 mem
= replace_equiv_address (mem
, force_reg (Pmode
, XEXP (mem
, 0)));
11141 emit_insn (gen_aarch64_sme_str_zt0 (mem
));
11144 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11145 is true if the load is happening after a call to a private-ZA function,
11146 false if it can be treated as a normal load. */
11149 aarch64_restore_zt0 (bool from_lazy_save_p
)
11151 rtx mem
= aarch64_get_zt0_save_buffer ();
11152 mem
= replace_equiv_address (mem
, force_reg (Pmode
, XEXP (mem
, 0)));
11153 emit_insn (from_lazy_save_p
11154 ? gen_aarch64_restore_zt0 (mem
)
11155 : gen_aarch64_sme_ldr_zt0 (mem
));
11158 /* Implement TARGET_START_CALL_ARGS. */
11161 aarch64_start_call_args (cumulative_args_t ca_v
)
11163 CUMULATIVE_ARGS
*ca
= get_cumulative_args (ca_v
);
11165 if (!TARGET_SME
&& (ca
->isa_mode
& AARCH64_FL_SM_ON
))
11167 error ("calling a streaming function requires the ISA extension %qs",
11169 inform (input_location
, "you can enable %qs using the command-line"
11170 " option %<-march%>, or by using the %<target%>"
11171 " attribute or pragma", "sme");
11174 if ((ca
->shared_za_flags
& (AARCH64_STATE_IN
| AARCH64_STATE_OUT
))
11175 && !aarch64_cfun_has_state ("za"))
11176 error ("call to a function that shares %qs state from a function"
11177 " that has no %qs state", "za", "za");
11178 else if ((ca
->shared_zt0_flags
& (AARCH64_STATE_IN
| AARCH64_STATE_OUT
))
11179 && !aarch64_cfun_has_state ("zt0"))
11180 error ("call to a function that shares %qs state from a function"
11181 " that has no %qs state", "zt0", "zt0");
11182 else if (!TARGET_ZA
&& (ca
->isa_mode
& AARCH64_FL_ZA_ON
))
11183 error ("call to a function that shares SME state from a function"
11184 " that has no SME state");
11186 /* If this is a call to a private ZA function, emit a marker to
11187 indicate where any necessary set-up code could be inserted.
11188 The code itself is inserted by the mode-switching pass. */
11189 if (TARGET_ZA
&& !(ca
->isa_mode
& AARCH64_FL_ZA_ON
))
11190 emit_insn (gen_aarch64_start_private_za_call ());
11192 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11193 save and restore ZT0 around the call. */
11194 if (aarch64_cfun_has_state ("zt0")
11195 && (ca
->isa_mode
& AARCH64_FL_ZA_ON
)
11196 && ca
->shared_zt0_flags
== 0)
11197 aarch64_save_zt0 ();
11200 /* This function is used by the call expanders of the machine description.
11201 RESULT is the register in which the result is returned. It's NULL for
11202 "call" and "sibcall".
11203 MEM is the location of the function call.
11205 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11206 - a PARALLEL that contains such a const_int as its first element.
11207 The second element is a PARALLEL that lists all the argument
11208 registers that need to be saved and restored around a change
11209 in PSTATE.SM, or const0_rtx if no such switch is needed.
11210 The third and fourth elements are const_ints that contain the
11211 sharing flags for ZA and ZT0 respectively.
11212 SIBCALL indicates whether this function call is normal call or sibling call.
11213 It will generate different pattern accordingly. */
11216 aarch64_expand_call (rtx result
, rtx mem
, rtx cookie
, bool sibcall
)
11218 rtx call
, callee
, tmp
;
11222 rtx callee_abi
= cookie
;
11223 rtx sme_mode_switch_args
= const0_rtx
;
11224 unsigned int shared_za_flags
= 0;
11225 unsigned int shared_zt0_flags
= 0;
11226 if (GET_CODE (cookie
) == PARALLEL
)
11228 callee_abi
= XVECEXP (cookie
, 0, 0);
11229 sme_mode_switch_args
= XVECEXP (cookie
, 0, 1);
11230 shared_za_flags
= INTVAL (XVECEXP (cookie
, 0, 2));
11231 shared_zt0_flags
= INTVAL (XVECEXP (cookie
, 0, 3));
11234 gcc_assert (CONST_INT_P (callee_abi
));
11235 auto callee_isa_mode
= aarch64_callee_isa_mode (callee_abi
);
11237 if (aarch64_cfun_has_state ("za")
11238 && (callee_isa_mode
& AARCH64_FL_ZA_ON
)
11239 && !shared_za_flags
)
11241 sorry ("call to a function that shares state other than %qs"
11242 " from a function that has %qs state", "za", "za");
11243 inform (input_location
, "use %<__arm_preserves(\"za\")%> if the"
11244 " callee preserves ZA");
11247 gcc_assert (MEM_P (mem
));
11248 callee
= XEXP (mem
, 0);
11251 tmp
= legitimize_pe_coff_symbol (callee
, false);
11256 mode
= GET_MODE (callee
);
11257 gcc_assert (mode
== Pmode
);
11259 /* Decide if we should generate indirect calls by loading the
11260 address of the callee into a register before performing
11261 the branch-and-link. */
11262 if (SYMBOL_REF_P (callee
)
11263 ? (aarch64_is_long_call_p (callee
)
11264 || aarch64_is_noplt_call_p (callee
))
11266 XEXP (mem
, 0) = force_reg (mode
, callee
);
11268 /* Accumulate the return values, including state that is shared via
11270 auto_vec
<rtx
, 8> return_values
;
11273 if (GET_CODE (result
) == PARALLEL
)
11274 for (int i
= 0; i
< XVECLEN (result
, 0); ++i
)
11275 return_values
.safe_push (XVECEXP (result
, 0, i
));
11277 return_values
.safe_push (result
);
11279 unsigned int orig_num_return_values
= return_values
.length ();
11280 if (shared_za_flags
& AARCH64_STATE_OUT
)
11281 return_values
.safe_push (gen_rtx_REG (VNx16BImode
, ZA_REGNUM
));
11282 /* When calling private-ZA functions from functions with ZA state,
11283 we want to know whether the call committed a lazy save. */
11284 if (TARGET_ZA
&& !shared_za_flags
)
11285 return_values
.safe_push (gen_rtx_REG (VNx16BImode
, ZA_SAVED_REGNUM
));
11286 if (shared_zt0_flags
& AARCH64_STATE_OUT
)
11287 return_values
.safe_push (gen_rtx_REG (V8DImode
, ZT0_REGNUM
));
11289 /* Create the new return value, if necessary. */
11290 if (orig_num_return_values
!= return_values
.length ())
11292 if (return_values
.length () == 1)
11293 result
= return_values
[0];
11296 for (rtx
&x
: return_values
)
11297 if (GET_CODE (x
) != EXPR_LIST
)
11298 x
= gen_rtx_EXPR_LIST (VOIDmode
, x
, const0_rtx
);
11299 rtvec v
= gen_rtvec_v (return_values
.length (),
11300 return_values
.address ());
11301 result
= gen_rtx_PARALLEL (VOIDmode
, v
);
11305 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
11307 if (result
!= NULL_RTX
)
11308 call
= gen_rtx_SET (result
, call
);
11313 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
11315 callee_abi
= gen_rtx_UNSPEC (DImode
, gen_rtvec (1, callee_abi
),
11316 UNSPEC_CALLEE_ABI
);
11318 vec
= gen_rtvec (3, call
, callee_abi
, tmp
);
11319 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
11321 auto call_insn
= aarch64_emit_call_insn (call
);
11323 /* Check whether the call requires a change to PSTATE.SM. We can't
11324 emit the instructions to change PSTATE.SM yet, since they involve
11325 a change in vector length and a change in instruction set, which
11326 cannot be represented in RTL.
11328 For now, just record which registers will be clobbered and used
11329 by the changes to PSTATE.SM. */
11330 if (!sibcall
&& aarch64_call_switches_pstate_sm (callee_isa_mode
))
11332 aarch64_sme_mode_switch_regs args_switch
;
11333 if (sme_mode_switch_args
!= const0_rtx
)
11335 unsigned int num_args
= XVECLEN (sme_mode_switch_args
, 0);
11336 for (unsigned int i
= 0; i
< num_args
; ++i
)
11338 rtx x
= XVECEXP (sme_mode_switch_args
, 0, i
);
11339 args_switch
.add_reg (GET_MODE (x
), REGNO (x
));
11343 aarch64_sme_mode_switch_regs result_switch
;
11345 result_switch
.add_call_result (call_insn
);
11347 unsigned int num_gprs
= MAX (args_switch
.num_gprs (),
11348 result_switch
.num_gprs ());
11349 for (unsigned int i
= 0; i
< num_gprs
; ++i
)
11350 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11351 gen_rtx_REG (DImode
, args_switch
.FIRST_GPR
+ i
));
11353 for (int regno
= V0_REGNUM
; regno
< V0_REGNUM
+ 32; regno
+= 4)
11354 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11355 gen_rtx_REG (V4x16QImode
, regno
));
11357 for (int regno
= P0_REGNUM
; regno
< P0_REGNUM
+ 16; regno
+= 1)
11358 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11359 gen_rtx_REG (VNx16BImode
, regno
));
11361 /* Ensure that the VG save slot has been initialized. Also emit
11362 an instruction to model the effect of the temporary clobber
11363 of VG, so that the prologue/epilogue pass sees the need to
11364 save the old value. */
11365 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11366 gen_rtx_REG (DImode
, VG_REGNUM
));
11367 emit_insn_before (gen_aarch64_update_vg (), call_insn
);
11369 cfun
->machine
->call_switches_pstate_sm
= true;
11372 /* Add any ZA-related information.
11374 ZA_REGNUM represents the current function's ZA state, rather than
11375 the contents of the ZA register itself. We ensure that the function's
11376 ZA state is preserved by private-ZA call sequences, so the call itself
11377 does not use or clobber ZA_REGNUM. The same thing applies to
11381 /* The callee requires ZA to be active if the callee is shared-ZA,
11382 otherwise it requires ZA to be dormant or off. The state of ZA is
11383 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11384 and ZA_SAVED_REGNUM. */
11385 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11386 gen_rtx_REG (DImode
, SME_STATE_REGNUM
));
11387 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11388 gen_rtx_REG (DImode
, TPIDR2_SETUP_REGNUM
));
11389 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11390 gen_rtx_REG (VNx16BImode
, ZA_SAVED_REGNUM
));
11392 /* Keep the aarch64_start/end_private_za_call markers live. */
11393 if (!(callee_isa_mode
& AARCH64_FL_ZA_ON
))
11394 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11395 gen_rtx_REG (VNx16BImode
, LOWERING_REGNUM
));
11397 /* If the callee is a shared-ZA function, record whether it uses the
11398 current value of ZA and ZT0. */
11399 if (shared_za_flags
& AARCH64_STATE_IN
)
11400 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11401 gen_rtx_REG (VNx16BImode
, ZA_REGNUM
));
11403 if (shared_zt0_flags
& AARCH64_STATE_IN
)
11404 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11405 gen_rtx_REG (V8DImode
, ZT0_REGNUM
));
11409 /* Implement TARGET_END_CALL_ARGS. */
11412 aarch64_end_call_args (cumulative_args_t ca_v
)
11414 CUMULATIVE_ARGS
*ca
= get_cumulative_args (ca_v
);
11416 /* If this is a call to a private ZA function, emit a marker to
11417 indicate where any necessary restoration code could be inserted.
11418 The code itself is inserted by the mode-switching pass. */
11419 if (TARGET_ZA
&& !(ca
->isa_mode
& AARCH64_FL_ZA_ON
))
11420 emit_insn (gen_aarch64_end_private_za_call ());
11422 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11423 save and restore ZT0 around the call. */
11424 if (aarch64_cfun_has_state ("zt0")
11425 && (ca
->isa_mode
& AARCH64_FL_ZA_ON
)
11426 && ca
->shared_zt0_flags
== 0)
11427 aarch64_restore_zt0 (false);
11430 /* Emit call insn with PAT and do aarch64-specific handling. */
11433 aarch64_emit_call_insn (rtx pat
)
11435 auto insn
= emit_call_insn (pat
);
11437 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
11438 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
11439 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
11440 return as_a
<rtx_call_insn
*> (insn
);
11444 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
11446 machine_mode mode_x
= GET_MODE (x
);
11447 rtx_code code_x
= GET_CODE (x
);
11449 /* All floating point compares return CCFP if it is an equality
11450 comparison, and CCFPE otherwise. */
11451 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
11474 gcc_unreachable ();
11478 /* Equality comparisons of short modes against zero can be performed
11479 using the TST instruction with the appropriate bitmask. */
11480 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
11481 && (code
== EQ
|| code
== NE
)
11482 && (mode_x
== HImode
|| mode_x
== QImode
))
11485 /* Similarly, comparisons of zero_extends from shorter modes can
11486 be performed using an ANDS with an immediate mask. */
11487 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
11488 && (mode_x
== SImode
|| mode_x
== DImode
)
11489 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
11490 && (code
== EQ
|| code
== NE
))
11493 /* Zero extracts support equality comparisons. */
11494 if ((mode_x
== SImode
|| mode_x
== DImode
)
11496 && (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
11497 && CONST_INT_P (XEXP (x
, 2)))
11498 && (code
== EQ
|| code
== NE
))
11501 /* ANDS/BICS/TST support equality and all signed comparisons. */
11502 if ((mode_x
== SImode
|| mode_x
== DImode
)
11505 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
11506 || code
== GT
|| code
== LE
))
11509 /* ADDS/SUBS correctly set N and Z flags. */
11510 if ((mode_x
== SImode
|| mode_x
== DImode
)
11512 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
11513 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== NEG
))
11516 /* A compare with a shifted operand. Because of canonicalization,
11517 the comparison will have to be swapped when we emit the assembly
11519 if ((mode_x
== SImode
|| mode_x
== DImode
)
11520 && (REG_P (y
) || SUBREG_P (y
) || y
== const0_rtx
)
11521 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
11522 || code_x
== LSHIFTRT
11523 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
11526 /* Similarly for a negated operand, but we can only do this for
11528 if ((mode_x
== SImode
|| mode_x
== DImode
)
11529 && (REG_P (y
) || SUBREG_P (y
))
11530 && (code
== EQ
|| code
== NE
)
11534 /* A test for unsigned overflow from an addition. */
11535 if ((mode_x
== DImode
|| mode_x
== TImode
)
11536 && (code
== LTU
|| code
== GEU
)
11538 && rtx_equal_p (XEXP (x
, 0), y
))
11541 /* A test for unsigned overflow from an add with carry. */
11542 if ((mode_x
== DImode
|| mode_x
== TImode
)
11543 && (code
== LTU
|| code
== GEU
)
11545 && CONST_SCALAR_INT_P (y
)
11546 && (rtx_mode_t (y
, mode_x
)
11547 == (wi::shwi (1, mode_x
)
11548 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
11551 /* A test for signed overflow. */
11552 if ((mode_x
== DImode
|| mode_x
== TImode
)
11555 && GET_CODE (y
) == SIGN_EXTEND
)
11558 /* For everything else, return CCmode. */
11563 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
11566 aarch64_get_condition_code (rtx x
)
11568 machine_mode mode
= GET_MODE (XEXP (x
, 0));
11569 enum rtx_code comp_code
= GET_CODE (x
);
11571 if (GET_MODE_CLASS (mode
) != MODE_CC
)
11572 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
11573 return aarch64_get_condition_code_1 (mode
, comp_code
);
11577 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
11585 case GE
: return AARCH64_GE
;
11586 case GT
: return AARCH64_GT
;
11587 case LE
: return AARCH64_LS
;
11588 case LT
: return AARCH64_MI
;
11589 case NE
: return AARCH64_NE
;
11590 case EQ
: return AARCH64_EQ
;
11591 case ORDERED
: return AARCH64_VC
;
11592 case UNORDERED
: return AARCH64_VS
;
11593 case UNLT
: return AARCH64_LT
;
11594 case UNLE
: return AARCH64_LE
;
11595 case UNGT
: return AARCH64_HI
;
11596 case UNGE
: return AARCH64_PL
;
11597 default: return -1;
11604 case NE
: return AARCH64_NE
;
11605 case EQ
: return AARCH64_EQ
;
11606 case GE
: return AARCH64_GE
;
11607 case GT
: return AARCH64_GT
;
11608 case LE
: return AARCH64_LE
;
11609 case LT
: return AARCH64_LT
;
11610 case GEU
: return AARCH64_CS
;
11611 case GTU
: return AARCH64_HI
;
11612 case LEU
: return AARCH64_LS
;
11613 case LTU
: return AARCH64_CC
;
11614 default: return -1;
11621 case NE
: return AARCH64_NE
;
11622 case EQ
: return AARCH64_EQ
;
11623 case GE
: return AARCH64_LE
;
11624 case GT
: return AARCH64_LT
;
11625 case LE
: return AARCH64_GE
;
11626 case LT
: return AARCH64_GT
;
11627 case GEU
: return AARCH64_LS
;
11628 case GTU
: return AARCH64_CC
;
11629 case LEU
: return AARCH64_CS
;
11630 case LTU
: return AARCH64_HI
;
11631 default: return -1;
11638 case NE
: return AARCH64_NE
; /* = any */
11639 case EQ
: return AARCH64_EQ
; /* = none */
11640 case GE
: return AARCH64_PL
; /* = nfrst */
11641 case LT
: return AARCH64_MI
; /* = first */
11642 case GEU
: return AARCH64_CS
; /* = nlast */
11643 case GTU
: return AARCH64_HI
; /* = pmore */
11644 case LEU
: return AARCH64_LS
; /* = plast */
11645 case LTU
: return AARCH64_CC
; /* = last */
11646 default: return -1;
11653 case NE
: return AARCH64_NE
;
11654 case EQ
: return AARCH64_EQ
;
11655 case GE
: return AARCH64_PL
;
11656 case LT
: return AARCH64_MI
;
11657 case GT
: return AARCH64_GT
;
11658 case LE
: return AARCH64_LE
;
11659 default: return -1;
11666 case NE
: return AARCH64_NE
;
11667 case EQ
: return AARCH64_EQ
;
11668 case GE
: return AARCH64_PL
;
11669 case LT
: return AARCH64_MI
;
11670 default: return -1;
11677 case NE
: return AARCH64_NE
;
11678 case EQ
: return AARCH64_EQ
;
11679 default: return -1;
11686 case LTU
: return AARCH64_CS
;
11687 case GEU
: return AARCH64_CC
;
11688 default: return -1;
11695 case GEU
: return AARCH64_CS
;
11696 case LTU
: return AARCH64_CC
;
11697 default: return -1;
11704 case NE
: return AARCH64_VS
;
11705 case EQ
: return AARCH64_VC
;
11706 default: return -1;
11717 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
11718 duplicate of such constants. If so, store in RET_WI the wide_int
11719 representation of the constant paired with the inner mode of the vector mode
11720 or MODE for scalar X constants. If MODE is not provided then TImode is
11724 aarch64_extract_vec_duplicate_wide_int (rtx x
, wide_int
*ret_wi
,
11725 scalar_mode mode
= TImode
)
11727 rtx elt
= unwrap_const_vec_duplicate (x
);
11728 if (!CONST_SCALAR_INT_P (elt
))
11731 = CONST_SCALAR_INT_P (x
) ? mode
: GET_MODE_INNER (GET_MODE (x
));
11732 *ret_wi
= rtx_mode_t (elt
, smode
);
11736 /* Return true if X is a scalar or a constant vector of integer
11737 immediates that represent the rounding constant used in the fixed-point
11738 arithmetic instructions.
11739 The accepted form of the constant is (1 << (C - 1)) where C is in the range
11740 [1, MODE_WIDTH/2]. */
11743 aarch64_rnd_imm_p (rtx x
)
11746 if (!aarch64_extract_vec_duplicate_wide_int (x
, &rnd_cst
))
11748 int log2
= wi::exact_log2 (rnd_cst
);
11751 return IN_RANGE (log2
, 0, rnd_cst
.get_precision () / 2 - 1);
11754 /* Return true if RND is a constant vector of integer rounding constants
11755 corresponding to a constant vector of shifts, SHIFT.
11756 The relationship should be RND == (1 << (SHIFT - 1)). */
11759 aarch64_const_vec_rnd_cst_p (rtx rnd
, rtx shift
)
11761 wide_int rnd_cst
, shft_cst
;
11762 if (!aarch64_extract_vec_duplicate_wide_int (rnd
, &rnd_cst
)
11763 || !aarch64_extract_vec_duplicate_wide_int (shift
, &shft_cst
))
11766 return rnd_cst
== (wi::shwi (1, rnd_cst
.get_precision ()) << (shft_cst
- 1));
11770 aarch64_const_vec_all_same_in_range_p (rtx x
,
11771 HOST_WIDE_INT minval
,
11772 HOST_WIDE_INT maxval
)
11775 return (const_vec_duplicate_p (x
, &elt
)
11776 && CONST_INT_P (elt
)
11777 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
11780 /* Some constants can't be made using normal mov instructions in Advanced SIMD
11781 but we can still create them in various ways. If the constant in VAL can be
11782 created using alternate methods then if possible then return true and
11783 additionally set TARGET to the rtx for the sequence if TARGET is not NULL.
11784 Otherwise return false if sequence is not possible. */
11787 aarch64_maybe_generate_simd_constant (rtx target
, rtx val
, machine_mode mode
)
11790 auto smode
= GET_MODE_INNER (mode
);
11791 if (!aarch64_extract_vec_duplicate_wide_int (val
, &wval
, smode
))
11794 /* For Advanced SIMD we can create an integer with only the top bit set
11795 using fneg (0.0f). */
11799 && wi::only_sign_bit_p (wval
))
11804 /* Use the same base type as aarch64_gen_shareable_zero. */
11805 rtx zero
= CONST0_RTX (V4SImode
);
11806 emit_move_insn (lowpart_subreg (V4SImode
, target
, mode
), zero
);
11807 rtx neg
= lowpart_subreg (V2DFmode
, target
, mode
);
11808 emit_insn (gen_negv2df2 (neg
, copy_rtx (neg
)));
11815 /* Check if the value in VAL with mode MODE can be created using special
11816 instruction sequences. */
11818 bool aarch64_simd_special_constant_p (rtx val
, machine_mode mode
)
11820 return aarch64_maybe_generate_simd_constant (NULL_RTX
, val
, mode
);
11824 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
11826 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
11829 /* Return true if VEC is a constant in which every element is in the range
11830 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11833 aarch64_const_vec_all_in_range_p (rtx vec
,
11834 HOST_WIDE_INT minval
,
11835 HOST_WIDE_INT maxval
)
11837 if (!CONST_VECTOR_P (vec
)
11838 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
11842 if (!CONST_VECTOR_STEPPED_P (vec
))
11843 nunits
= const_vector_encoded_nelts (vec
);
11844 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
11847 for (int i
= 0; i
< nunits
; i
++)
11849 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
11850 if (!CONST_INT_P (vec_elem
)
11851 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
11858 #define AARCH64_CC_V 1
11859 #define AARCH64_CC_C (1 << 1)
11860 #define AARCH64_CC_Z (1 << 2)
11861 #define AARCH64_CC_N (1 << 3)
11863 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11864 static const int aarch64_nzcv_codes
[] =
11866 0, /* EQ, Z == 1. */
11867 AARCH64_CC_Z
, /* NE, Z == 0. */
11868 0, /* CS, C == 1. */
11869 AARCH64_CC_C
, /* CC, C == 0. */
11870 0, /* MI, N == 1. */
11871 AARCH64_CC_N
, /* PL, N == 0. */
11872 0, /* VS, V == 1. */
11873 AARCH64_CC_V
, /* VC, V == 0. */
11874 0, /* HI, C ==1 && Z == 0. */
11875 AARCH64_CC_C
, /* LS, !(C == 1 && Z == 0). */
11876 AARCH64_CC_V
, /* GE, N == V. */
11877 0, /* LT, N != V. */
11878 AARCH64_CC_Z
, /* GT, Z == 0 && N == V. */
11879 0, /* LE, !(Z == 0 && N == V). */
11884 /* Print floating-point vector immediate operand X to F, negating it
11885 first if NEGATE is true. Return true on success, false if it isn't
11886 a constant we can handle. */
11889 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
11893 if (!const_vec_duplicate_p (x
, &elt
))
11896 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
11898 r
= real_value_negate (&r
);
11900 /* Handle the SVE single-bit immediates specially, since they have a
11901 fixed form in the assembly syntax. */
11902 if (real_equal (&r
, &dconst0
))
11903 asm_fprintf (f
, "0.0");
11904 else if (real_equal (&r
, &dconst2
))
11905 asm_fprintf (f
, "2.0");
11906 else if (real_equal (&r
, &dconst1
))
11907 asm_fprintf (f
, "1.0");
11908 else if (real_equal (&r
, &dconsthalf
))
11909 asm_fprintf (f
, "0.5");
11912 const int buf_size
= 20;
11913 char float_buf
[buf_size
] = {'\0'};
11914 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
11915 1, GET_MODE (elt
));
11916 asm_fprintf (f
, "%s", float_buf
);
11922 /* Return the equivalent letter for size. */
11924 sizetochar (int size
)
11928 case 64: return 'd';
11929 case 32: return 's';
11930 case 16: return 'h';
11931 case 8 : return 'b';
11932 default: gcc_unreachable ();
11936 /* Print operand X to file F in a target specific manner according to CODE.
11937 The acceptable formatting commands given by CODE are:
11938 'c': An integer or symbol address without a preceding #
11940 'C': Take the duplicated element in a vector constant
11941 and print it in hex.
11942 'D': Take the duplicated element in a vector constant
11943 and print it as an unsigned integer, in decimal.
11944 'e': Print the sign/zero-extend size as a character 8->b,
11945 16->h, 32->w. Can also be used for masks:
11946 0xff->b, 0xffff->h, 0xffffffff->w.
11947 'I': If the operand is a duplicated vector constant,
11948 replace it with the duplicated scalar. If the
11949 operand is then a floating-point constant, replace
11950 it with the integer bit representation. Print the
11951 transformed constant as a signed decimal number.
11952 'p': Prints N such that 2^N == X (X must be power of 2 and
11954 'P': Print the number of non-zero bits in X (a const_int).
11955 'H': Print the higher numbered register of a pair (TImode)
11957 'm': Print a condition (eq, ne, etc).
11958 'M': Same as 'm', but invert condition.
11959 'N': Take the duplicated element in a vector constant
11960 and print the negative of it in decimal.
11961 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11962 'Z': Same for SVE registers. ('z' was already taken.)
11963 Note that it is not necessary to use %Z for operands
11964 that have SVE modes. The convention is to use %Z
11965 only for non-SVE (or potentially non-SVE) modes.
11966 'S/T/U/V': Print a FP/SIMD register name for a register list.
11967 The register printed is the FP/SIMD register name
11968 of X + 0/1/2/3 for S/T/U/V.
11969 'R': Print a scalar Integer/FP/SIMD register name + 1.
11970 'X': Print bottom 16 bits of integer constant in hex.
11971 'w/x': Print a general register name or the zero register
11972 (32-bit or 64-bit).
11973 '0': Print a normal operand, if it's a general register,
11974 then we assume DImode.
11975 'k': Print NZCV for conditional compare instructions.
11976 'K': Print a predicate register as pn<N> rather than p<N>
11977 'A': Output address constant representing the first
11978 argument of X, specifying a relocation offset
11980 'L': Output constant address specified by X
11981 with a relocation offset if appropriate.
11982 'G': Prints address of X, specifying a PC relative
11983 relocation mode if appropriate.
11984 'y': Output address of LDP or STP - this is used for
11985 some LDP/STPs which don't use a PARALLEL in their
11986 pattern (so the mode needs to be adjusted).
11987 'z': Output address of a typical LDP or STP. */
11990 aarch64_print_operand (FILE *f
, rtx x
, int code
)
11996 if (CONST_INT_P (x
))
11997 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
12001 rtx base
= strip_offset_and_salt (x
, &offset
);
12002 if (SYMBOL_REF_P (base
))
12003 output_addr_const (f
, x
);
12005 output_operand_lossage ("unsupported operand for code '%c'", code
);
12011 x
= unwrap_const_vec_duplicate (x
);
12012 if (!CONST_INT_P (x
))
12014 output_operand_lossage ("invalid operand for '%%%c'", code
);
12018 HOST_WIDE_INT val
= INTVAL (x
);
12019 if ((val
& ~7) == 8 || val
== 0xff)
12021 else if ((val
& ~7) == 16 || val
== 0xffff)
12023 else if ((val
& ~7) == 32 || val
== 0xffffffff)
12027 output_operand_lossage ("invalid operand for '%%%c'", code
);
12037 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
12039 output_operand_lossage ("invalid operand for '%%%c'", code
);
12043 asm_fprintf (f
, "%d", n
);
12048 if (!CONST_INT_P (x
))
12050 output_operand_lossage ("invalid operand for '%%%c'", code
);
12054 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
12058 if (x
== const0_rtx
)
12060 asm_fprintf (f
, "xzr");
12064 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
12066 output_operand_lossage ("invalid operand for '%%%c'", code
);
12070 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
12075 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
12076 if (CONST_INT_P (x
))
12077 asm_fprintf (f
, "%wd", INTVAL (x
));
12080 output_operand_lossage ("invalid operand for '%%%c'", code
);
12090 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12091 if (x
== const_true_rtx
)
12098 if (!COMPARISON_P (x
))
12100 output_operand_lossage ("invalid operand for '%%%c'", code
);
12104 cond_code
= aarch64_get_condition_code (x
);
12105 gcc_assert (cond_code
>= 0);
12107 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
12108 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
12109 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
12111 fputs (aarch64_condition_codes
[cond_code
], f
);
12116 if (!const_vec_duplicate_p (x
, &elt
))
12118 output_operand_lossage ("invalid vector constant");
12122 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12123 asm_fprintf (f
, "%wd", (HOST_WIDE_INT
) -UINTVAL (elt
));
12124 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12125 && aarch64_print_vector_float_operand (f
, x
, true))
12129 output_operand_lossage ("invalid vector constant");
12140 code
= TOLOWER (code
);
12141 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
12143 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
12146 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
12153 if (!REG_P (x
) || (!FP_REGNUM_P (REGNO (x
)) && !PR_REGNUM_P (REGNO (x
))))
12155 output_operand_lossage ("incompatible operand for '%%%c'", code
);
12158 if (PR_REGNUM_P (REGNO (x
)))
12159 asm_fprintf (f
, "p%d", REGNO (x
) - P0_REGNUM
+ (code
- 'S'));
12161 asm_fprintf (f
, "%c%d",
12162 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
12163 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
12167 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
))
12168 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x
))))
12169 asm_fprintf (f
, "d%d", REGNO (x
) - V0_REGNUM
+ 1);
12170 else if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
12171 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
12172 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12173 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
12175 output_operand_lossage ("incompatible register operand for '%%%c'",
12180 if (!CONST_INT_P (x
))
12182 output_operand_lossage ("invalid operand for '%%%c'", code
);
12185 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
12190 /* Print a replicated constant in hex. */
12191 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12193 output_operand_lossage ("invalid operand for '%%%c'", code
);
12196 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12197 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12203 /* Print a replicated constant in decimal, treating it as
12205 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12207 output_operand_lossage ("invalid operand for '%%%c'", code
);
12210 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12211 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12217 if (aarch64_const_zero_rtx_p (x
))
12219 asm_fprintf (f
, "%czr", code
);
12223 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12225 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
12229 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
12231 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
12240 output_operand_lossage ("missing operand");
12244 switch (GET_CODE (x
))
12248 asm_fprintf (f
, "%s", XSTR (x
, 0));
12252 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
12254 if (REG_NREGS (x
) == 1)
12255 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
12259 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
12260 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
12261 REGNO (x
) - V0_REGNUM
, suffix
,
12262 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
12266 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
12270 output_address (GET_MODE (x
), XEXP (x
, 0));
12275 output_addr_const (asm_out_file
, x
);
12279 asm_fprintf (f
, "%wd", INTVAL (x
));
12283 if (!VECTOR_MODE_P (GET_MODE (x
)))
12285 output_addr_const (asm_out_file
, x
);
12291 if (!const_vec_duplicate_p (x
, &elt
))
12293 output_operand_lossage ("invalid vector constant");
12297 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12298 asm_fprintf (f
, "%wd", INTVAL (elt
));
12299 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12300 && aarch64_print_vector_float_operand (f
, x
, false))
12304 output_operand_lossage ("invalid vector constant");
12310 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12311 be getting CONST_DOUBLEs holding integers. */
12312 gcc_assert (GET_MODE (x
) != VOIDmode
);
12313 if (aarch64_float_const_zero_rtx_p (x
))
12318 else if (aarch64_float_const_representable_p (x
))
12320 #define buf_size 20
12321 char float_buf
[buf_size
] = {'\0'};
12322 real_to_decimal_for_mode (float_buf
,
12323 CONST_DOUBLE_REAL_VALUE (x
),
12324 buf_size
, buf_size
,
12326 asm_fprintf (asm_out_file
, "%s", float_buf
);
12330 output_operand_lossage ("invalid constant");
12333 output_operand_lossage ("invalid operand");
12339 if (GET_CODE (x
) == HIGH
)
12342 switch (aarch64_classify_symbolic_expression (x
))
12344 case SYMBOL_SMALL_GOT_4G
:
12345 asm_fprintf (asm_out_file
, ":got:");
12348 case SYMBOL_SMALL_TLSGD
:
12349 asm_fprintf (asm_out_file
, ":tlsgd:");
12352 case SYMBOL_SMALL_TLSDESC
:
12353 asm_fprintf (asm_out_file
, ":tlsdesc:");
12356 case SYMBOL_SMALL_TLSIE
:
12357 asm_fprintf (asm_out_file
, ":gottprel:");
12360 case SYMBOL_TLSLE24
:
12361 asm_fprintf (asm_out_file
, ":tprel:");
12364 case SYMBOL_TINY_GOT
:
12365 gcc_unreachable ();
12371 output_addr_const (asm_out_file
, x
);
12375 switch (aarch64_classify_symbolic_expression (x
))
12377 case SYMBOL_SMALL_GOT_4G
:
12378 asm_fprintf (asm_out_file
, ":got_lo12:");
12381 case SYMBOL_SMALL_TLSGD
:
12382 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
12385 case SYMBOL_SMALL_TLSDESC
:
12386 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
12389 case SYMBOL_SMALL_TLSIE
:
12390 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
12393 case SYMBOL_TLSLE12
:
12394 asm_fprintf (asm_out_file
, ":tprel_lo12:");
12397 case SYMBOL_TLSLE24
:
12398 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
12401 case SYMBOL_TINY_GOT
:
12402 asm_fprintf (asm_out_file
, ":got:");
12405 case SYMBOL_TINY_TLSIE
:
12406 asm_fprintf (asm_out_file
, ":gottprel:");
12412 output_addr_const (asm_out_file
, x
);
12416 switch (aarch64_classify_symbolic_expression (x
))
12418 case SYMBOL_TLSLE24
:
12419 asm_fprintf (asm_out_file
, ":tprel_hi12:");
12424 output_addr_const (asm_out_file
, x
);
12429 HOST_WIDE_INT cond_code
;
12431 if (!CONST_INT_P (x
))
12433 output_operand_lossage ("invalid operand for '%%%c'", code
);
12437 cond_code
= INTVAL (x
);
12438 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
12439 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
12444 if (!REG_P (x
) || !PR_REGNUM_P (REGNO (x
)))
12446 output_operand_lossage ("invalid operand for '%%%c'", code
);
12449 asm_fprintf (f
, "pn%d", REGNO (x
) - P0_REGNUM
);
12455 machine_mode mode
= GET_MODE (x
);
12459 && maybe_ne (GET_MODE_SIZE (mode
), 8)
12460 && maybe_ne (GET_MODE_SIZE (mode
), 16)
12461 && maybe_ne (GET_MODE_SIZE (mode
), 32)))
12463 output_operand_lossage ("invalid operand for '%%%c'", code
);
12467 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
12469 ? ADDR_QUERY_LDP_STP_N
12470 : ADDR_QUERY_LDP_STP
))
12471 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12476 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12481 /* Print address 'x' of a memory access with mode 'mode'.
12482 'op' is the context required by aarch64_classify_address. It can either be
12483 MEM for a normal memory access or PARALLEL for LDP/STP. */
12485 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
12486 aarch64_addr_query_type type
)
12488 struct aarch64_address_info addr
;
12489 unsigned int size
, vec_flags
;
12491 /* Check all addresses are Pmode - including ILP32. */
12492 if (GET_MODE (x
) != Pmode
12493 && (!CONST_INT_P (x
)
12494 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
12496 output_operand_lossage ("invalid address mode");
12500 const bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
12501 || type
== ADDR_QUERY_LDP_STP_N
);
12503 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
12506 case ADDRESS_REG_IMM
:
12507 if (known_eq (addr
.const_offset
, 0))
12509 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
12513 vec_flags
= aarch64_classify_vector_mode (mode
);
12514 if ((vec_flags
& VEC_ANY_SVE
) && !load_store_pair_p
)
12517 = exact_div (addr
.const_offset
,
12518 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
12519 asm_fprintf (f
, "[%s, #%wd, mul vl]",
12520 reg_names
[REGNO (addr
.base
)], vnum
);
12524 if (!CONST_INT_P (addr
.offset
))
12527 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
12528 INTVAL (addr
.offset
));
12531 case ADDRESS_REG_REG
:
12532 if (addr
.shift
== 0)
12533 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
12534 reg_names
[REGNO (addr
.offset
)]);
12536 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
12537 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
12540 case ADDRESS_REG_UXTW
:
12541 if (addr
.shift
== 0)
12542 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
12543 REGNO (addr
.offset
) - R0_REGNUM
);
12545 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
12546 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12549 case ADDRESS_REG_SXTW
:
12550 if (addr
.shift
== 0)
12551 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
12552 REGNO (addr
.offset
) - R0_REGNUM
);
12554 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
12555 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12558 case ADDRESS_REG_WB
:
12559 /* Writeback is only supported for fixed-width modes. */
12560 size
= GET_MODE_SIZE (mode
).to_constant ();
12561 switch (GET_CODE (x
))
12564 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
12567 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
12570 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
12573 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
12576 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
12577 INTVAL (addr
.offset
));
12580 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
12581 INTVAL (addr
.offset
));
12588 case ADDRESS_LO_SUM
:
12589 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
12590 output_addr_const (f
, addr
.offset
);
12591 asm_fprintf (f
, "]");
12594 case ADDRESS_SYMBOLIC
:
12595 output_addr_const (f
, x
);
12602 /* Print address 'x' of a memory access with mode 'mode'. */
12604 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
12606 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
12607 output_addr_const (f
, x
);
12610 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12613 aarch64_output_addr_const_extra (FILE *file
, rtx x
)
12615 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SALT_ADDR
)
12617 output_addr_const (file
, XVECEXP (x
, 0, 0));
12624 aarch64_label_mentioned_p (rtx x
)
12629 if (LABEL_REF_P (x
))
12632 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12633 referencing instruction, but they are constant offsets, not
12635 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
12638 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
12639 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
12645 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
12646 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
12649 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
12656 /* Implement REGNO_REG_CLASS. */
12659 aarch64_regno_regclass (unsigned regno
)
12661 if (W8_W11_REGNUM_P (regno
))
12662 return W8_W11_REGS
;
12664 if (W12_W15_REGNUM_P (regno
))
12665 return W12_W15_REGS
;
12667 if (STUB_REGNUM_P (regno
))
12670 if (GP_REGNUM_P (regno
))
12671 return GENERAL_REGS
;
12673 if (regno
== SP_REGNUM
)
12676 if (regno
== FRAME_POINTER_REGNUM
12677 || regno
== ARG_POINTER_REGNUM
)
12678 return POINTER_REGS
;
12680 if (FP_REGNUM_P (regno
))
12681 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
12682 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
12684 if (PR_REGNUM_P (regno
))
12685 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
12687 if (regno
== FFR_REGNUM
|| regno
== FFRT_REGNUM
)
12690 if (FAKE_REGNUM_P (regno
))
12696 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12697 If OFFSET is out of range, return an offset of an anchor point
12698 that is in range. Return 0 otherwise. */
12700 static HOST_WIDE_INT
12701 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
12704 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12706 return (offset
+ 0x400) & ~0x7f0;
12708 /* For offsets that aren't a multiple of the access size, the limit is
12710 if (offset
& (size
- 1))
12712 /* BLKmode typically uses LDP of X-registers. */
12713 if (mode
== BLKmode
)
12714 return (offset
+ 512) & ~0x3ff;
12715 return (offset
+ 0x100) & ~0x1ff;
12718 /* Small negative offsets are supported. */
12719 if (IN_RANGE (offset
, -256, 0))
12722 if (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
12723 return (offset
+ 0x100) & ~0x1ff;
12725 /* Use 12-bit offset by access size. */
12726 return offset
& (~0xfff * size
);
12730 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
12733 rtx tmp
= legitimize_pe_coff_symbol (x
, true);
12738 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12739 where mask is selected by alignment and size of the offset.
12740 We try to pick as large a range for the offset as possible to
12741 maximize the chance of a CSE. However, for aligned addresses
12742 we limit the range to 4k so that structures with different sized
12743 elements are likely to use the same base. We need to be careful
12744 not to split a CONST for some forms of address expression, otherwise
12745 it will generate sub-optimal code. */
12747 /* First split X + CONST (base, offset) into (base + X) + offset. */
12748 if (GET_CODE (x
) == PLUS
&& GET_CODE (XEXP (x
, 1)) == CONST
)
12751 rtx base
= strip_offset (XEXP (x
, 1), &offset
);
12753 base
= expand_binop (Pmode
, add_optab
, base
, XEXP (x
, 0),
12754 NULL_RTX
, true, OPTAB_DIRECT
);
12755 x
= plus_constant (Pmode
, base
, offset
);
12758 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
12760 rtx base
= XEXP (x
, 0);
12761 rtx offset_rtx
= XEXP (x
, 1);
12762 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
12764 if (GET_CODE (base
) == PLUS
)
12766 rtx op0
= XEXP (base
, 0);
12767 rtx op1
= XEXP (base
, 1);
12769 /* Force any scaling into a temp for CSE. */
12770 op0
= force_reg (Pmode
, op0
);
12771 op1
= force_reg (Pmode
, op1
);
12773 /* Let the pointer register be in op0. */
12774 if (REG_POINTER (op1
))
12775 std::swap (op0
, op1
);
12777 /* If the pointer is virtual or frame related, then we know that
12778 virtual register instantiation or register elimination is going
12779 to apply a second constant. We want the two constants folded
12780 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12781 if (virt_or_elim_regno_p (REGNO (op0
)))
12783 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
12784 NULL_RTX
, true, OPTAB_DIRECT
);
12785 return gen_rtx_PLUS (Pmode
, base
, op1
);
12788 /* Otherwise, in order to encourage CSE (and thence loop strength
12789 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12790 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
12791 NULL_RTX
, true, OPTAB_DIRECT
);
12792 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
12795 HOST_WIDE_INT size
;
12796 if (GET_MODE_SIZE (mode
).is_constant (&size
))
12798 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
12800 if (base_offset
!= 0)
12802 base
= plus_constant (Pmode
, base
, base_offset
);
12803 base
= force_operand (base
, NULL_RTX
);
12804 return plus_constant (Pmode
, base
, offset
- base_offset
);
12813 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
12814 reg_class_t rclass
,
12816 secondary_reload_info
*sri
)
12818 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12819 LDR and STR. See the comment at the head of aarch64-sve.md for
12820 more details about the big-endian handling. */
12821 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12822 if (reg_class_subset_p (rclass
, FP_REGS
)
12823 && !((REG_P (x
) && HARD_REGISTER_P (x
))
12824 || aarch64_simd_valid_immediate (x
, NULL
))
12825 && mode
!= VNx16QImode
12826 && (vec_flags
& VEC_SVE_DATA
)
12827 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
12829 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
12833 /* If we have to disable direct literal pool loads and stores because the
12834 function is too big, then we need a scratch register. */
12835 if (MEM_P (x
) && SYMBOL_REF_P (x
) && CONSTANT_POOL_ADDRESS_P (x
)
12836 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
12837 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
12838 && !aarch64_pcrelative_literal_loads
)
12840 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
12844 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12845 Q register to a Q register directly. We need a scratch. */
12850 || (vec_flags
== VEC_ADVSIMD
&& known_eq (GET_MODE_SIZE (mode
), 16)))
12851 && mode
== GET_MODE (x
)
12853 && FP_REGNUM_P (REGNO (x
))
12854 && reg_class_subset_p (rclass
, FP_REGS
))
12856 sri
->icode
= code_for_aarch64_reload_mov (mode
);
12860 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12861 because AArch64 has richer addressing modes for LDR/STR instructions
12862 than LDP/STP instructions. */
12863 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
12864 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
12867 if (rclass
== FP_REGS
12868 && (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
12870 return GENERAL_REGS
;
12875 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12878 aarch64_secondary_memory_needed (machine_mode mode
, reg_class_t class1
,
12879 reg_class_t class2
)
12882 && reg_classes_intersect_p (class1
, FP_REGS
)
12883 && reg_classes_intersect_p (class2
, FP_REGS
))
12885 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12886 so we can't easily split a move involving tuples of 128-bit
12887 vectors. Force the copy through memory instead.
12889 (Tuples of 64-bit vectors are fine.) */
12890 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12891 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
12897 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
12900 aarch64_frame_pointer_required ()
12902 /* If the function needs to record the incoming value of PSTATE.SM,
12903 make sure that the slot is accessible from the frame pointer. */
12904 return aarch64_need_old_pstate_sm ();
12908 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
12910 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
12912 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12913 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12914 if (frame_pointer_needed
)
12915 return to
== HARD_FRAME_POINTER_REGNUM
;
12920 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
12922 aarch64_frame
&frame
= cfun
->machine
->frame
;
12924 if (to
== HARD_FRAME_POINTER_REGNUM
)
12926 if (from
== ARG_POINTER_REGNUM
)
12927 return frame
.bytes_above_hard_fp
;
12929 if (from
== FRAME_POINTER_REGNUM
)
12930 return frame
.bytes_above_hard_fp
- frame
.bytes_above_locals
;
12933 if (to
== STACK_POINTER_REGNUM
)
12935 if (from
== FRAME_POINTER_REGNUM
)
12936 return frame
.frame_size
- frame
.bytes_above_locals
;
12939 return frame
.frame_size
;
12943 /* Get return address without mangling. */
12946 aarch64_return_addr_rtx (void)
12948 rtx val
= get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
12949 /* Note: aarch64_return_address_signing_enabled only
12950 works after cfun->machine->frame.laid_out is set,
12951 so here we don't know if the return address will
12952 be signed or not. */
12953 rtx lr
= gen_rtx_REG (Pmode
, LR_REGNUM
);
12954 emit_move_insn (lr
, val
);
12955 emit_insn (GEN_FCN (CODE_FOR_xpaclri
) ());
12960 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12964 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
12968 return aarch64_return_addr_rtx ();
12972 aarch64_asm_trampoline_template (FILE *f
)
12974 /* Even if the current function doesn't have branch protection, some
12975 later function might, so since this template is only generated once
12976 we have to add a BTI just in case. */
12977 asm_fprintf (f
, "\thint\t34 // bti c\n");
12981 asm_fprintf (f
, "\tldr\tw%d, .+20\n", IP1_REGNUM
- R0_REGNUM
);
12982 asm_fprintf (f
, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
12986 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[IP1_REGNUM
]);
12987 asm_fprintf (f
, "\tldr\t%s, .+24\n", reg_names
[STATIC_CHAIN_REGNUM
]);
12989 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
12991 /* We always emit a speculation barrier.
12992 This is because the same trampoline template is used for every nested
12993 function. Since nested functions are not particularly common or
12994 performant we don't worry too much about the extra instructions to copy
12996 This is not yet a problem, since we have not yet implemented function
12997 specific attributes to choose between hardening against straight line
12998 speculation or not, but such function specific attributes are likely to
12999 happen in the future. */
13000 asm_fprintf (f
, "\tdsb\tsy\n\tisb\n");
13002 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
13003 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
13007 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
13009 rtx fnaddr
, mem
, a_tramp
;
13010 const int tramp_code_sz
= 24;
13012 /* Don't need to copy the trailing D-words, we fill those in below. */
13013 /* We create our own memory address in Pmode so that `emit_block_move` can
13014 use parts of the backend which expect Pmode addresses. */
13015 rtx temp
= convert_memory_address (Pmode
, XEXP (m_tramp
, 0));
13016 emit_block_move (gen_rtx_MEM (BLKmode
, temp
),
13017 assemble_trampoline_template (),
13018 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
13019 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
13020 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
13021 if (GET_MODE (fnaddr
) != ptr_mode
)
13022 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
13023 emit_move_insn (mem
, fnaddr
);
13025 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
13026 emit_move_insn (mem
, chain_value
);
13028 /* XXX We should really define a "clear_cache" pattern and use
13029 gen_clear_cache(). */
13030 a_tramp
= XEXP (m_tramp
, 0);
13031 maybe_emit_call_builtin___clear_cache (a_tramp
,
13032 plus_constant (ptr_mode
,
13037 static unsigned char
13038 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
13040 /* ??? Logically we should only need to provide a value when
13041 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13042 can hold MODE, but at the moment we need to handle all modes.
13043 Just ignore any runtime parts for registers that can't store them. */
13044 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
13045 unsigned int nregs
, vec_flags
;
13051 case TAILCALL_ADDR_REGS
:
13055 case POINTER_AND_FP_REGS
:
13059 vec_flags
= aarch64_classify_vector_mode (mode
);
13060 if ((vec_flags
& VEC_SVE_DATA
)
13061 && constant_multiple_p (GET_MODE_SIZE (mode
),
13062 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
13064 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
))
13065 return GET_MODE_SIZE (mode
).to_constant () / 8;
13066 return (vec_flags
& VEC_ADVSIMD
13067 ? CEIL (lowest_size
, UNITS_PER_VREG
)
13068 : CEIL (lowest_size
, UNITS_PER_WORD
));
13073 return mode
== VNx32BImode
? 2 : 1;
13077 case PR_AND_FFR_REGS
:
13087 gcc_unreachable ();
13091 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
13093 if (regclass
== POINTER_REGS
)
13094 return GENERAL_REGS
;
13096 if (regclass
== STACK_REG
)
13099 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
13105 /* Register eliminiation can result in a request for
13106 SP+constant->FP_REGS. We cannot support such operations which
13107 use SP as source and an FP_REG as destination, so reject out
13109 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
13111 rtx lhs
= XEXP (x
, 0);
13113 /* Look through a possible SUBREG introduced by ILP32. */
13114 if (SUBREG_P (lhs
))
13115 lhs
= SUBREG_REG (lhs
);
13117 gcc_assert (REG_P (lhs
));
13118 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
13127 aarch64_asm_output_labelref (FILE* f
, const char *name
)
13129 asm_fprintf (f
, "%U%s", name
);
13133 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
13135 if (priority
== DEFAULT_INIT_PRIORITY
)
13136 default_ctor_section_asm_out_constructor (symbol
, priority
);
13140 /* While priority is known to be in range [0, 65535], so 18 bytes
13141 would be enough, the compiler might not know that. To avoid
13142 -Wformat-truncation false positive, use a larger size. */
13144 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
13145 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
13146 switch_to_section (s
);
13147 assemble_align (POINTER_SIZE
);
13148 assemble_aligned_integer (POINTER_BYTES
, symbol
);
13153 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
13155 if (priority
== DEFAULT_INIT_PRIORITY
)
13156 default_dtor_section_asm_out_destructor (symbol
, priority
);
13160 /* While priority is known to be in range [0, 65535], so 18 bytes
13161 would be enough, the compiler might not know that. To avoid
13162 -Wformat-truncation false positive, use a larger size. */
13164 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
13165 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
13166 switch_to_section (s
);
13167 assemble_align (POINTER_SIZE
);
13168 assemble_aligned_integer (POINTER_BYTES
, symbol
);
13173 aarch64_output_casesi (rtx
*operands
)
13177 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
13179 static const char *const patterns
[4][2] =
13182 "ldrb\t%w3, [%0,%w1,uxtw]",
13183 "add\t%3, %4, %w3, sxtb #2"
13186 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13187 "add\t%3, %4, %w3, sxth #2"
13190 "ldr\t%w3, [%0,%w1,uxtw #2]",
13191 "add\t%3, %4, %w3, sxtw #2"
13193 /* We assume that DImode is only generated when not optimizing and
13194 that we don't really need 64-bit address offsets. That would
13195 imply an object file with 8GB of code in a single function! */
13197 "ldr\t%w3, [%0,%w1,uxtw #2]",
13198 "add\t%3, %4, %w3, sxtw #2"
13202 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
13204 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
13205 index
= exact_log2 (GET_MODE_SIZE (mode
));
13207 gcc_assert (index
>= 0 && index
<= 3);
13209 /* Need to implement table size reduction, by chaning the code below. */
13210 output_asm_insn (patterns
[index
][0], operands
);
13211 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
13212 snprintf (buf
, sizeof (buf
),
13213 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
13214 output_asm_insn (buf
, operands
);
13215 output_asm_insn (patterns
[index
][1], operands
);
13216 output_asm_insn ("br\t%3", operands
);
13217 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13219 assemble_label (asm_out_file
, label
);
13223 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13224 operand is MASK. */
13226 aarch64_output_sme_zero_za (rtx mask
)
13228 auto mask_val
= UINTVAL (mask
);
13232 if (mask_val
== 0xff)
13233 return "zero\t{ za }";
13235 static constexpr struct { unsigned char mask
; char letter
; } tiles
[] = {
13241 /* The last entry in the list has the form "za7.d }", but that's the
13242 same length as "za7.d, ". */
13243 static char buffer
[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13244 for (auto &tile
: tiles
)
13246 unsigned int tile_mask
= tile
.mask
;
13247 unsigned int tile_index
= 0;
13248 unsigned int i
= snprintf (buffer
, sizeof (buffer
), "zero\t");
13249 const char *prefix
= "{ ";
13250 auto remaining_mask
= mask_val
;
13251 while (tile_mask
< 0x100)
13253 if ((remaining_mask
& tile_mask
) == tile_mask
)
13255 i
+= snprintf (buffer
+ i
, sizeof (buffer
) - i
, "%sza%d.%c",
13256 prefix
, tile_index
, tile
.letter
);
13258 remaining_mask
&= ~tile_mask
;
13263 if (remaining_mask
== 0)
13265 gcc_assert (i
+ 3 <= sizeof (buffer
));
13266 snprintf (buffer
+ i
, sizeof (buffer
) - i
, " }");
13270 gcc_unreachable ();
13273 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13274 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13278 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
13280 if (shift
>= 0 && shift
<= 4)
13283 for (size
= 8; size
<= 32; size
*= 2)
13285 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
13286 if (mask
== bits
<< shift
)
13293 /* Constant pools are per function only when PC relative
13294 literal loads are true or we are in the large memory
13298 aarch64_can_use_per_function_literal_pools_p (void)
13300 return (aarch64_pcrelative_literal_loads
13301 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
13305 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
13307 /* We can't use blocks for constants when we're using a per-function
13309 return !aarch64_can_use_per_function_literal_pools_p ();
13312 /* Select appropriate section for constants depending
13313 on where we place literal pools. */
13316 aarch64_select_rtx_section (machine_mode mode
,
13318 unsigned HOST_WIDE_INT align
)
13320 if (aarch64_can_use_per_function_literal_pools_p ())
13321 return function_section (current_function_decl
);
13323 return default_elf_select_rtx_section (mode
, x
, align
);
13326 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13328 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
13329 HOST_WIDE_INT offset
)
13331 /* When using per-function literal pools, we must ensure that any code
13332 section is aligned to the minimal instruction length, lest we get
13333 errors from the assembler re "unaligned instructions". */
13334 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
13335 ASM_OUTPUT_ALIGN (f
, 2);
13340 /* Helper function for rtx cost calculation. Strip a shift expression
13341 from X. Returns the inner operand if successful, or the original
13342 expression on failure. */
13344 aarch64_strip_shift (rtx x
)
13348 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13349 we can convert both to ROR during final output. */
13350 if ((GET_CODE (op
) == ASHIFT
13351 || GET_CODE (op
) == ASHIFTRT
13352 || GET_CODE (op
) == LSHIFTRT
13353 || GET_CODE (op
) == ROTATERT
13354 || GET_CODE (op
) == ROTATE
)
13355 && CONST_INT_P (XEXP (op
, 1)))
13356 return XEXP (op
, 0);
13358 if (GET_CODE (op
) == MULT
13359 && CONST_INT_P (XEXP (op
, 1))
13360 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
13361 return XEXP (op
, 0);
13366 /* Helper function for rtx cost calculation. Strip an extend
13367 expression from X. Returns the inner operand if successful, or the
13368 original expression on failure. We deal with a number of possible
13369 canonicalization variations here. If STRIP_SHIFT is true, then
13370 we can strip off a shift also. */
13372 aarch64_strip_extend (rtx x
, bool strip_shift
)
13374 scalar_int_mode mode
;
13377 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
13380 if (GET_CODE (op
) == AND
13381 && GET_CODE (XEXP (op
, 0)) == MULT
13382 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
13383 && CONST_INT_P (XEXP (op
, 1))
13384 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
13385 INTVAL (XEXP (op
, 1))) != 0)
13386 return XEXP (XEXP (op
, 0), 0);
13388 /* Now handle extended register, as this may also have an optional
13389 left shift by 1..4. */
13391 && GET_CODE (op
) == ASHIFT
13392 && CONST_INT_P (XEXP (op
, 1))
13393 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
13396 if (GET_CODE (op
) == ZERO_EXTEND
13397 || GET_CODE (op
) == SIGN_EXTEND
)
13406 /* Helper function for rtx cost calculation. Strip extension as well as any
13407 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13408 successful, or the original expression on failure. */
13410 aarch64_strip_extend_vec_half (rtx x
)
13412 if (GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
)
13415 if (GET_CODE (x
) == VEC_SELECT
13416 && vec_series_highpart_p (GET_MODE (x
), GET_MODE (XEXP (x
, 0)),
13423 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13424 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13425 operand if successful, or the original expression on failure. */
13427 aarch64_strip_duplicate_vec_elt (rtx x
)
13429 if (GET_CODE (x
) == VEC_DUPLICATE
13430 && is_a
<scalar_mode
> (GET_MODE (XEXP (x
, 0))))
13433 if (GET_CODE (x
) == VEC_SELECT
)
13435 else if ((GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
)
13436 && GET_CODE (XEXP (x
, 0)) == VEC_SELECT
)
13437 x
= XEXP (XEXP (x
, 0), 0);
13442 /* Return true iff CODE is a shift supported in combination
13443 with arithmetic instructions. */
13446 aarch64_shift_p (enum rtx_code code
)
13448 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
13452 /* Return true iff X is a cheap shift without a sign extend. */
13455 aarch64_cheap_mult_shift_p (rtx x
)
13462 if (!(aarch64_tune_params
.extra_tuning_flags
13463 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
13466 if (GET_CODE (op0
) == SIGN_EXTEND
)
13469 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
13470 && UINTVAL (op1
) <= 4)
13473 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
13476 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
13478 if (l2
> 0 && l2
<= 4)
13484 /* Helper function for rtx cost calculation. Calculate the cost of
13485 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13486 Return the calculated cost of the expression, recursing manually in to
13487 operands where needed. */
13490 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
13493 const struct cpu_cost_table
*extra_cost
13494 = aarch64_tune_params
.insn_extra_cost
;
13496 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
13497 machine_mode mode
= GET_MODE (x
);
13499 gcc_checking_assert (code
== MULT
);
13504 if (VECTOR_MODE_P (mode
))
13506 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13507 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
13509 /* The select-operand-high-half versions of the instruction have the
13510 same cost as the three vector version - don't add the costs of the
13511 extension or selection into the costs of the multiply. */
13512 op0
= aarch64_strip_extend_vec_half (op0
);
13513 op1
= aarch64_strip_extend_vec_half (op1
);
13514 /* The by-element versions of the instruction have the same costs as
13515 the normal 3-vector version. We make an assumption that the input
13516 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13517 costing of a MUL by element pre RA is a bit optimistic. */
13518 op0
= aarch64_strip_duplicate_vec_elt (op0
);
13519 op1
= aarch64_strip_duplicate_vec_elt (op1
);
13521 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13522 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13525 if (GET_CODE (x
) == MULT
)
13526 cost
+= extra_cost
->vect
.mult
;
13527 /* This is to catch the SSRA costing currently flowing here. */
13529 cost
+= extra_cost
->vect
.alu
;
13534 /* Integer multiply/fma. */
13535 if (GET_MODE_CLASS (mode
) == MODE_INT
)
13537 /* The multiply will be canonicalized as a shift, cost it as such. */
13538 if (aarch64_shift_p (GET_CODE (x
))
13539 || (CONST_INT_P (op1
)
13540 && exact_log2 (INTVAL (op1
)) > 0))
13542 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
13543 || GET_CODE (op0
) == SIGN_EXTEND
;
13548 /* If the shift is considered cheap,
13549 then don't add any cost. */
13550 if (aarch64_cheap_mult_shift_p (x
))
13552 else if (REG_P (op1
))
13553 /* ARITH + shift-by-register. */
13554 cost
+= extra_cost
->alu
.arith_shift_reg
;
13555 else if (is_extend
)
13556 /* ARITH + extended register. We don't have a cost field
13557 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13558 cost
+= extra_cost
->alu
.extend_arith
;
13560 /* ARITH + shift-by-immediate. */
13561 cost
+= extra_cost
->alu
.arith_shift
;
13564 /* LSL (immediate). */
13565 cost
+= extra_cost
->alu
.shift
;
13568 /* Strip extends as we will have costed them in the case above. */
13570 op0
= aarch64_strip_extend (op0
, true);
13572 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
13577 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13578 compound and let the below cases handle it. After all, MNEG is a
13579 special-case alias of MSUB. */
13580 if (GET_CODE (op0
) == NEG
)
13582 op0
= XEXP (op0
, 0);
13586 /* Integer multiplies or FMAs have zero/sign extending variants. */
13587 if ((GET_CODE (op0
) == ZERO_EXTEND
13588 && GET_CODE (op1
) == ZERO_EXTEND
)
13589 || (GET_CODE (op0
) == SIGN_EXTEND
13590 && GET_CODE (op1
) == SIGN_EXTEND
))
13592 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
13593 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
13598 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13599 cost
+= extra_cost
->mult
[0].extend_add
;
13601 /* MUL/SMULL/UMULL. */
13602 cost
+= extra_cost
->mult
[0].extend
;
13608 /* This is either an integer multiply or a MADD. In both cases
13609 we want to recurse and cost the operands. */
13610 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13611 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13617 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
13620 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
13629 /* Floating-point FMA/FMUL can also support negations of the
13630 operands, unless the rounding mode is upward or downward in
13631 which case FNMUL is different than FMUL with operand negation. */
13632 bool neg0
= GET_CODE (op0
) == NEG
;
13633 bool neg1
= GET_CODE (op1
) == NEG
;
13634 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
13637 op0
= XEXP (op0
, 0);
13639 op1
= XEXP (op1
, 0);
13643 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13644 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
13647 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
13650 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13651 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13657 aarch64_address_cost (rtx x
,
13659 addr_space_t as ATTRIBUTE_UNUSED
,
13662 enum rtx_code c
= GET_CODE (x
);
13663 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
13664 struct aarch64_address_info info
;
13668 if (!aarch64_classify_address (&info
, x
, mode
, false))
13670 if (GET_CODE (x
) == CONST
|| SYMBOL_REF_P (x
))
13672 /* This is a CONST or SYMBOL ref which will be split
13673 in a different way depending on the code model in use.
13674 Cost it through the generic infrastructure. */
13675 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
13676 /* Divide through by the cost of one instruction to
13677 bring it to the same units as the address costs. */
13678 cost_symbol_ref
/= COSTS_N_INSNS (1);
13679 /* The cost is then the cost of preparing the address,
13680 followed by an immediate (possibly 0) offset. */
13681 return cost_symbol_ref
+ addr_cost
->imm_offset
;
13685 /* This is most likely a jump table from a case
13687 return addr_cost
->register_offset
;
13693 case ADDRESS_LO_SUM
:
13694 case ADDRESS_SYMBOLIC
:
13695 case ADDRESS_REG_IMM
:
13696 cost
+= addr_cost
->imm_offset
;
13699 case ADDRESS_REG_WB
:
13700 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
13701 cost
+= addr_cost
->pre_modify
;
13702 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
13704 unsigned int nvectors
= aarch64_ldn_stn_vectors (mode
);
13706 cost
+= addr_cost
->post_modify_ld3_st3
;
13707 else if (nvectors
== 4)
13708 cost
+= addr_cost
->post_modify_ld4_st4
;
13710 cost
+= addr_cost
->post_modify
;
13713 gcc_unreachable ();
13717 case ADDRESS_REG_REG
:
13718 cost
+= addr_cost
->register_offset
;
13721 case ADDRESS_REG_SXTW
:
13722 cost
+= addr_cost
->register_sextend
;
13725 case ADDRESS_REG_UXTW
:
13726 cost
+= addr_cost
->register_zextend
;
13730 gcc_unreachable ();
13734 if (info
.shift
> 0)
13736 /* For the sake of calculating the cost of the shifted register
13737 component, we can treat same sized modes in the same way. */
13738 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
13739 cost
+= addr_cost
->addr_scale_costs
.hi
;
13740 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
13741 cost
+= addr_cost
->addr_scale_costs
.si
;
13742 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
13743 cost
+= addr_cost
->addr_scale_costs
.di
;
13745 /* We can't tell, or this is a 128-bit vector. */
13746 cost
+= addr_cost
->addr_scale_costs
.ti
;
13752 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13753 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13757 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
13759 /* When optimizing for speed, use the cost of unpredictable branches. */
13760 const struct cpu_branch_cost
*branch_costs
=
13761 aarch64_tune_params
.branch_costs
;
13763 if (!speed_p
|| predictable_p
)
13764 return branch_costs
->predictable
;
13766 return branch_costs
->unpredictable
;
13769 /* Return true if X is a zero or sign extract
13770 usable in an ADD or SUB (extended register) instruction. */
13772 aarch64_rtx_arith_op_extract_p (rtx x
)
13774 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13776 if (GET_CODE (x
) == SIGN_EXTEND
13777 || GET_CODE (x
) == ZERO_EXTEND
)
13778 return REG_P (XEXP (x
, 0));
13784 aarch64_frint_unspec_p (unsigned int u
)
13788 case UNSPEC_FRINTZ
:
13789 case UNSPEC_FRINTP
:
13790 case UNSPEC_FRINTM
:
13791 case UNSPEC_FRINTA
:
13792 case UNSPEC_FRINTN
:
13793 case UNSPEC_FRINTX
:
13794 case UNSPEC_FRINTI
:
13802 /* Return true iff X is an rtx that will match an extr instruction
13803 i.e. as described in the *extr<mode>5_insn family of patterns.
13804 OP0 and OP1 will be set to the operands of the shifts involved
13805 on success and will be NULL_RTX otherwise. */
13808 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
13811 scalar_int_mode mode
;
13812 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
13815 *res_op0
= NULL_RTX
;
13816 *res_op1
= NULL_RTX
;
13818 if (GET_CODE (x
) != IOR
)
13824 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
13825 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
13827 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13828 if (GET_CODE (op1
) == ASHIFT
)
13829 std::swap (op0
, op1
);
13831 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
13834 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
13835 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
13837 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
13838 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
13840 *res_op0
= XEXP (op0
, 0);
13841 *res_op1
= XEXP (op1
, 0);
13849 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13850 storing it in *COST. Result is true if the total cost of the operation
13851 has now been calculated. */
13853 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
13857 enum rtx_code cmpcode
;
13858 const struct cpu_cost_table
*extra_cost
13859 = aarch64_tune_params
.insn_extra_cost
;
13861 if (COMPARISON_P (op0
))
13863 inner
= XEXP (op0
, 0);
13864 comparator
= XEXP (op0
, 1);
13865 cmpcode
= GET_CODE (op0
);
13870 comparator
= const0_rtx
;
13874 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
13876 /* Conditional branch. */
13877 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
13881 if (cmpcode
== NE
|| cmpcode
== EQ
)
13883 if (comparator
== const0_rtx
)
13885 /* TBZ/TBNZ/CBZ/CBNZ. */
13886 if (GET_CODE (inner
) == ZERO_EXTRACT
)
13888 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
13889 ZERO_EXTRACT
, 0, speed
);
13892 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
13896 if (register_operand (inner
, VOIDmode
)
13897 && aarch64_imm24 (comparator
, VOIDmode
))
13899 /* SUB and SUBS. */
13900 *cost
+= COSTS_N_INSNS (2);
13902 *cost
+= extra_cost
->alu
.arith
* 2;
13906 else if (cmpcode
== LT
|| cmpcode
== GE
)
13909 if (comparator
== const0_rtx
)
13914 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
13917 if (GET_CODE (op1
) == COMPARE
)
13919 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13920 if (XEXP (op1
, 1) == const0_rtx
)
13924 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
13926 if (GET_MODE_CLASS (mode
) == MODE_INT
)
13927 *cost
+= extra_cost
->alu
.arith
;
13929 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
13934 /* It's a conditional operation based on the status flags,
13935 so it must be some flavor of CSEL. */
13937 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13938 if (GET_CODE (op1
) == NEG
13939 || GET_CODE (op1
) == NOT
13940 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
13941 op1
= XEXP (op1
, 0);
13942 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
13944 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13945 op1
= XEXP (op1
, 0);
13946 op2
= XEXP (op2
, 0);
13948 else if (GET_CODE (op1
) == ZERO_EXTEND
&& op2
== const0_rtx
)
13950 inner
= XEXP (op1
, 0);
13951 if (GET_CODE (inner
) == NEG
|| GET_CODE (inner
) == NOT
)
13952 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13953 op1
= XEXP (inner
, 0);
13955 else if (op1
== constm1_rtx
|| op1
== const1_rtx
)
13957 /* Use CSINV or CSINC. */
13958 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
13961 else if (op2
== constm1_rtx
|| op2
== const1_rtx
)
13963 /* Use CSINV or CSINC. */
13964 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
13968 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
13969 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
13973 /* We don't know what this is, cost all operands. */
13977 /* Check whether X is a bitfield operation of the form shift + extend that
13978 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13979 operand to which the bitfield operation is applied. Otherwise return
13983 aarch64_extend_bitfield_pattern_p (rtx x
)
13985 rtx_code outer_code
= GET_CODE (x
);
13986 machine_mode outer_mode
= GET_MODE (x
);
13988 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
13989 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
13992 rtx inner
= XEXP (x
, 0);
13993 rtx_code inner_code
= GET_CODE (inner
);
13994 machine_mode inner_mode
= GET_MODE (inner
);
13997 switch (inner_code
)
14000 if (CONST_INT_P (XEXP (inner
, 1))
14001 && (inner_mode
== QImode
|| inner_mode
== HImode
))
14002 op
= XEXP (inner
, 0);
14005 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
14006 && (inner_mode
== QImode
|| inner_mode
== HImode
))
14007 op
= XEXP (inner
, 0);
14010 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
14011 && (inner_mode
== QImode
|| inner_mode
== HImode
))
14012 op
= XEXP (inner
, 0);
14021 /* Return true if the mask and a shift amount from an RTX of the form
14022 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
14023 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
14026 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
14029 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
14030 && INTVAL (mask
) > 0
14031 && UINTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
14032 && exact_log2 ((UINTVAL (mask
) >> UINTVAL (shft_amnt
)) + 1) >= 0
14034 & ((HOST_WIDE_INT_1U
<< UINTVAL (shft_amnt
)) - 1)) == 0;
14037 /* Return true if the masks and a shift amount from an RTX of the form
14038 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
14039 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
14042 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode
,
14043 unsigned HOST_WIDE_INT mask1
,
14044 unsigned HOST_WIDE_INT shft_amnt
,
14045 unsigned HOST_WIDE_INT mask2
)
14047 unsigned HOST_WIDE_INT t
;
14049 /* Verify that there is no overlap in what bits are set in the two masks. */
14050 if (mask1
!= ~mask2
)
14053 /* Verify that mask2 is not all zeros or ones. */
14054 if (mask2
== 0 || mask2
== HOST_WIDE_INT_M1U
)
14057 /* The shift amount should always be less than the mode size. */
14058 gcc_assert (shft_amnt
< GET_MODE_BITSIZE (mode
));
14060 /* Verify that the mask being shifted is contiguous and would be in the
14061 least significant bits after shifting by shft_amnt. */
14062 t
= mask2
+ (HOST_WIDE_INT_1U
<< shft_amnt
);
14063 return (t
== (t
& -t
));
14066 /* Return true if X is an RTX representing an operation in the ABD family
14067 of instructions. */
14070 aarch64_abd_rtx_p (rtx x
)
14072 if (GET_CODE (x
) != MINUS
)
14074 rtx max_arm
= XEXP (x
, 0);
14075 rtx min_arm
= XEXP (x
, 1);
14076 if (GET_CODE (max_arm
) != SMAX
&& GET_CODE (max_arm
) != UMAX
)
14078 bool signed_p
= GET_CODE (max_arm
) == SMAX
;
14079 if (signed_p
&& GET_CODE (min_arm
) != SMIN
)
14081 else if (!signed_p
&& GET_CODE (min_arm
) != UMIN
)
14084 rtx maxop0
= XEXP (max_arm
, 0);
14085 rtx maxop1
= XEXP (max_arm
, 1);
14086 rtx minop0
= XEXP (min_arm
, 0);
14087 rtx minop1
= XEXP (min_arm
, 1);
14088 return rtx_equal_p (maxop0
, minop0
) && rtx_equal_p (maxop1
, minop1
);
14091 /* Calculate the cost of calculating X, storing it in *COST. Result
14092 is true if the total cost of the operation has now been calculated. */
14094 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
14095 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
14098 const struct cpu_cost_table
*extra_cost
14099 = aarch64_tune_params
.insn_extra_cost
;
14100 rtx_code code
= GET_CODE (x
);
14101 scalar_int_mode int_mode
;
14103 /* By default, assume that everything has equivalent cost to the
14104 cheapest instruction. Any additional costs are applied as a delta
14105 above this default. */
14106 *cost
= COSTS_N_INSNS (1);
14111 /* The cost depends entirely on the operands to SET. */
14113 op0
= SET_DEST (x
);
14116 switch (GET_CODE (op0
))
14121 rtx address
= XEXP (op0
, 0);
14122 if (VECTOR_MODE_P (mode
))
14123 *cost
+= extra_cost
->ldst
.storev
;
14124 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14125 *cost
+= extra_cost
->ldst
.store
;
14126 else if (mode
== SFmode
|| mode
== SDmode
)
14127 *cost
+= extra_cost
->ldst
.storef
;
14128 else if (mode
== DFmode
|| mode
== DDmode
)
14129 *cost
+= extra_cost
->ldst
.stored
;
14132 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14136 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
14140 if (! REG_P (SUBREG_REG (op0
)))
14141 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
14143 /* Fall through. */
14145 /* The cost is one per vector-register copied. */
14146 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
14148 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
14149 *cost
= COSTS_N_INSNS (nregs
);
14151 /* const0_rtx is in general free, but we will use an
14152 instruction to set a register to 0. */
14153 else if (REG_P (op1
) || op1
== const0_rtx
)
14155 /* The cost is 1 per register copied. */
14156 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
14157 *cost
= COSTS_N_INSNS (nregs
);
14160 /* Cost is just the cost of the RHS of the set. */
14161 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
14166 /* Bit-field insertion. Strip any redundant widening of
14167 the RHS to meet the width of the target. */
14168 if (SUBREG_P (op1
))
14169 op1
= SUBREG_REG (op1
);
14170 if ((GET_CODE (op1
) == ZERO_EXTEND
14171 || GET_CODE (op1
) == SIGN_EXTEND
)
14172 && CONST_INT_P (XEXP (op0
, 1))
14173 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
14174 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
14175 op1
= XEXP (op1
, 0);
14177 if (CONST_INT_P (op1
))
14179 /* MOV immediate is assumed to always be cheap. */
14180 *cost
= COSTS_N_INSNS (1);
14186 *cost
+= extra_cost
->alu
.bfi
;
14187 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
14193 /* We can't make sense of this, assume default cost. */
14194 *cost
= COSTS_N_INSNS (1);
14200 /* If an instruction can incorporate a constant within the
14201 instruction, the instruction's expression avoids calling
14202 rtx_cost() on the constant. If rtx_cost() is called on a
14203 constant, then it is usually because the constant must be
14204 moved into a register by one or more instructions.
14206 The exception is constant 0, which can be expressed
14207 as XZR/WZR and is therefore free. The exception to this is
14208 if we have (set (reg) (const0_rtx)) in which case we must cost
14209 the move. However, we can catch that when we cost the SET, so
14210 we don't need to consider that here. */
14211 if (x
== const0_rtx
)
14215 /* To an approximation, building any other constant is
14216 proportionally expensive to the number of instructions
14217 required to build that constant. This is true whether we
14218 are compiling for SPEED or otherwise. */
14219 machine_mode imode
= known_le (GET_MODE_SIZE (mode
), 4)
14221 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
14222 (NULL_RTX
, x
, false, imode
));
14228 /* First determine number of instructions to do the move
14229 as an integer constant. */
14230 if (!aarch64_float_const_representable_p (x
)
14231 && !aarch64_can_const_movi_rtx_p (x
, mode
)
14232 && aarch64_float_const_rtx_p (x
))
14234 unsigned HOST_WIDE_INT ival
;
14235 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
14236 gcc_assert (succeed
);
14238 machine_mode imode
= known_eq (GET_MODE_SIZE (mode
), 8)
14240 int ncost
= aarch64_internal_mov_immediate
14241 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
14242 *cost
+= COSTS_N_INSNS (ncost
);
14248 /* mov[df,sf]_aarch64. */
14249 if (aarch64_float_const_representable_p (x
))
14250 /* FMOV (scalar immediate). */
14251 *cost
+= extra_cost
->fp
[mode
== DFmode
|| mode
== DDmode
].fpconst
;
14252 else if (!aarch64_float_const_zero_rtx_p (x
))
14254 /* This will be a load from memory. */
14255 if (mode
== DFmode
|| mode
== DDmode
)
14256 *cost
+= extra_cost
->ldst
.loadd
;
14258 *cost
+= extra_cost
->ldst
.loadf
;
14261 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14262 or MOV v0.s[0], wzr - neither of which are modeled by the
14263 cost tables. Just use the default cost. */
14273 /* For loads we want the base cost of a load, plus an
14274 approximation for the additional cost of the addressing
14276 rtx address
= XEXP (x
, 0);
14277 if (VECTOR_MODE_P (mode
))
14278 *cost
+= extra_cost
->ldst
.loadv
;
14279 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14280 *cost
+= extra_cost
->ldst
.load
;
14281 else if (mode
== SFmode
|| mode
== SDmode
)
14282 *cost
+= extra_cost
->ldst
.loadf
;
14283 else if (mode
== DFmode
|| mode
== DDmode
)
14284 *cost
+= extra_cost
->ldst
.loadd
;
14287 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14296 if (VECTOR_MODE_P (mode
))
14298 /* Many vector comparison operations are represented as NEG
14299 of a comparison. */
14300 if (COMPARISON_P (op0
))
14302 rtx op00
= XEXP (op0
, 0);
14303 rtx op01
= XEXP (op0
, 1);
14304 machine_mode inner_mode
= GET_MODE (op00
);
14306 if (GET_MODE_CLASS (inner_mode
) == MODE_VECTOR_FLOAT
14307 && GET_CODE (op00
) == ABS
14308 && GET_CODE (op01
) == ABS
)
14310 op00
= XEXP (op00
, 0);
14311 op01
= XEXP (op01
, 0);
14313 *cost
+= rtx_cost (op00
, inner_mode
, GET_CODE (op0
), 0, speed
);
14314 *cost
+= rtx_cost (op01
, inner_mode
, GET_CODE (op0
), 1, speed
);
14316 *cost
+= extra_cost
->vect
.alu
;
14322 *cost
+= extra_cost
->vect
.alu
;
14327 if (GET_MODE_CLASS (mode
) == MODE_INT
)
14329 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
14330 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
14333 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
14337 /* Cost this as SUB wzr, X. */
14338 op0
= CONST0_RTX (mode
);
14343 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14345 /* Support (neg(fma...)) as a single instruction only if
14346 sign of zeros is unimportant. This matches the decision
14347 making in aarch64.md. */
14348 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
14351 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
14354 if (GET_CODE (op0
) == MULT
)
14357 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
14362 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
14372 if (VECTOR_MODE_P (mode
))
14373 *cost
+= extra_cost
->vect
.alu
;
14375 *cost
+= extra_cost
->alu
.clz
;
14381 if (VECTOR_MODE_P (mode
))
14383 *cost
= COSTS_N_INSNS (3);
14385 *cost
+= extra_cost
->vect
.alu
* 3;
14387 else if (TARGET_CSSC
)
14389 *cost
= COSTS_N_INSNS (1);
14391 *cost
+= extra_cost
->alu
.clz
;
14395 *cost
= COSTS_N_INSNS (2);
14397 *cost
+= extra_cost
->alu
.clz
+ extra_cost
->alu
.rev
;
14405 if (op1
== const0_rtx
14406 && GET_CODE (op0
) == AND
)
14409 mode
= GET_MODE (op0
);
14413 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
14415 /* TODO: A write to the CC flags possibly costs extra, this
14416 needs encoding in the cost tables. */
14418 mode
= GET_MODE (op0
);
14420 if (GET_CODE (op0
) == AND
)
14426 if (GET_CODE (op0
) == PLUS
)
14428 /* ADDS (and CMN alias). */
14433 if (GET_CODE (op0
) == MINUS
)
14440 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
14441 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
14442 && CONST_INT_P (XEXP (op0
, 2)))
14444 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14445 Handle it here directly rather than going to cost_logic
14446 since we know the immediate generated for the TST is valid
14447 so we can avoid creating an intermediate rtx for it only
14448 for costing purposes. */
14450 *cost
+= extra_cost
->alu
.logical
;
14452 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
14453 ZERO_EXTRACT
, 0, speed
);
14457 if (GET_CODE (op1
) == NEG
)
14461 *cost
+= extra_cost
->alu
.arith
;
14463 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
14464 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
14470 Compare can freely swap the order of operands, and
14471 canonicalization puts the more complex operation first.
14472 But the integer MINUS logic expects the shift/extend
14473 operation in op1. */
14475 || (SUBREG_P (op0
) && REG_P (SUBREG_REG (op0
)))))
14483 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
14487 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
14489 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
14491 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
14492 /* FCMP supports constant 0.0 for no extra cost. */
14498 if (VECTOR_MODE_P (mode
))
14500 /* Vector compare. */
14502 *cost
+= extra_cost
->vect
.alu
;
14504 if (aarch64_float_const_zero_rtx_p (op1
))
14506 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14520 if (VECTOR_MODE_P (mode
))
14522 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14523 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
14525 /* Recognise the SABD and UABD operation here.
14526 Recursion from the PLUS case will catch the accumulating
14528 if (aarch64_abd_rtx_p (x
))
14531 *cost
+= extra_cost
->vect
.alu
;
14534 /* SUBL2 and SUBW2.
14535 The select-operand-high-half versions of the sub instruction
14536 have the same cost as the regular three vector version -
14537 don't add the costs of the select into the costs of the sub.
14539 op0
= aarch64_strip_extend_vec_half (op0
);
14540 op1
= aarch64_strip_extend_vec_half (op1
);
14544 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
14546 /* Detect valid immediates. */
14547 if ((GET_MODE_CLASS (mode
) == MODE_INT
14548 || (GET_MODE_CLASS (mode
) == MODE_CC
14549 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
14550 && CONST_INT_P (op1
)
14551 && aarch64_uimm12_shift (INTVAL (op1
)))
14554 /* SUB(S) (immediate). */
14555 *cost
+= extra_cost
->alu
.arith
;
14559 /* Look for SUB (extended register). */
14560 if (is_a
<scalar_int_mode
> (mode
)
14561 && aarch64_rtx_arith_op_extract_p (op1
))
14564 *cost
+= extra_cost
->alu
.extend_arith
;
14566 op1
= aarch64_strip_extend (op1
, true);
14567 *cost
+= rtx_cost (op1
, VOIDmode
,
14568 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
14572 rtx new_op1
= aarch64_strip_extend (op1
, false);
14574 /* Cost this as an FMA-alike operation. */
14575 if ((GET_CODE (new_op1
) == MULT
14576 || aarch64_shift_p (GET_CODE (new_op1
)))
14577 && code
!= COMPARE
)
14579 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
14580 (enum rtx_code
) code
,
14585 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
14589 if (VECTOR_MODE_P (mode
))
14592 *cost
+= extra_cost
->vect
.alu
;
14594 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14597 *cost
+= extra_cost
->alu
.arith
;
14599 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14602 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
14616 if (VECTOR_MODE_P (mode
))
14618 /* ADDL2 and ADDW2. */
14619 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14620 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
14622 /* The select-operand-high-half versions of the add instruction
14623 have the same cost as the regular three vector version -
14624 don't add the costs of the select into the costs of the add.
14626 op0
= aarch64_strip_extend_vec_half (op0
);
14627 op1
= aarch64_strip_extend_vec_half (op1
);
14631 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
14632 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
14635 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
14636 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
14640 if (GET_MODE_CLASS (mode
) == MODE_INT
14641 && (aarch64_plus_immediate (op1
, mode
)
14642 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
14644 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
14648 /* ADD (immediate). */
14649 *cost
+= extra_cost
->alu
.arith
;
14651 /* Some tunings prefer to not use the VL-based scalar ops.
14652 Increase the cost of the poly immediate to prevent their
14654 if (GET_CODE (op1
) == CONST_POLY_INT
14655 && (aarch64_tune_params
.extra_tuning_flags
14656 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
))
14657 *cost
+= COSTS_N_INSNS (1);
14662 if (aarch64_pluslong_immediate (op1
, mode
))
14664 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14665 if ((INTVAL (op1
) & 0xfff) != 0)
14666 *cost
+= COSTS_N_INSNS (1);
14668 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
14672 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
14674 /* Look for ADD (extended register). */
14675 if (is_a
<scalar_int_mode
> (mode
)
14676 && aarch64_rtx_arith_op_extract_p (op0
))
14679 *cost
+= extra_cost
->alu
.extend_arith
;
14681 op0
= aarch64_strip_extend (op0
, true);
14682 *cost
+= rtx_cost (op0
, VOIDmode
,
14683 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
14687 /* Strip any extend, leave shifts behind as we will
14688 cost them through mult_cost. */
14689 new_op0
= aarch64_strip_extend (op0
, false);
14691 if (GET_CODE (new_op0
) == MULT
14692 || aarch64_shift_p (GET_CODE (new_op0
)))
14694 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
14699 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
14703 if (VECTOR_MODE_P (mode
))
14706 *cost
+= extra_cost
->vect
.alu
;
14708 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14711 *cost
+= extra_cost
->alu
.arith
;
14713 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14716 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
14724 *cost
= COSTS_N_INSNS (1);
14728 if (VECTOR_MODE_P (mode
))
14729 *cost
+= extra_cost
->vect
.alu
;
14731 *cost
+= extra_cost
->alu
.rev
;
14736 if (aarch_rev16_p (x
))
14738 *cost
= COSTS_N_INSNS (1);
14742 if (VECTOR_MODE_P (mode
))
14743 *cost
+= extra_cost
->vect
.alu
;
14745 *cost
+= extra_cost
->alu
.rev
;
14750 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
14752 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
14753 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
14755 *cost
+= extra_cost
->alu
.shift
;
14759 /* Fall through. */
14766 if (VECTOR_MODE_P (mode
))
14769 *cost
+= extra_cost
->vect
.alu
;
14774 && GET_CODE (op0
) == MULT
14775 && CONST_INT_P (XEXP (op0
, 1))
14776 && CONST_INT_P (op1
)
14777 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
14778 INTVAL (op1
)) != 0)
14780 /* This is a UBFM/SBFM. */
14781 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
14783 *cost
+= extra_cost
->alu
.bfx
;
14787 if (is_int_mode (mode
, &int_mode
))
14789 if (CONST_INT_P (op1
))
14791 /* We have a mask + shift version of a UBFIZ
14792 i.e. the *andim_ashift<mode>_bfiz pattern. */
14793 if (GET_CODE (op0
) == ASHIFT
14794 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
14797 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
14798 (enum rtx_code
) code
, 0, speed
);
14800 *cost
+= extra_cost
->alu
.bfx
;
14804 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
14806 /* We possibly get the immediate for free, this is not
14808 *cost
+= rtx_cost (op0
, int_mode
,
14809 (enum rtx_code
) code
, 0, speed
);
14811 *cost
+= extra_cost
->alu
.logical
;
14820 /* Handle ORN, EON, or BIC. */
14821 if (GET_CODE (op0
) == NOT
)
14822 op0
= XEXP (op0
, 0);
14824 new_op0
= aarch64_strip_shift (op0
);
14826 /* If we had a shift on op0 then this is a logical-shift-
14827 by-register/immediate operation. Otherwise, this is just
14828 a logical operation. */
14831 if (new_op0
!= op0
)
14833 /* Shift by immediate. */
14834 if (CONST_INT_P (XEXP (op0
, 1)))
14835 *cost
+= extra_cost
->alu
.log_shift
;
14837 *cost
+= extra_cost
->alu
.log_shift_reg
;
14840 *cost
+= extra_cost
->alu
.logical
;
14843 /* In both cases we want to cost both operands. */
14844 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
14846 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
14856 op0
= aarch64_strip_shift (x
);
14858 if (VECTOR_MODE_P (mode
))
14861 *cost
+= extra_cost
->vect
.alu
;
14865 /* MVN-shifted-reg. */
14868 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
14871 *cost
+= extra_cost
->alu
.log_shift
;
14875 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14876 Handle the second form here taking care that 'a' in the above can
14878 else if (GET_CODE (op0
) == XOR
)
14880 rtx newop0
= XEXP (op0
, 0);
14881 rtx newop1
= XEXP (op0
, 1);
14882 rtx op0_stripped
= aarch64_strip_shift (newop0
);
14884 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
14885 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
14889 if (op0_stripped
!= newop0
)
14890 *cost
+= extra_cost
->alu
.log_shift
;
14892 *cost
+= extra_cost
->alu
.logical
;
14899 *cost
+= extra_cost
->alu
.logical
;
14906 /* If a value is written in SI mode, then zero extended to DI
14907 mode, the operation will in general be free as a write to
14908 a 'w' register implicitly zeroes the upper bits of an 'x'
14909 register. However, if this is
14911 (set (reg) (zero_extend (reg)))
14913 we must cost the explicit register move. */
14915 && GET_MODE (op0
) == SImode
)
14917 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
14919 /* If OP_COST is non-zero, then the cost of the zero extend
14920 is effectively the cost of the inner operation. Otherwise
14921 we have a MOV instruction and we take the cost from the MOV
14922 itself. This is true independently of whether we are
14923 optimizing for space or time. */
14929 else if (MEM_P (op0
))
14931 /* All loads can zero extend to any size for free. */
14932 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
14936 op0
= aarch64_extend_bitfield_pattern_p (x
);
14939 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
14941 *cost
+= extra_cost
->alu
.bfx
;
14947 if (VECTOR_MODE_P (mode
))
14950 *cost
+= extra_cost
->vect
.alu
;
14954 /* We generate an AND instead of UXTB/UXTH. */
14955 *cost
+= extra_cost
->alu
.logical
;
14961 if (MEM_P (XEXP (x
, 0)))
14966 rtx address
= XEXP (XEXP (x
, 0), 0);
14967 *cost
+= extra_cost
->ldst
.load_sign_extend
;
14970 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14976 op0
= aarch64_extend_bitfield_pattern_p (x
);
14979 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
14981 *cost
+= extra_cost
->alu
.bfx
;
14987 if (VECTOR_MODE_P (mode
))
14988 *cost
+= extra_cost
->vect
.alu
;
14990 *cost
+= extra_cost
->alu
.extend
;
15002 if (CONST_INT_P (op1
))
15006 if (VECTOR_MODE_P (mode
))
15008 /* Vector shift (immediate). */
15009 *cost
+= extra_cost
->vect
.alu
;
15013 /* LSL (immediate), ASR (immediate), UBMF, UBFIZ and friends.
15014 These are all aliases. */
15015 *cost
+= extra_cost
->alu
.shift
;
15019 /* We can incorporate zero/sign extend for free. */
15020 if (GET_CODE (op0
) == ZERO_EXTEND
15021 || GET_CODE (op0
) == SIGN_EXTEND
)
15022 op0
= XEXP (op0
, 0);
15024 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
15029 if (VECTOR_MODE_P (mode
))
15032 /* Vector shift (register). */
15033 *cost
+= extra_cost
->vect
.alu
;
15039 *cost
+= extra_cost
->alu
.shift_reg
;
15041 /* The register shift amount may be in a shorter mode expressed
15042 as a lowpart SUBREG. For costing purposes just look inside. */
15043 if (SUBREG_P (op1
) && subreg_lowpart_p (op1
))
15044 op1
= SUBREG_REG (op1
);
15045 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
15046 && CONST_INT_P (XEXP (op1
, 1))
15047 && known_eq (INTVAL (XEXP (op1
, 1)),
15048 GET_MODE_BITSIZE (mode
) - 1))
15050 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
15051 /* We already demanded XEXP (op1, 0) to be REG_P, so
15052 don't recurse into it. */
15056 return false; /* All arguments need to be in registers. */
15061 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
15062 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
15066 *cost
+= extra_cost
->ldst
.load
;
15068 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
15069 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
15071 /* ADRP, followed by ADD. */
15072 *cost
+= COSTS_N_INSNS (1);
15074 *cost
+= 2 * extra_cost
->alu
.arith
;
15076 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
15077 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
15081 *cost
+= extra_cost
->alu
.arith
;
15086 /* One extra load instruction, after accessing the GOT. */
15087 *cost
+= COSTS_N_INSNS (1);
15089 *cost
+= extra_cost
->ldst
.load
;
15095 /* ADRP/ADD (immediate). */
15097 *cost
+= extra_cost
->alu
.arith
;
15105 if (VECTOR_MODE_P (mode
))
15106 *cost
+= extra_cost
->vect
.alu
;
15108 *cost
+= extra_cost
->alu
.bfx
;
15111 /* We can trust that the immediates used will be correct (there
15112 are no by-register forms), so we need only cost op0. */
15113 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
15117 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
15118 /* aarch64_rtx_mult_cost always handles recursion to its
15123 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15124 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
15125 an unconditional negate. This case should only ever be reached through
15126 the set_smod_pow2_cheap check in expmed.cc. */
15127 if (CONST_INT_P (XEXP (x
, 1))
15128 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
15129 && (mode
== SImode
|| mode
== DImode
))
15131 /* We expand to 4 instructions. Reset the baseline. */
15132 *cost
= COSTS_N_INSNS (4);
15135 *cost
+= 2 * extra_cost
->alu
.logical
15136 + 2 * extra_cost
->alu
.arith
;
15141 /* Fall-through. */
15145 /* Slighly prefer UMOD over SMOD. */
15146 if (VECTOR_MODE_P (mode
))
15147 *cost
+= extra_cost
->vect
.alu
;
15148 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15149 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
15150 + extra_cost
->mult
[mode
== DImode
].idiv
15151 + (code
== MOD
? 1 : 0));
15153 return false; /* All arguments need to be in registers. */
15160 if (VECTOR_MODE_P (mode
))
15161 *cost
+= extra_cost
->vect
.alu
;
15162 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15163 /* There is no integer SQRT, so only DIV and UDIV can get
15165 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
15166 /* Slighly prefer UDIV over SDIV. */
15167 + (code
== DIV
? 1 : 0));
15169 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
15171 return false; /* All arguments need to be in registers. */
15174 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
15175 XEXP (x
, 2), cost
, speed
);
15188 return false; /* All arguments must be in registers. */
15197 if (VECTOR_MODE_P (mode
))
15198 *cost
+= extra_cost
->vect
.alu
;
15200 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
15203 /* FMSUB, FNMADD, and FNMSUB are free. */
15204 if (GET_CODE (op0
) == NEG
)
15205 op0
= XEXP (op0
, 0);
15207 if (GET_CODE (op2
) == NEG
)
15208 op2
= XEXP (op2
, 0);
15210 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15211 and the by-element operand as operand 0. */
15212 if (GET_CODE (op1
) == NEG
)
15213 op1
= XEXP (op1
, 0);
15215 /* Catch vector-by-element operations. The by-element operand can
15216 either be (vec_duplicate (vec_select (x))) or just
15217 (vec_select (x)), depending on whether we are multiplying by
15218 a vector or a scalar.
15220 Canonicalization is not very good in these cases, FMA4 will put the
15221 by-element operand as operand 0, FNMA4 will have it as operand 1. */
15222 if (GET_CODE (op0
) == VEC_DUPLICATE
)
15223 op0
= XEXP (op0
, 0);
15224 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
15225 op1
= XEXP (op1
, 0);
15227 if (GET_CODE (op0
) == VEC_SELECT
)
15228 op0
= XEXP (op0
, 0);
15229 else if (GET_CODE (op1
) == VEC_SELECT
)
15230 op1
= XEXP (op1
, 0);
15232 /* If the remaining parameters are not registers,
15233 get the cost to put them into registers. */
15234 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
15235 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
15236 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
15240 case UNSIGNED_FLOAT
:
15242 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
15248 if (VECTOR_MODE_P (mode
))
15250 /*Vector truncate. */
15251 *cost
+= extra_cost
->vect
.alu
;
15254 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
15258 case FLOAT_TRUNCATE
:
15261 if (VECTOR_MODE_P (mode
))
15263 /*Vector conversion. */
15264 *cost
+= extra_cost
->vect
.alu
;
15267 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
15274 /* Strip the rounding part. They will all be implemented
15275 by the fcvt* family of instructions anyway. */
15276 if (GET_CODE (x
) == UNSPEC
)
15278 unsigned int uns_code
= XINT (x
, 1);
15280 if (uns_code
== UNSPEC_FRINTA
15281 || uns_code
== UNSPEC_FRINTM
15282 || uns_code
== UNSPEC_FRINTN
15283 || uns_code
== UNSPEC_FRINTP
15284 || uns_code
== UNSPEC_FRINTZ
)
15285 x
= XVECEXP (x
, 0, 0);
15290 if (VECTOR_MODE_P (mode
))
15291 *cost
+= extra_cost
->vect
.alu
;
15293 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
15296 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15297 fixed-point fcvt. */
15298 if (GET_CODE (x
) == MULT
15299 && ((VECTOR_MODE_P (mode
)
15300 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
15301 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
15303 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
15308 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
15312 if (VECTOR_MODE_P (mode
))
15314 /* ABS (vector). */
15316 *cost
+= extra_cost
->vect
.alu
;
15318 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15322 /* FABD, which is analogous to FADD. */
15323 if (GET_CODE (op0
) == MINUS
)
15325 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
15326 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
15328 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15332 /* Simple FABS is analogous to FNEG. */
15334 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
15338 /* Integer ABS will either be split to
15339 two arithmetic instructions, or will be an ABS
15340 (scalar), which we don't model. */
15341 *cost
= COSTS_N_INSNS (2);
15343 *cost
+= 2 * extra_cost
->alu
.arith
;
15351 if (VECTOR_MODE_P (mode
))
15352 *cost
+= extra_cost
->vect
.alu
;
15355 /* FMAXNM/FMINNM/FMAX/FMIN.
15356 TODO: This may not be accurate for all implementations, but
15357 we do not model this in the cost tables. */
15358 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15364 /* The floating point round to integer frint* instructions. */
15365 if (aarch64_frint_unspec_p (XINT (x
, 1)))
15368 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
15376 /* Decompose <su>muldi3_highpart. */
15377 if (/* (truncate:DI */
15380 && GET_MODE (XEXP (x
, 0)) == TImode
15381 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
15383 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
15384 /* (ANY_EXTEND:TI (reg:DI))
15385 (ANY_EXTEND:TI (reg:DI))) */
15386 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
15387 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
15388 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
15389 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
15390 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
15391 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
15392 /* (const_int 64) */
15393 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
15394 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
15398 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
15399 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
15400 mode
, MULT
, 0, speed
);
15401 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
15402 mode
, MULT
, 1, speed
);
15408 /* Load using MOVI/MVNI. */
15409 if (aarch64_simd_valid_immediate (x
, NULL
))
15410 *cost
= extra_cost
->vect
.movi
;
15411 else /* Load using constant pool. */
15412 *cost
= extra_cost
->ldst
.load
;
15416 /* depending on the operation, either DUP or INS.
15417 For now, keep default costing. */
15419 case VEC_DUPLICATE
:
15420 /* Load using a DUP. */
15421 *cost
= extra_cost
->vect
.dup
;
15425 rtx op0
= XEXP (x
, 0);
15426 *cost
= rtx_cost (op0
, GET_MODE (op0
), VEC_SELECT
, 0, speed
);
15428 /* cost subreg of 0 as free, otherwise as DUP */
15429 rtx op1
= XEXP (x
, 1);
15430 if (vec_series_lowpart_p (mode
, GET_MODE (op1
), op1
))
15432 else if (vec_series_highpart_p (mode
, GET_MODE (op1
), op1
))
15433 *cost
= extra_cost
->vect
.dup
;
15435 *cost
= extra_cost
->vect
.extract
;
15443 && flag_aarch64_verbose_cost
)
15444 fprintf (dump_file
,
15445 "\nFailed to cost RTX. Assuming default cost.\n");
15450 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15451 calculated for X. This cost is stored in *COST. Returns true
15452 if the total cost of X was calculated. */
15454 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
15455 int param
, int *cost
, bool speed
)
15457 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
15460 && flag_aarch64_verbose_cost
)
15462 print_rtl_single (dump_file
, x
);
15463 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
15464 speed
? "Hot" : "Cold",
15465 *cost
, result
? "final" : "partial");
15472 aarch64_register_move_cost (machine_mode mode
,
15473 reg_class_t from_i
, reg_class_t to_i
)
15475 enum reg_class from
= (enum reg_class
) from_i
;
15476 enum reg_class to
= (enum reg_class
) to_i
;
15477 const struct cpu_regmove_cost
*regmove_cost
15478 = aarch64_tune_params
.regmove_cost
;
15480 /* Trest any subset of POINTER_REGS as though it were GENERAL_REGS. */
15481 if (reg_class_subset_p (to
, POINTER_REGS
))
15484 if (reg_class_subset_p (from
, POINTER_REGS
))
15485 from
= GENERAL_REGS
;
15487 /* Make RDFFR very expensive. In particular, if we know that the FFR
15488 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15489 as a way of obtaining a PTRUE. */
15490 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
15491 && hard_reg_set_subset_p (reg_class_contents
[from_i
],
15492 reg_class_contents
[FFR_REGS
]))
15495 /* Moving between GPR and stack cost is the same as GP2GP. */
15496 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
15497 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
15498 return regmove_cost
->GP2GP
;
15500 /* To/From the stack register, we move via the gprs. */
15501 if (to
== STACK_REG
|| from
== STACK_REG
)
15502 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
15503 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
15505 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15506 if (vec_flags
!= (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
)
15507 && known_eq (GET_MODE_SIZE (mode
), 16))
15509 /* 128-bit operations on general registers require 2 instructions. */
15510 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
15511 return regmove_cost
->GP2GP
* 2;
15512 else if (from
== GENERAL_REGS
)
15513 return regmove_cost
->GP2FP
* 2;
15514 else if (to
== GENERAL_REGS
)
15515 return regmove_cost
->FP2GP
* 2;
15517 /* When AdvSIMD instructions are disabled it is not possible to move
15518 a 128-bit value directly between Q registers. This is handled in
15519 secondary reload. A general register is used as a scratch to move
15520 the upper DI value and the lower DI value is moved directly,
15521 hence the cost is the sum of three moves. */
15522 if (!TARGET_SIMD
&& !TARGET_SVE
)
15523 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
15525 return regmove_cost
->FP2FP
;
15528 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
15529 return regmove_cost
->GP2GP
;
15530 else if (from
== GENERAL_REGS
)
15531 return regmove_cost
->GP2FP
;
15532 else if (to
== GENERAL_REGS
)
15533 return regmove_cost
->FP2GP
;
15535 if (!TARGET_SIMD
&& vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15537 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15538 The cost must be greater than 2 units to indicate that direct
15539 moves aren't possible. */
15540 auto per_vector
= (aarch64_tune_params
.memmov_cost
.load_fp
15541 + aarch64_tune_params
.memmov_cost
.store_fp
);
15542 return MIN (CEIL (per_vector
, 2), 4);
15545 return regmove_cost
->FP2FP
;
/* Implements TARGET_MEMORY_MOVE_COST.  Return the cost of moving MODE
   between a register in RCLASS_I and memory.  IN is true for loads
   (memory -> register) and false for stores.  The costs come from the
   per-CPU tuning table, split by predicate, FP/vector and integer
   classes.  */
static int
aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
{
  enum reg_class rclass = (enum reg_class) rclass_i;
  /* Predicate moves: for boolean vector modes any class that overlaps
     PR_REGS counts, otherwise only classes contained in PR_REGS.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      ? reg_classes_intersect_p (rclass, PR_REGS)
      : reg_class_subset_p (rclass, PR_REGS))
    return (in
	    ? aarch64_tune_params.memmov_cost.load_pred
	    : aarch64_tune_params.memmov_cost.store_pred);

  /* FP/vector registers, with the same intersect-vs-subset distinction.  */
  if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
      ? reg_classes_intersect_p (rclass, FP_REGS)
      : reg_class_subset_p (rclass, FP_REGS))
    return (in
	    ? aarch64_tune_params.memmov_cost.load_fp
	    : aarch64_tune_params.memmov_cost.store_fp);

  /* Everything else is treated as a general-register move.  */
  return (in
	  ? aarch64_tune_params.memmov_cost.load_int
	  : aarch64_tune_params.memmov_cost.store_int);
}
15572 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15573 much more productive here, such as using insn attributes to cost things.
15574 But we don't, not yet.
15576 The main point of this current definition is to make calling insn_cost
15577 on one instruction equivalent to calling seq_cost on a sequence that
15578 contains only that instruction. The default definition would instead
15579 only look at SET_SRCs, ignoring SET_DESTs.
15581 This ensures that, for example, storing a 128-bit zero vector is more
15582 expensive than storing a 128-bit vector register. A move of zero
15583 into a 128-bit vector register followed by multiple stores of that
15584 register is then cheaper than multiple stores of zero (which would
15585 use STP of XZR). This in turn allows STP Qs to be formed. */
15587 aarch64_insn_cost (rtx_insn
*insn
, bool speed
)
15589 if (rtx set
= single_set (insn
))
15590 return set_rtx_cost (set
, speed
);
15591 return pattern_cost (PATTERN (insn
), speed
);
/* Implement TARGET_INIT_BUILTINS.  Register the general (Advanced SIMD,
   ACLE, etc.) builtins and the SVE builtins, plus any subtarget-specific
   ones.  */
static void
aarch64_init_builtins ()
{
  aarch64_general_init_builtins ();
  aarch64_sve::init_builtins ();
#ifdef SUBTARGET_INIT_BUILTINS
  SUBTARGET_INIT_BUILTINS;
#endif
}
/* Implement TARGET_FOLD_BUILTIN.  Try to fold a call to FNDECL with
   NARGS arguments ARGS at the tree level.  The function code encodes
   both the builtin class (low bits) and the per-class subcode.  */
static tree
aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
{
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  tree type = TREE_TYPE (TREE_TYPE (fndecl));
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_fold_builtin (subcode, type, nargs, args);

    case AARCH64_BUILTIN_SVE:
      /* SVE builtins are folded via the gimple hook instead.  */
      return NULL_TREE;
    }
  gcc_unreachable ();
}
/* Implement TARGET_GIMPLE_FOLD_BUILTIN.  Try to fold the builtin call
   at GSI into a simpler statement; return true and replace the call if
   a fold was found, otherwise return false.  */
static bool
aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
  tree fndecl = gimple_call_fndecl (stmt);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  gimple *new_stmt = NULL;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
      break;

    case AARCH64_BUILTIN_SVE:
      new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
      break;
    }

  /* A null result means no fold applied.  */
  if (!new_stmt)
    return false;

  gsi_replace (gsi, new_stmt, false);
  return true;
}
/* Implement TARGET_EXPAND_BUILTIN.  Expand the builtin call EXP to RTL,
   placing the result in TARGET if convenient.  IGNORE is true if the
   result is unused.  Dispatch on the builtin class encoded in the
   function code.  */
static rtx
aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_expand_builtin (subcode, exp, target, ignore);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::expand_builtin (subcode, exp, target);
    }
  gcc_unreachable ();
}
/* Implement TARGET_BUILTIN_DECL.  Return the declaration for the builtin
   with the given function CODE, dispatching on the builtin class.  */
static tree
aarch64_builtin_decl (unsigned int code, bool initialize_p)
{
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_decl (subcode, initialize_p);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::builtin_decl (subcode, initialize_p);
    }
  gcc_unreachable ();
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  Requires -funsafe-math-optimizations and no
   trapping math; the approximation must either be enabled by the tuning
   table for MODE or forced by -mlow-precision-recip-sqrt.  */
static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}
/* Function to decide when to use the approximate reciprocal square root
   builtin.  Implements TARGET_BUILTIN_RECIPROCAL: return a builtin that
   computes an approximate 1/sqrt for FNDECL's mode, or NULL_TREE if the
   approximation should not be used.  */
static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_rsqrt (subcode);

    case AARCH64_BUILTIN_SVE:
      /* No SVE rsqrt builtins are provided this way.  */
      return NULL_TREE;
    }
  gcc_unreachable ();
}
/* Emit code to perform the floating-point operation:

     DST = SRC1 * SRC2

   where all three operands are already known to be registers.
   If the operation is an SVE one, PTRUE is a suitable all-true
   predicate; otherwise PTRUE is null and a plain MULT is emitted.  */
static void
aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
{
  if (ptrue)
    /* Predicated SVE FMUL with a relaxed governing predicate.  */
    emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
				 dst, ptrue, src1, src2,
				 gen_int_mode (SVE_RELAXED_GP, SImode)));
  else
    emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.

   Uses the FRSQRTE estimate followed by FRSQRTS Newton-Raphson steps;
   works for scalar, Advanced SIMD and SVE modes (PG is an all-true
   predicate for the SVE case).  */
bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  /* No half-precision estimate instructions.  */
  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      /* For sqrt itself, the approximation must be enabled by tuning
	 or forced by -mlow-precision-sqrt, and the usual fast-math
	 preconditions must hold.  */
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (!flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
  machine_mode mmsk = (VECTOR_MODE_P (mode)
		       ? related_int_vector_mode (mode).require ()
		       : int_mode_for_mode (mode).require ());
  rtx xmsk = NULL_RTX;
  if (!recp)
    {
      /* When calculating the approximate square root, compare the
	 argument with 0.0 and create a mask.  */
      rtx zero = CONST0_RTX (mode);
      if (pg)
	{
	  xmsk = gen_reg_rtx (GET_MODE (pg));
	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
					   xmsk, pg, hint, src, zero));
	}
      else
	{
	  xmsk = gen_reg_rtx (mmsk);
	  emit_insn (gen_rtx_SET (xmsk,
				  gen_rtx_NEG (mmsk,
					       gen_rtx_EQ (mmsk, src, zero))));
	}
    }

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      aarch64_emit_mult (x2, pg, xdst, xdst);

      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));

      if (iterations > 0)
	aarch64_emit_mult (xdst, pg, xdst, x1);
    }

  if (!recp)
    {
      if (pg)
	/* Multiply nonzero source values by the corresponding intermediate
	   result elements, so that the final calculation is the approximate
	   square root rather than its reciprocal.  Select a zero result for
	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
	   otherwise.  */
	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
      else
	{
	  /* Qualify the approximate reciprocal square root when the
	     argument is 0.0 by squashing the intermediary result to 0.0.  */
	  rtx xtmp = gen_reg_rtx (mmsk);
	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					    gen_rtx_SUBREG (mmsk, xdst, 0)));
	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

	  /* Calculate the approximate square root.  */
	  aarch64_emit_mult (xdst, pg, xdst, src);
	}
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (dst, pg, xdst, x1);

  return true;
}
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.

   Uses FRECPE to estimate 1/DEN and refines it with FRECPS Newton-Raphson
   steps, finally multiplying by NUM.  */
bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  /* No half-precision estimate instructions.  */
  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
			        || (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  /* The approximation is only valid under the usual fast-math
     preconditions and when it has been requested.  */
  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series less for faster performance,
     while sacrificing the accuracy.  The default is 2 for DF and 1 for SF.  */
  if (flag_mlow_precision_div)
    iterations = (GET_MODE_INNER (mode) == DFmode
		  ? aarch64_double_recp_precision
		  : aarch64_float_recp_precision);

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));

      if (iterations > 0)
	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      aarch64_emit_mult (xrcp, pg, xrcp, xnum);
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (quo, pg, xrcp, xtmp);
  return true;
}
/* Return the number of instructions that can be issued per cycle.
   Implements TARGET_SCHED_ISSUE_RATE using the active tuning table.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}
/* Implement TARGET_SCHED_VARIABLE_ISSUE.  Return how many more insns can
   be issued this cycle after scheduling INSN, given that MORE could be
   issued before it.  Insns that emit no code do not consume issue slots.  */
static int
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
{
  /* Debug insns generate no code.  */
  if (DEBUG_INSN_P (insn))
    return more;

  /* USE and CLOBBER markers generate no code either.  */
  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)
    return more;

  /* Patterns whose type attribute says they emit no instruction.  */
  if (get_attr_type (insn) == TYPE_NO_INSN)
    return more;

  return more - 1;
}
/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD.  Use the
   issue rate as the lookahead depth, but disable multipass lookahead
   for single-issue cores and during sched-fusion.  */
static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}
/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.cc.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* If a vld1 from address ADDR should be recorded in vector_load_decls,
   return the decl that should be recorded.  Return null otherwise.
   Only plain variable addresses (ADDR_EXPR of a VAR_DECL base) qualify.  */
static tree
aarch64_vector_load_decl (tree addr)
{
  if (TREE_CODE (addr) != ADDR_EXPR)
    return NULL_TREE;
  tree base = get_base_address (TREE_OPERAND (addr, 0));
  if (TREE_CODE (base) != VAR_DECL)
    return NULL_TREE;
  return base;
}
/* Return true if STMT_INFO accesses a decl that is known to be the
   argument to a vld1 in the same function.  Uses the per-function
   vector_load_decls set recorded during builtin expansion.  */
static bool
aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
{
  if (!cfun->machine->vector_load_decls)
    return false;
  auto dr = STMT_VINFO_DATA_REF (stmt_info);
  if (!dr)
    return false;

  tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
  return decl && cfun->machine->vector_load_decls->contains (decl);
}
/* Information about how the CPU would issue the scalar, Advanced SIMD
   or SVE version of a vector loop, using the scheme defined by the
   aarch64_base_vec_issue_info hierarchy of structures.  */
class aarch64_vec_op_count
{
public:
  aarch64_vec_op_count () = default;
  aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
			unsigned int = 1);

  unsigned int vec_flags () const { return m_vec_flags; }
  unsigned int vf_factor () const { return m_vf_factor; }

  const aarch64_base_vec_issue_info *base_issue_info () const;
  const aarch64_simd_vec_issue_info *simd_issue_info () const;
  const aarch64_sve_vec_issue_info *sve_issue_info () const;

  fractional_cost rename_cycles_per_iter () const;
  fractional_cost min_nonpred_cycles_per_iter () const;
  fractional_cost min_pred_cycles_per_iter () const;
  fractional_cost min_cycles_per_iter () const;

  void dump () const;

  /* The number of individual "general" operations.  See the comments
     in aarch64_base_vec_issue_info for details.  */
  unsigned int general_ops = 0;

  /* The number of load and store operations, under the same scheme
     as above.  */
  unsigned int loads = 0;
  unsigned int stores = 0;

  /* The minimum number of cycles needed to execute all loop-carried
     operations, which in the vector code become associated with
     reductions.  */
  unsigned int reduction_latency = 0;

  /* The number of individual predicate operations.  See the comments
     in aarch64_sve_vec_issue_info for details.  */
  unsigned int pred_ops = 0;

private:
  /* The issue information for the core.  */
  const aarch64_vec_issue_info *m_issue_info = nullptr;

  /* - If M_VEC_FLAGS is zero then this structure describes scalar code
     - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
       Advanced SIMD code.
     - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
       SVE code.  */
  unsigned int m_vec_flags = 0;

  /* Assume that, when the code is executing on the core described
     by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
     times more data than the vectorizer anticipates.

     This is only ever different from 1 for SVE.  It allows us to consider
     what would happen on a 256-bit SVE target even when the -mtune
     parameters say that the “likely” SVE length is 128 bits.  */
  unsigned int m_vf_factor = 1;
};
/* Construct a count structure for a loop costed with ISSUE_INFO,
   vectorized (or not) according to VEC_FLAGS, with the SVE
   vector-length scaling factor VF_FACTOR (see m_vf_factor).  */
aarch64_vec_op_count::
aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
		      unsigned int vec_flags, unsigned int vf_factor)
  : m_issue_info (issue_info),
    m_vec_flags (vec_flags),
    m_vf_factor (vf_factor)
{
}
/* Return the base issue information (i.e. the parts that make sense
   for both scalar and vector code).  Return null if we have no issue
   information.  */
const aarch64_base_vec_issue_info *
aarch64_vec_op_count::base_issue_info () const
{
  /* Vector issue info subsumes the base info; fall back to scalar.  */
  if (auto *ret = simd_issue_info ())
    return ret;
  return m_issue_info->scalar;
}
/* If the structure describes vector code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_simd_vec_issue_info *
aarch64_vec_op_count::simd_issue_info () const
{
  /* SVE issue info is the most specific form of SIMD issue info.  */
  if (auto *ret = sve_issue_info ())
    return ret;
  if (m_vec_flags)
    return m_issue_info->advsimd;
  return nullptr;
}
/* If the structure describes SVE code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_sve_vec_issue_info *
aarch64_vec_op_count::sve_issue_info () const
{
  if (m_vec_flags & VEC_ANY_SVE)
    return m_issue_info->sve;
  return nullptr;
}
/* Estimate the minimum number of cycles per iteration needed to rename
   the instructions.

   ??? For now this is done inline rather than via cost tables, since it
   isn't clear how it should be parameterized for the general case.  */
fractional_cost
aarch64_vec_op_count::rename_cycles_per_iter () const
{
  /* Only modelled for cores known to be rename-limited (5-wide rename).  */
  if (sve_issue_info () == &neoverse512tvb_sve_issue_info
      || sve_issue_info () == &neoversen2_sve_issue_info
      || sve_issue_info () == &neoversev2_sve_issue_info)
    /* + 1 for an addition.  We've already counted a general op for each
       store, so we don't need to account for stores separately.  The branch
       reads no registers and so does not need to be counted either.

       ??? This value is very much on the pessimistic side, but seems to work
       pretty well in practice.  */
    return { general_ops + loads + pred_ops + 1, 5 };

  return 0;
}
/* Like min_cycles_per_iter, but excluding predicate operations.
   Takes the maximum over every issue-limit class (stores, combined
   loads+stores, general ops, reduction latency, rename).  */
fractional_cost
aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
{
  auto *issue_info = base_issue_info ();

  fractional_cost cycles = MAX (reduction_latency, 1);
  cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
  cycles = std::max (cycles, { loads + stores,
			       issue_info->loads_stores_per_cycle });
  cycles = std::max (cycles, { general_ops,
			       issue_info->general_ops_per_cycle });
  cycles = std::max (cycles, rename_cycles_per_iter ());
  return cycles;
}
/* Like min_cycles_per_iter, but including only the predicate operations.
   Non-SVE code has no predicate operations, so the cost is zero.  */
fractional_cost
aarch64_vec_op_count::min_pred_cycles_per_iter () const
{
  if (auto *issue_info = sve_issue_info ())
    return { pred_ops, issue_info->pred_ops_per_cycle };

  return 0;
}
16148 /* Estimate the minimum number of cycles needed to issue the operations.
16149 This is a very simplistic model! */
16151 aarch64_vec_op_count::min_cycles_per_iter () const
16153 return std::max (min_nonpred_cycles_per_iter (),
16154 min_pred_cycles_per_iter ());
/* Dump information about the structure to the vectorizer dump file.  */
void
aarch64_vec_op_count::dump () const
{
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  load operations = %d\n", loads);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  store operations = %d\n", stores);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  general operations = %d\n", general_ops);
  if (sve_issue_info ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  predicate operations = %d\n", pred_ops);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  reduction latency = %d\n", reduction_latency);
  /* Only report the derived cycle estimates when they are nonzero.  */
  if (auto rcpi = rename_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  estimated cycles per iteration to rename = %f\n",
		     rcpi.as_double ());
  if (auto pred_cpi = min_pred_cycles_per_iter ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated min cycles per iteration"
		       " without predication = %f\n",
		       min_nonpred_cycles_per_iter ().as_double ());
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated min cycles per iteration"
		       " for predication = %f\n", pred_cpi.as_double ());
    }
  if (auto cpi = min_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  estimated min cycles per iteration = %f\n",
		     cpi.as_double ());
}
/* Information about vector code that we're in the process of costing.  */
class aarch64_vector_costs : public vector_costs
{
public:
  aarch64_vector_costs (vec_info *, bool);

  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
			      stmt_vec_info stmt_info, slp_tree, tree vectype,
			      int misalign,
			      vect_cost_model_location where) override;
  void finish_cost (const vector_costs *) override;
  bool better_main_loop_than_p (const vector_costs *other) const override;

private:
  void record_potential_advsimd_unrolling (loop_vec_info);
  void analyze_loop_vinfo (loop_vec_info);
  void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
		  aarch64_vec_op_count *);
  fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
					fractional_cost, unsigned int,
					unsigned int *, bool *);
  unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
				 unsigned int);
  bool prefer_unrolled_loop () const;
  unsigned int determine_suggested_unroll_factor ();

  /* True if we have performed one-time initialization based on the
     vec_info.  */
  bool m_analyzed_vinfo = false;

  /* This loop uses an average operation that is not supported by SVE, but is
     supported by Advanced SIMD and SVE2.  */
  bool m_has_avg = false;

  /* True if the vector body contains a store to a decl and if the
     function is known to have a vld1 from the same decl.

     In the Advanced SIMD ACLE, the recommended endian-agnostic way of
     initializing a vector is:

       float f[4] = { elts };
       float32x4_t x = vld1q_f32(f);

     We should strongly prefer vectorization of the initialization of f,
     so that the store to f and the load back can be optimized away,
     leaving a vectorization of { elts }.  */
  bool m_stores_to_vector_load_decl = false;

  /* Non-zero if the last operation we costed is a vector promotion or demotion.
     In this case the value is the number of insns in the last operation.

     On AArch64 vector promotion and demotions require us to first widen or
     narrow the input and only after that emit conversion instructions.  For
     costing this means we need to emit the cost of the final conversions as
     well.  */
  unsigned int m_num_last_promote_demote = 0;

  /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
     - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
       SIMD code.
     - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
  unsigned int m_vec_flags = 0;

  /* At the moment, we do not model LDP and STP in the vector and scalar costs.
     This means that code such as:

	complex double d[100];
	d[0] = { re, im };

     will be costed as two scalar instructions and two vector instructions
     (a scalar_to_vec and an unaligned_store).  For SLP, the vector form
     wins if the costs are equal, because of the fact that the vector costs
     include constant initializations whereas the scalar costs don't.
     We would therefore tend to vectorize the code above, even though
     the scalar version can use a single STP.

     We should eventually fix this and model LDP and STP in the main costs;
     see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
     Until then, we look specifically for code that does nothing more than
     STP-like operations.  We cost them on that basis in addition to the
     normal latency-based costs.

     If the scalar or vector code could be a sequence of STPs +
     initialization, this variable counts the cost of the sequence,
     with 2 units per instruction.  The variable is ~0U for other
     kinds of code.  */
  unsigned int m_stp_sequence_cost = 0;

  /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
     throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE.  In those
     situations, we try to predict whether an Advanced SIMD implementation
     of the loop could be completely unrolled and become straight-line code.
     If so, it is generally better to use the Advanced SIMD version rather
     than length-agnostic SVE, since the SVE loop would execute an unknown
     number of times and so could not be completely unrolled in the same way.

     If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
     number of Advanced SIMD loop iterations that would be unrolled and
     M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
     in the unrolled loop.  Both values are zero if we're not applying
     the heuristic.  */
  unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
  unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;

  /* If we're vectorizing a loop that executes a constant number of times,
     this variable gives the number of times that the vector loop would
     iterate, otherwise it is zero.  */
  uint64_t m_num_vector_iterations = 0;

  /* Used only when vectorizing loops.  Estimates the number and kind of
     operations that would be needed by one iteration of the scalar
     or vector loop.  There is one entry for each tuning option of
     interest.  */
  auto_vec<aarch64_vec_op_count, 2> m_ops;
};
/* Construct the cost structure for VINFO.  COSTING_FOR_SCALAR is true
   when costing the scalar fallback rather than vector code.  Populate
   m_ops with one entry per issue model of interest.  */
aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
					    bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar),
    m_vec_flags (costing_for_scalar ? 0
		 : aarch64_classify_vector_mode (vinfo->vector_mode))
{
  if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
    {
      m_ops.quick_push ({ issue_info, m_vec_flags });
      /* neoverse512tvb also evaluates the Neoverse V1 model, with SVE
	 handling twice as much data per iteration (256-bit vectors).  */
      if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
	{
	  unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
	  m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
			      vf_factor });
	}
    }
}
/* Implement TARGET_VECTORIZE_CREATE_COSTS.  Allocate the aarch64-specific
   cost structure; the vectorizer takes ownership and deletes it.  */
vector_costs *
aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
{
  return new aarch64_vector_costs (vinfo, costing_for_scalar);
}
16333 /* Return true if the current CPU should use the new costs defined
16334 in GCC 11. This should be removed for GCC 12 and above, with the
16335 costs applying to all CPUs instead. */
16337 aarch64_use_new_vector_costs_p ()
16339 return (aarch64_tune_params
.extra_tuning_flags
16340 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
);
/* Return the appropriate SIMD costs for vectors of type VECTYPE.
   Prefer the SVE costs when VECTYPE is an SVE mode and SVE costs are
   provided; otherwise fall back to the Advanced SIMD costs.  */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if (vectype != NULL
      && aarch64_sve_mode_p (TYPE_MODE (vectype))
      && costs->sve != NULL)
    return costs->sve;
  return costs->advsimd;
}
/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS.
   As for aarch64_simd_vec_costs, but keyed off the mode classification
   flags instead of a vector type.  */
static const simd_vec_cost *
aarch64_simd_vec_costs_for_flags (unsigned int flags)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if ((flags & VEC_ANY_SVE) && costs->sve)
    return costs->sve;
  return costs->advsimd;
}
/* If STMT_INFO is a memory reference, return the scalar memory type,
   otherwise return null.  */
static tree
aarch64_dr_type (stmt_vec_info stmt_info)
{
  if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
    return TREE_TYPE (DR_REF (dr));
  return NULL_TREE;
}
/* Decide whether to use the unrolling heuristic described above
   m_unrolled_advsimd_niters, updating that field if so.  LOOP_VINFO
   describes the loop that we're vectorizing.  */
void
aarch64_vector_costs::
record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
{
  /* The heuristic only makes sense on targets that have the same
     vector throughput for SVE and Advanced SIMD.  */
  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
    return;

  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for SVE.  */
  if (!(m_vec_flags & VEC_ANY_SVE))
    return;

  /* Check whether it is possible in principle to use Advanced SIMD
     instead.  */
  if (aarch64_autovec_preference == 2)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of Advanced SIMD iterations
     would be known at compile time but the number of SVE iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || aarch64_sve_vg.is_constant ())
    return;

  /* Guess how many times the Advanced SIMD loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements too.  */
  unsigned int estimated_vq = aarch64_estimated_sve_vq ();
  unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
  unsigned HOST_WIDE_INT unrolled_advsimd_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
  if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the Advanced SIMD loop.  */
  m_unrolled_advsimd_niters = unrolled_advsimd_niters;
}
/* Do one-time initialization of the aarch64_vector_costs given that we're
   costing the loop vectorization described by LOOP_VINFO.  */
void
aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      /* An unmasked loop needs a full final iteration; a masked loop
	 rounds the iteration count up instead.  */
      if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
	m_num_vector_iterations = scalar_niters / vf;
      else
	m_num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're vectorizing for SVE and should apply the unrolling
     heuristic described above m_unrolled_advsimd_niters.  */
  record_potential_advsimd_unrolling (loop_vinfo);
}
/* Implement targetm.vectorize.builtin_vectorization_cost.  Return the
   cost of a statement of kind TYPE_OF_COST operating on VECTYPE (which
   may be null for scalar costs), using the tuning tables.  */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    tree vectype,
				    int misalign ATTRIBUTE_UNUSED)
{
  unsigned elements;
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  bool fp = false;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);

  switch (type_of_cost)
    {
    case scalar_stmt:
      return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

    case scalar_load:
      return costs->scalar_load_cost;

    case scalar_store:
      return costs->scalar_store_cost;

    case vector_stmt:
      return fp ? simd_costs->fp_stmt_cost
		: simd_costs->int_stmt_cost;

    case vector_load:
      return simd_costs->align_load_cost;

    case vector_store:
      return simd_costs->store_cost;

    case vec_to_scalar:
      return simd_costs->vec_to_scalar_cost;

    case scalar_to_vec:
      return simd_costs->scalar_to_vec_cost;

    case unaligned_load:
    case vector_gather_load:
      return simd_costs->unalign_load_cost;

    case unaligned_store:
    case vector_scatter_store:
      return simd_costs->unalign_store_cost;

    case cond_branch_taken:
      return costs->cond_taken_branch_cost;

    case cond_branch_not_taken:
      return costs->cond_not_taken_branch_cost;

    case vec_perm:
      return simd_costs->permute_cost;

    case vec_promote_demote:
      return fp ? simd_costs->fp_stmt_cost
		: simd_costs->int_stmt_cost;

    case vec_construct:
      /* Roughly one insn per pair of elements, plus one.  */
      elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
      return elements / 2 + 1;

    default:
      gcc_unreachable ();
    }
}
16521 /* Return true if an access of kind KIND for STMT_INFO represents one
16522 vector of an LD[234] or ST[234] operation. Return the total number of
16523 vectors (2, 3 or 4) if so, otherwise return a value outside that range. */
/* Only vector/unaligned loads and stores that have a data reference are
   candidates; the check is done on the first statement of the access group,
   and requires the group to use VMAT_LOAD_STORE_LANES.  */
16525 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind
, stmt_vec_info stmt_info
)
16527 if ((kind
== vector_load
16528 || kind
== unaligned_load
16529 || kind
== vector_store
16530 || kind
== unaligned_store
)
16531 && STMT_VINFO_DATA_REF (stmt_info
))
16533 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
16535 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info
) == VMAT_LOAD_STORE_LANES
)
16536 return DR_GROUP_SIZE (stmt_info
);
16541 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16542 vectors would produce a series of LDP or STP operations. KIND is the
16543 kind of statement that STMT_INFO represents. */
/* NOTE(review): the switch header and intervening case lines are missing
   from this extract; the visible cases cover unaligned accesses, and the
   final test requires the statement to be a plain gimple assignment.  */
16545 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind
,
16546 stmt_vec_info stmt_info
)
16552 case unaligned_load
:
16553 case unaligned_store
:
16560 return is_gimple_assign (stmt_info
->stmt
);
16563 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16564 or multiply-subtract sequence that might be suitable for fusing into a
16565 single instruction. If VEC_FLAGS is zero, analyze the operation as
16566 a scalar one, otherwise analyze it as an operation on vectors with those
16569 aarch64_multiply_add_p (vec_info
*vinfo
, stmt_vec_info stmt_info
,
16570 unsigned int vec_flags
)
/* The candidate statement must be a PLUS or MINUS gimple assignment;
   is_mul_result (below) then checks whether operand I is defined by an
   in-loop multiplication.  */
16572 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
16575 tree_code code
= gimple_assign_rhs_code (assign
);
16576 if (code
!= PLUS_EXPR
&& code
!= MINUS_EXPR
)
16579 auto is_mul_result
= [&](int i
)
16581 tree rhs
= gimple_op (assign
, i
);
16582 /* ??? Should we try to check for a single use as well? */
16583 if (TREE_CODE (rhs
) != SSA_NAME
)
16586 stmt_vec_info def_stmt_info
= vinfo
->lookup_def (rhs
);
16588 || STMT_VINFO_DEF_TYPE (def_stmt_info
) != vect_internal_def
)
16590 gassign
*rhs_assign
= dyn_cast
<gassign
*> (def_stmt_info
->stmt
);
16591 if (!rhs_assign
|| gimple_assign_rhs_code (rhs_assign
) != MULT_EXPR
)
16594 if (vec_flags
& VEC_ADVSIMD
)
16596 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16597 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16598 only supports MLA forms, so will require a move if the result
16599 cannot be tied to the accumulator. The most important case in
16600 which this is true is when the accumulator input is invariant. */
16601 rhs
= gimple_op (assign
, 3 - i
);
16602 if (TREE_CODE (rhs
) != SSA_NAME
)
16604 def_stmt_info
= vinfo
->lookup_def (rhs
);
16606 || STMT_VINFO_DEF_TYPE (def_stmt_info
) == vect_external_def
16607 || STMT_VINFO_DEF_TYPE (def_stmt_info
) == vect_constant_def
)
16614 if (code
== MINUS_EXPR
&& (vec_flags
& VEC_ADVSIMD
))
16615 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16616 multiplication must be on the second operand (to form an FMLS).
16617 But if both operands are multiplications and the second operand
16618 is used more than once, we'll instead negate the second operand
16619 and use it as an accumulator for the first operand. */
16620 return (is_mul_result (2)
16621 && (has_single_use (gimple_assign_rhs2 (assign
))
16622 || !is_mul_result (1)));
/* PLUS (or MINUS outside Advanced SIMD): either operand may supply the
   multiplication.  */
16624 return is_mul_result (1) || is_mul_result (2);
16627 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16628 expression sequence that might be suitable for fusing into a
16629 single instruction. If VEC_FLAGS is zero, analyze the operation as
16630 a scalar one, otherwise analyze it as an operation on vectors with those
16634 aarch64_bool_compound_p (vec_info
*vinfo
, stmt_vec_info stmt_info
,
16635 unsigned int vec_flags
)
/* The statement must be a BIT_AND on a vector-boolean type; each operand is
   then checked for being defined by an in-loop comparison.  */
16637 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
16639 || gimple_assign_rhs_code (assign
) != BIT_AND_EXPR
16640 || !STMT_VINFO_VECTYPE (stmt_info
)
16641 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info
)))
/* Inspect both operands (gimple ops 1 and 2) of the AND.  */
16644 for (int i
= 1; i
< 3; ++i
)
16646 tree rhs
= gimple_op (assign
, i
)
;
16648 if (TREE_CODE (rhs
) != SSA_NAME
)
16651 stmt_vec_info def_stmt_info
= vinfo
->lookup_def (rhs
)
;
16653 || STMT_VINFO_DEF_TYPE (def_stmt_info
) != vect_internal_def
)
16656 gassign
*rhs_assign
= dyn_cast
<gassign
*> (def_stmt_info
->stmt
);
16658 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign
))
16662 if (vec_flags
& VEC_ADVSIMD
)
16670 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16671 in-loop reduction that SVE supports directly, return its latency in cycles,
16672 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16674 static unsigned int
16675 aarch64_sve_in_loop_reduction_latency (vec_info
*vinfo
,
16676 stmt_vec_info stmt_info
,
16677 const sve_vec_cost
*sve_costs
)
/* EXTRACT_LAST reductions map to CLASTB; FOLD_LEFT reductions map to FADDA,
   whose latency depends on the element mode of the reduction result.  */
16679 switch (vect_reduc_type (vinfo
, stmt_info
))
16681 case EXTRACT_LAST_REDUCTION
:
16682 return sve_costs
->clast_cost
;
16684 case FOLD_LEFT_REDUCTION
:
16685 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info
->stmt
))))
16689 return sve_costs
->fadda_f16_cost
;
16692 return sve_costs
->fadda_f32_cost
;
16695 return sve_costs
->fadda_f64_cost
;
16706 /* STMT_INFO describes a loop-carried operation in the original scalar code
16707 that we are considering implementing as a reduction. Return one of the
16708 following values, depending on VEC_FLAGS:
16710 - If VEC_FLAGS is zero, return the loop carry latency of the original
16713 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16714 Advanced SIMD implementation.
16716 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16717 SVE implementation. */
16718 static unsigned int
16719 aarch64_in_loop_reduction_latency (vec_info
*vinfo
, stmt_vec_info stmt_info
,
16720 unsigned int vec_flags
)
16722 const cpu_vector_cost
*vec_costs
= aarch64_tune_params
.vec_costs
;
/* Only look up the SVE cost table when an SVE latency is requested.  */
16723 const sve_vec_cost
*sve_costs
= nullptr;
16724 if (vec_flags
& VEC_ANY_SVE
)
16725 sve_costs
= aarch64_tune_params
.vec_costs
->sve
;
16727 /* If the caller is asking for the SVE latency, check for forms of reduction
16728 that only SVE can handle directly. */
16731 unsigned int latency
16732 = aarch64_sve_in_loop_reduction_latency (vinfo
, stmt_info
, sve_costs
);
16737 /* Handle scalar costs. */
16738 bool is_float
= FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info
->stmt
)));
16739 if (vec_flags
== 0)
16742 return vec_costs
->scalar_fp_stmt_cost
;
16743 return vec_costs
->scalar_int_stmt_cost
;
16746 /* Otherwise, the loop body just contains normal integer or FP operations,
16747 with a vector reduction outside the loop. */
16748 const simd_vec_cost
*simd_costs
16749 = aarch64_simd_vec_costs_for_flags (vec_flags
);
16751 return simd_costs
->fp_stmt_cost
;
16752 return simd_costs
->int_stmt_cost
;
16755 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16756 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16757 try to subdivide the target-independent categorization provided by KIND
16758 to get a more accurate cost. */
16759 static fractional_cost
16760 aarch64_detect_scalar_stmt_subtype (vec_info
*vinfo
, vect_cost_for_stmt kind
,
16761 stmt_vec_info stmt_info
,
16762 fractional_cost stmt_cost
)
16764 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16765 the extension with the load. */
/* NOTE(review): the taken-branch return value for this case is not visible
   in this extract; only the detection condition survives.  */
16766 if (kind
== scalar_stmt
&& vect_is_extending_load (vinfo
, stmt_info
))
16772 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16773 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16774 when vectorized would operate on vector type VECTYPE. Try to subdivide
16775 the target-independent categorization provided by KIND to get a more
16776 accurate cost. WHERE specifies where the cost associated with KIND
16778 static fractional_cost
16779 aarch64_detect_vector_stmt_subtype (vec_info
*vinfo
, vect_cost_for_stmt kind
,
16780 stmt_vec_info stmt_info
, tree vectype
,
16781 enum vect_cost_model_location where
,
16782 fractional_cost stmt_cost
)
16784 const simd_vec_cost
*simd_costs
= aarch64_simd_vec_costs (vectype
);
/* The SVE table is only consulted when VECTYPE uses an SVE mode.  */
16785 const sve_vec_cost
*sve_costs
= nullptr;
16786 if (aarch64_sve_mode_p (TYPE_MODE (vectype
)))
16787 sve_costs
= aarch64_tune_params
.vec_costs
->sve
;
16789 /* It's generally better to avoid costing inductions, since the induction
16790 will usually be hidden by other operations. This is particularly true
16791 for things like COND_REDUCTIONS. */
16792 if (is_a
<gphi
*> (stmt_info
->stmt
))
16795 /* Detect cases in which vec_to_scalar is describing the extraction of a
16796 vector element in preparation for a scalar store. The store itself is
16797 costed separately. */
16798 if (vect_is_store_elt_extraction (kind
, stmt_info
))
16799 return simd_costs
->store_elt_extra_cost
;
16801 /* Detect SVE gather loads, which are costed as a single scalar_load
16802 for each element. We therefore need to divide the full-instruction
16803 cost by the number of elements in the vector. */
16804 if (kind
== scalar_load
16806 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info
) == VMAT_GATHER_SCATTER
)
16808 unsigned int nunits
= vect_nunits_for_cost (vectype
);
/* 64-bit elements use the x64 gather cost; everything else the x32 one.  */
16809 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype
)) == 64)
16810 return { sve_costs
->gather_load_x64_cost
, nunits
};
16811 return { sve_costs
->gather_load_x32_cost
, nunits
};
16814 /* Detect cases in which a scalar_store is really storing one element
16815 in a scatter operation. */
16816 if (kind
== scalar_store
16818 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info
) == VMAT_GATHER_SCATTER
)
16819 return sve_costs
->scatter_store_elt_cost
;
16821 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16822 if (kind
== vec_to_scalar
16823 && where
== vect_body
16826 unsigned int latency
16827 = aarch64_sve_in_loop_reduction_latency (vinfo
, stmt_info
, sve_costs
);
16832 /* Detect cases in which vec_to_scalar represents a single reduction
16833 instruction like FADDP or MAXV. */
16834 if (kind
== vec_to_scalar
16835 && where
== vect_epilogue
16836 && vect_is_reduction (stmt_info
))
/* Pick the reduction cost matching the element mode of VECTYPE.  */
16837 switch (GET_MODE_INNER (TYPE_MODE (vectype
)))
16840 return simd_costs
->reduc_i8_cost
;
16843 return simd_costs
->reduc_i16_cost
;
16846 return simd_costs
->reduc_i32_cost
;
16849 return simd_costs
->reduc_i64_cost
;
16853 return simd_costs
->reduc_f16_cost
;
16856 return simd_costs
->reduc_f32_cost
;
16859 return simd_costs
->reduc_f64_cost
;
16865 /* Otherwise stick with the original categorization. */
16869 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16870 for STMT_INFO, which has cost kind KIND and which when vectorized would
16871 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16873 static fractional_cost
16874 aarch64_sve_adjust_stmt_cost (class vec_info
*vinfo
, vect_cost_for_stmt kind
,
16875 stmt_vec_info stmt_info
, tree vectype
,
16876 fractional_cost stmt_cost
)
16878 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16879 vector register size or number of units. Integer promotions of this
16880 type therefore map to SXT[BHW] or UXT[BHW].
16882 Most loads have extending forms that can do the sign or zero extension
16883 on the fly. Optimistically assume that a load followed by an extension
16884 will fold to this form during combine, and that the extension therefore
16886 if (kind
== vector_stmt
&& vect_is_extending_load (vinfo
, stmt_info
))
16889 /* For similar reasons, vector_stmt integer truncations are a no-op,
16890 because we can just ignore the unused upper bits of the source. */
16891 if (kind
== vector_stmt
&& vect_is_integer_truncation (stmt_info
))
16894 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16895 but there are no equivalent instructions for SVE. This means that
16896 (all other things being equal) 128-bit SVE needs twice as many load
16897 and store instructions as Advanced SIMD in order to process vector pairs.
16899 Also, scalar code can often use LDP and STP to access pairs of values,
16900 so it is too simplistic to say that one SVE load or store replaces
16901 VF scalar loads and stores.
16903 Ideally we would account for this in the scalar and Advanced SIMD
16904 costs by making suitable load/store pairs as cheap as a single
16905 load/store. However, that would be a very invasive change and in
16906 practice it tends to stress other parts of the cost model too much.
16907 E.g. stores of scalar constants currently count just a store,
16908 whereas stores of vector constants count a store and a vec_init.
16909 This is an artificial distinction for AArch64, where stores of
16910 nonzero scalar constants need the same kind of register invariant
16913 An alternative would be to double the cost of any SVE loads and stores
16914 that could be paired in Advanced SIMD (and possibly also paired in
16915 scalar code). But this tends to stress other parts of the cost model
16916 in the same way. It also means that we can fall back to Advanced SIMD
16917 even if full-loop predication would have been useful.
16919 Here we go for a more conservative version: double the costs of SVE
16920 loads and stores if one iteration of the scalar loop processes enough
16921 elements for it to use a whole number of Advanced SIMD LDP or STP
16922 instructions. This makes it very likely that the VF would be 1 for
16923 Advanced SIMD, and so no epilogue should be needed. */
16924 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
16926 stmt_vec_info first
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
/* COUNT is the number of elements the group actually accesses; 256 bits is
   the width of one Advanced SIMD LDP/STP pair.  */
16927 unsigned int count
= DR_GROUP_SIZE (first
) - DR_GROUP_GAP (first
);
16928 unsigned int elt_bits
= GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype
));
16929 if (multiple_p (count
* elt_bits
, 256)
16930 && aarch64_advsimd_ldp_stp_p (kind
, stmt_info
))
16937 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16938 and which when vectorized would operate on vector type VECTYPE. Add the
16939 cost of any embedded operations. */
16940 static fractional_cost
16941 aarch64_adjust_stmt_cost (vec_info
*vinfo
, vect_cost_for_stmt kind
,
16942 stmt_vec_info stmt_info
, tree vectype
,
16943 unsigned vec_flags
, fractional_cost stmt_cost
)
16947 const simd_vec_cost
*simd_costs
= aarch64_simd_vec_costs (vectype
);
16949 /* Detect cases in which a vector load or store represents an
16950 LD[234] or ST[234] instruction. */
16951 switch (aarch64_ld234_st234_vectors (kind
, stmt_info
))
16954 stmt_cost
+= simd_costs
->ld2_st2_permute_cost
;
16958 stmt_cost
+= simd_costs
->ld3_st3_permute_cost
;
16962 stmt_cost
+= simd_costs
->ld4_st4_permute_cost
;
16966 gassign
*assign
= dyn_cast
<gassign
*> (STMT_VINFO_STMT (stmt_info
));
16967 if ((kind
== scalar_stmt
|| kind
== vector_stmt
) && assign
)
16969 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
16970 if (!vect_is_reduction (stmt_info
)
16971 && aarch64_multiply_add_p (vinfo
, stmt_info
, vec_flags
))
16974 /* For vector boolean ANDs with a compare operand we just need
16976 if (aarch64_bool_compound_p (vinfo
, stmt_info
, vec_flags
))
/* Vector statements: add the cost of an embedded comparison, using the
   FP or integer SIMD statement cost as appropriate.  */
16980 if (kind
== vector_stmt
|| kind
== vec_to_scalar
)
16981 if (tree cmp_type
= vect_embedded_comparison_type (stmt_info
))
16983 if (FLOAT_TYPE_P (cmp_type
))
16984 stmt_cost
+= simd_costs
->fp_stmt_cost
;
16986 stmt_cost
+= simd_costs
->int_stmt_cost
;
/* Scalar statements: same adjustment, but using the scalar cost tables.  */
16990 if (kind
== scalar_stmt
)
16991 if (tree cmp_type
= vect_embedded_comparison_type (stmt_info
))
16993 if (FLOAT_TYPE_P (cmp_type
))
16994 stmt_cost
+= aarch64_tune_params
.vec_costs
->scalar_fp_stmt_cost
;
16996 stmt_cost
+= aarch64_tune_params
.vec_costs
->scalar_int_stmt_cost
;
17002 /* Return true if STMT_INFO is part of a reduction that has the form:
17007 with the single accumulator being read and written multiple times. */
/* A statement with no reduction definition cannot be such a reduction;
   otherwise defer to the vectorizer's single-cycle flag on the reduction.  */
17009 aarch64_force_single_cycle (vec_info
*vinfo
, stmt_vec_info stmt_info
)
17011 if (!STMT_VINFO_REDUC_DEF (stmt_info
))
17014 auto reduc_info
= info_for_reduction (vinfo
, stmt_info
);
17015 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
17018 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
17019 and they describe an operation in the body of a vector loop. Record issue
17020 information relating to the vector operation in OPS. */
17022 aarch64_vector_costs::count_ops (unsigned int count
, vect_cost_for_stmt kind
,
17023 stmt_vec_info stmt_info
,
17024 aarch64_vec_op_count
*ops
)
/* Fetch the issue-rate tables for the implementation OPS describes.  */
17026 const aarch64_base_vec_issue_info
*base_issue
= ops
->base_issue_info ();
17029 const aarch64_simd_vec_issue_info
*simd_issue
= ops
->simd_issue_info ();
17030 const aarch64_sve_vec_issue_info
*sve_issue
= ops
->sve_issue_info ();
17032 /* Calculate the minimum cycles per iteration imposed by a reduction
17034 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
17035 && vect_is_reduction (stmt_info
))
17038 = aarch64_in_loop_reduction_latency (m_vinfo
, stmt_info
, m_vec_flags
)
;
17039 if (aarch64_force_single_cycle (m_vinfo
, stmt_info
))
17040 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17041 and then accumulate that, but at the moment the loop-carried
17042 dependency includes all copies. */
17043 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
* count
);
17045 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
);
/* Fusable multiply-adds and bool-AND-of-compares count as one operation.  */
17048 if (stmt_info
&& (kind
== scalar_stmt
|| kind
== vector_stmt
))
17050 /* Assume that multiply-adds will become a single operation. */
17051 if (aarch64_multiply_add_p (m_vinfo
, stmt_info
, m_vec_flags
))
17054 /* Assume that bool AND with compare operands will become a single
17056 if (aarch64_bool_compound_p (m_vinfo
, stmt_info
, m_vec_flags
))
17061 /* Count the basic operation cost associated with KIND. */
17064 case cond_branch_taken
:
17065 case cond_branch_not_taken
:
17066 case vector_gather_load
:
17067 case vector_scatter_store
:
17068 /* We currently don't expect these to be used in a loop body. */
17072 case vec_promote_demote
:
17073 case vec_construct
:
17074 case vec_to_scalar
:
17075 case scalar_to_vec
:
17078 ops
->general_ops
+= count
;
17083 case unaligned_load
:
17084 ops
->loads
+= count
;
/* FP/SIMD loads may also consume general-op issue slots.  */
17085 if (m_vec_flags
|| FLOAT_TYPE_P (aarch64_dr_type (stmt_info
)))
17086 ops
->general_ops
+= base_issue
->fp_simd_load_general_ops
* count
;
17090 case unaligned_store
:
17092 ops
->stores
+= count
;
17093 if (m_vec_flags
|| FLOAT_TYPE_P (aarch64_dr_type (stmt_info
)))
17094 ops
->general_ops
+= base_issue
->fp_simd_store_general_ops
* count
;
17098 /* Add any embedded comparison operations. */
17099 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
17100 && vect_embedded_comparison_type (stmt_info
))
17101 ops
->general_ops
+= count
;
17103 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17104 have only accounted for one. */
17105 if ((kind
== vector_stmt
|| kind
== vec_to_scalar
)
17106 && vect_reduc_type (m_vinfo
, stmt_info
) == COND_REDUCTION
)
17107 ops
->general_ops
+= count
;
17109 /* Count the predicate operations needed by an SVE comparison. */
17110 if (sve_issue
&& (kind
== vector_stmt
|| kind
== vec_to_scalar
))
17111 if (tree type
= vect_comparison_type (stmt_info
))
17113 unsigned int base
= (FLOAT_TYPE_P (type
)
17114 ? sve_issue
->fp_cmp_pred_ops
17115 : sve_issue
->int_cmp_pred_ops
);
17116 ops
->pred_ops
+= base
* count
;
17119 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17121 switch (aarch64_ld234_st234_vectors (kind
, stmt_info
))
17124 ops
->general_ops
+= simd_issue
->ld2_st2_general_ops
* count
;
17128 ops
->general_ops
+= simd_issue
->ld3_st3_general_ops
* count
;
17132 ops
->general_ops
+= simd_issue
->ld4_st4_general_ops
* count
;
17136 /* Add any overhead associated with gather loads and scatter stores. */
17138 && (kind
== scalar_load
|| kind
== scalar_store
)
17139 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info
) == VMAT_GATHER_SCATTER
)
/* Gather/scatter costs are charged per pair of elements.  */
17141 unsigned int pairs
= CEIL (count
, 2);
17142 ops
->pred_ops
+= sve_issue
->gather_scatter_pair_pred_ops
* pairs
;
17143 ops
->general_ops
+= sve_issue
->gather_scatter_pair_general_ops
* pairs
;
17147 /* Return true if STMT_INFO contains a memory access and if the constant
17148 component of the memory address is aligned to SIZE bytes. */
17150 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info
,
/* A statement without a data reference has no address to inspect.  */
17153 if (!STMT_VINFO_DATA_REF (stmt_info
))
/* Use the first statement of a grouped access, since its DR_INIT covers
   the whole group.  */
17156 if (auto first_stmt
= DR_GROUP_FIRST_ELEMENT (stmt_info
))
17157 stmt_info
= first_stmt
;
17158 tree constant_offset
= DR_INIT (STMT_VINFO_DATA_REF (stmt_info
));
17159 /* Needed for gathers & scatters, for example. */
17160 if (!constant_offset
)
17163 return multiple_p (wi::to_poly_offset (constant_offset
), size
);
17166 /* Check if a scalar or vector stmt could be part of a region of code
17167 that does nothing more than store values to memory, in the scalar
17168 case using STP. Return the cost of the stmt if so, counting 2 for
17169 one instruction. Return ~0U otherwise.
17171 The arguments are a subset of those passed to add_stmt_cost. */
17173 aarch64_stp_sequence_cost (unsigned int count
, vect_cost_for_stmt kind
,
17174 stmt_vec_info stmt_info
, tree vectype
)
17176 /* Code that stores vector constants uses a vector_load to create
17177 the constant. We don't apply the heuristic to that case for two
17180 - At the moment, STPs are only formed via peephole2, and the
17181 constant scalar moves would often come between STRs and so
17182 prevent STP formation.
17184 - The scalar code also has to load the constant somehow, and that
17188 case scalar_to_vec
:
17189 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17190 return (FLOAT_TYPE_P (vectype
) ? 2 : 4) * count
;
17192 case vec_construct
:
17193 if (FLOAT_TYPE_P (vectype
))
17194 /* Count 1 insn for the maximum number of FP->SIMD INS
17196 return (vect_nunits_for_cost (vectype
) - 1) * 2 * count
;
17198 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17199 maximum number of GPR->SIMD INS instructions. */
17200 return vect_nunits_for_cost (vectype
) * 4 * count
;
17203 case unaligned_store
:
17204 /* Count 1 insn per vector if we can't form STP Q pairs. */
17205 if (aarch64_sve_mode_p (TYPE_MODE (vectype
)))
17210 /* Assume we won't be able to use STP if the constant offset
17211 component of the address is misaligned. ??? This could be
17212 removed if we formed STP pairs earlier, rather than relying
17214 auto size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
17215 if (!aarch64_aligned_constant_offset_p (stmt_info
, size
))
/* Two stores pair into one STP; cost 2 per instruction.  */
17218 return CEIL (count
, 2) * 2;
17221 if (stmt_info
&& STMT_VINFO_DATA_REF (stmt_info
))
17223 /* Check for a mode in which STP pairs can be formed. */
17224 auto size
= GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info
)));
17225 if (maybe_ne (size
, 4) && maybe_ne (size
, 8))
17228 /* Assume we won't be able to use STP if the constant offset
17229 component of the address is misaligned. ??? This could be
17230 removed if we formed STP pairs earlier, rather than relying
17232 if (!aarch64_aligned_constant_offset_p (stmt_info
, size
))
/* Implementation of vector_costs::add_stmt_cost for AArch64: compute the
   base cost for KIND/VECTYPE, refine it using STMT_INFO, feed the issue-rate
   model, and record the result.  NOTE(review): several structural lines are
   missing from this extract; the visible logic is kept verbatim.  */
17243 aarch64_vector_costs::add_stmt_cost (int count
, vect_cost_for_stmt kind
,
17244 stmt_vec_info stmt_info
, slp_tree
,
17245 tree vectype
, int misalign
,
17246 vect_cost_model_location where
)
/* Start from the generic per-kind cost.  */
17248 fractional_cost stmt_cost
17249 = aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
17251 bool in_inner_loop_p
= (where
== vect_body
17253 && stmt_in_inner_loop_p (m_vinfo
, stmt_info
));
17255 /* Do one-time initialization based on the vinfo. */
17256 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
17257 if (!m_analyzed_vinfo
&& aarch64_use_new_vector_costs_p ())
17260 analyze_loop_vinfo (loop_vinfo
);
17262 m_analyzed_vinfo
= true;
17265 /* Apply the heuristic described above m_stp_sequence_cost. */
17266 if (m_stp_sequence_cost
!= ~0U)
17268 uint64_t cost
= aarch64_stp_sequence_cost (count
, kind
,
17269 stmt_info
, vectype
);
17270 m_stp_sequence_cost
= MIN (m_stp_sequence_cost
+ cost
, ~0U);
17273 /* Try to get a more accurate cost by looking at STMT_INFO instead
17274 of just looking at KIND. */
17275 if (stmt_info
&& aarch64_use_new_vector_costs_p ())
17277 /* If we scalarize a strided store, the vectorizer costs one
17278 vec_to_scalar for each element. However, we can store the first
17279 element using an FP store without a separate extract step. */
17280 if (vect_is_store_elt_extraction (kind
, stmt_info
))
17283 stmt_cost
= aarch64_detect_scalar_stmt_subtype (m_vinfo
, kind
,
17284 stmt_info
, stmt_cost
);
17286 if (vectype
&& m_vec_flags
)
17287 stmt_cost
= aarch64_detect_vector_stmt_subtype (m_vinfo
, kind
,
17288 stmt_info
, vectype
,
17292 /* Do any SVE-specific adjustments to the cost. */
17293 if (stmt_info
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype
)))
17294 stmt_cost
= aarch64_sve_adjust_stmt_cost (m_vinfo
, kind
, stmt_info
,
17295 vectype
, stmt_cost
);
17297 /* Vector promotion and demotion requires us to widen the operation first
17298 and only after that perform the conversion. Unfortunately the mid-end
17299 expects this to be doable as a single operation and doesn't pass on
17300 enough context here for us to tell which operation is happening. To
17301 account for this we count every promote-demote operation twice and if
17302 the previously costed operation was also a promote-demote we reduce
17303 the cost of the currently being costed operation to simulate the final
17304 conversion cost. Note that for SVE we can do better here if the converted
17305 value comes from a load since the widening load would consume the widening
17306 operations. However since we're in stage 3 we can't change the helper
17307 vect_is_extending_load and duplicating the code seems not useful. */
17308 gassign
*assign
= NULL
;
17309 if (kind
== vec_promote_demote
17310 && (assign
= dyn_cast
<gassign
*> (STMT_VINFO_STMT (stmt_info
)))
17311 && gimple_assign_rhs_code (assign
) == FLOAT_EXPR
)
17313 auto new_count
= count
* 2 - m_num_last_promote_demote
;
17314 m_num_last_promote_demote
= count
;
17318 m_num_last_promote_demote
= 0;
17320 if (stmt_info
&& aarch64_use_new_vector_costs_p ())
17322 /* Account for any extra "embedded" costs that apply additively
17323 to the base cost calculated above. */
17324 stmt_cost
= aarch64_adjust_stmt_cost (m_vinfo
, kind
, stmt_info
,
17325 vectype
, m_vec_flags
, stmt_cost
);
17327 /* If we're recording a nonzero vector loop body cost for the
17328 innermost loop, also estimate the operations that would need
17329 to be issued by all relevant implementations of the loop. */
17331 && (m_costing_for_scalar
|| where
== vect_body
)
17332 && (!LOOP_VINFO_LOOP (loop_vinfo
)->inner
|| in_inner_loop_p
)
17334 for (auto &ops
: m_ops
)
17335 count_ops (count
, kind
, stmt_info
, &ops
);
17337 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17338 estimate the number of statements in the unrolled Advanced SIMD
17339 loop. For simplicitly, we assume that one iteration of the
17340 Advanced SIMD loop would need the same number of statements
17341 as one iteration of the SVE loop. */
17342 if (where
== vect_body
&& m_unrolled_advsimd_niters
)
17343 m_unrolled_advsimd_stmts
+= count
* m_unrolled_advsimd_niters
;
17345 /* Detect the use of an averaging operation. */
17346 gimple
*stmt
= stmt_info
->stmt
;
17347 if (is_gimple_call (stmt
)
17348 && gimple_call_internal_p (stmt
))
17350 switch (gimple_call_internal_fn (stmt
))
17352 case IFN_AVG_FLOOR
:
17361 /* If the statement stores to a decl that is known to be the argument
17362 to a vld1 in the same function, ignore the store for costing purposes.
17363 See the comment above m_stores_to_vector_load_decl for more details. */
17365 && (kind
== vector_store
|| kind
== unaligned_store
)
17366 && aarch64_accesses_vector_load_decl_p (stmt_info
))
17369 m_stores_to_vector_load_decl
= true;
/* Record COUNT copies of the (fractional) per-statement cost, rounded up.  */
17372 return record_stmt_cost (stmt_info
, where
, (count
* stmt_cost
).ceil ());
17375 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17376 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17377 says that we should prefer the Advanced SIMD loop. */
17379 aarch64_vector_costs::prefer_unrolled_loop () const
/* No estimate recorded means the heuristic is not being applied.  */
17381 if (!m_unrolled_advsimd_stmts
)
17384 if (dump_enabled_p ())
17385 dump_printf_loc (MSG_NOTE
, vect_location
, "Number of insns in"
17386 " unrolled Advanced SIMD loop = "
17387 HOST_WIDE_INT_PRINT_UNSIGNED
"\n",
17388 m_unrolled_advsimd_stmts
);
17390 /* The balance here is tricky. On the one hand, we can't be sure whether
17391 the code is vectorizable with Advanced SIMD or not. However, even if
17392 it isn't vectorizable with Advanced SIMD, there's a possibility that
17393 the scalar code could also be unrolled. Some of the code might then
17394 benefit from SLP, or from using LDP and STP. We therefore apply
17395 the heuristic regardless of can_use_advsimd_p. */
/* Prefer unrolling only while the estimate stays within the generic
   complete-peeling insn budget.  */
17396 return (m_unrolled_advsimd_stmts
17397 && (m_unrolled_advsimd_stmts
17398 <= (unsigned int) param_max_completely_peeled_insns
));
17401 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
17402 how fast the SVE code can be issued and compare it to the equivalent value
17403 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
17404 also compare it to the issue rate of Advanced SIMD code
17405 (ADVSIMD_CYCLES_PER_ITER).
17407 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17408 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17409 is true if we think the loop body is too expensive. */
17412 aarch64_vector_costs::
17413 adjust_body_cost_sve (const aarch64_vec_op_count
*ops
,
17414 fractional_cost scalar_cycles_per_iter
,
17415 unsigned int orig_body_cost
, unsigned int *body_cost
,
17416 bool *should_disparage
)
17418 if (dump_enabled_p ())
/* Issue-rate estimates for the SVE loop, from the op counts in OPS.  */
17421 fractional_cost sve_pred_cycles_per_iter
= ops
->min_pred_cycles_per_iter ();
17422 fractional_cost sve_cycles_per_iter
= ops
->min_cycles_per_iter ();
17424 /* If the scalar version of the loop could issue at least as
17425 quickly as the predicate parts of the SVE loop, make the SVE loop
17426 prohibitively expensive. In this case vectorization is adding an
17427 overhead that the original scalar code didn't have.
17429 This is mostly intended to detect cases in which WHILELOs dominate
17430 for very tight loops, which is something that normal latency-based
17431 costs would not model. Adding this kind of cliffedge would be
17432 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17433 code in the caller handles that case in a more conservative way. */
17434 fractional_cost sve_estimate
= sve_pred_cycles_per_iter
+ 1;
17435 if (scalar_cycles_per_iter
< sve_estimate
)
17437 unsigned int min_cost
17438 = orig_body_cost
* estimated_poly_value (BYTES_PER_SVE_VECTOR
);
17439 if (*body_cost
< min_cost
)
17441 if (dump_enabled_p ())
17442 dump_printf_loc (MSG_NOTE
, vect_location
,
17443 "Increasing body cost to %d because the"
17444 " scalar code could issue within the limit"
17445 " imposed by predicate operations\n",
17447 *body_cost
= min_cost
;
17448 *should_disparage
= true;
17452 return sve_cycles_per_iter
;
/* Suggest an unroll factor for the vector loop: for each candidate
   implementation in m_ops, bound the factor by how many copies of the body
   the issue rates (stores, loads+stores, general ops) can absorb per
   reduction-latency cycle, then round the best bound up to a power of 2.  */
17456 aarch64_vector_costs::determine_suggested_unroll_factor ()
17458 bool sve
= m_vec_flags
& VEC_ANY_SVE
;
17459 /* If we are trying to unroll an Advanced SIMD main loop that contains
17460 an averaging operation that we do not support with SVE and we might use a
17461 predicated epilogue, we need to be conservative and block unrolling as
17462 this might lead to a less optimal loop for the first and only epilogue
17463 using the original loop's vectorization factor.
17464 TODO: Remove this constraint when we add support for multiple epilogue
17466 if (!sve
&& !TARGET_SVE2
&& m_has_avg
)
17469 unsigned int max_unroll_factor
= 1;
17470 for (auto vec_ops
: m_ops
)
17472 aarch64_simd_vec_issue_info
const *vec_issue
17473 = vec_ops
.simd_issue_info ();
17476 /* Limit unroll factor to a value adjustable by the user, the default
17478 unsigned int unroll_factor
= aarch64_vect_unroll_limit
;
/* FACTOR is the reduction latency, clamped to at least 1.  */
17479 unsigned int factor
17480 = vec_ops
.reduction_latency
> 1 ? vec_ops
.reduction_latency
: 1;
17483 /* Sanity check, this should never happen. */
17484 if ((vec_ops
.stores
+ vec_ops
.loads
+ vec_ops
.general_ops
) == 0)
17487 /* Check stores. */
17488 if (vec_ops
.stores
> 0)
17490 temp
= CEIL (factor
* vec_issue
->stores_per_cycle
,
17492 unroll_factor
= MIN (unroll_factor
, temp
);
17495 /* Check loads + stores. */
17496 if (vec_ops
.loads
> 0)
17498 temp
= CEIL (factor
* vec_issue
->loads_stores_per_cycle
,
17499 vec_ops
.loads
+ vec_ops
.stores
);
17500 unroll_factor
= MIN (unroll_factor
, temp
);
17503 /* Check general ops. */
17504 if (vec_ops
.general_ops
> 0)
17506 temp
= CEIL (factor
* vec_issue
->general_ops_per_cycle
,
17507 vec_ops
.general_ops
);
17508 unroll_factor
= MIN (unroll_factor
, temp
);
17510 max_unroll_factor
= MAX (max_unroll_factor
, unroll_factor
);
17513 /* Make sure unroll factor is power of 2. */
17514 return 1 << ceil_log2 (max_unroll_factor
);
17517 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17518 and return the new cost. */
17520 aarch64_vector_costs::
17521 adjust_body_cost (loop_vec_info loop_vinfo
,
17522 const aarch64_vector_costs
*scalar_costs
,
17523 unsigned int body_cost
)
17525 if (scalar_costs
->m_ops
.is_empty () || m_ops
.is_empty ())
17528 const auto &scalar_ops
= scalar_costs
->m_ops
[0];
17529 const auto &vector_ops
= m_ops
[0];
17530 unsigned int estimated_vf
= vect_vf_for_cost (loop_vinfo
);
17531 unsigned int orig_body_cost
= body_cost
;
17532 bool should_disparage
= false;
17534 if (dump_enabled_p ())
17535 dump_printf_loc (MSG_NOTE
, vect_location
,
17536 "Original vector body cost = %d\n", body_cost
);
17538 fractional_cost scalar_cycles_per_iter
17539 = scalar_ops
.min_cycles_per_iter () * estimated_vf
;
17541 fractional_cost vector_cycles_per_iter
= vector_ops
.min_cycles_per_iter ();
17543 if (dump_enabled_p ())
17545 if (IN_RANGE (m_num_vector_iterations
, 0, 65536))
17546 dump_printf_loc (MSG_NOTE
, vect_location
,
17547 "Vector loop iterates at most %wd times\n",
17548 m_num_vector_iterations
);
17549 dump_printf_loc (MSG_NOTE
, vect_location
, "Scalar issue estimate:\n");
17550 scalar_ops
.dump ();
17551 dump_printf_loc (MSG_NOTE
, vect_location
,
17552 " estimated cycles per vector iteration"
17553 " (for VF %d) = %f\n",
17554 estimated_vf
, scalar_cycles_per_iter
.as_double ());
17557 if (vector_ops
.sve_issue_info ())
17559 if (dump_enabled_p ())
17560 dump_printf_loc (MSG_NOTE
, vect_location
, "SVE issue estimate:\n");
17561 vector_cycles_per_iter
17562 = adjust_body_cost_sve (&vector_ops
, scalar_cycles_per_iter
,
17563 orig_body_cost
, &body_cost
, &should_disparage
);
17565 if (aarch64_tune_params
.vec_costs
== &neoverse512tvb_vector_cost
)
17567 /* Also take Neoverse V1 tuning into account, doubling the
17568 scalar and Advanced SIMD estimates to account for the
17569 doubling in SVE vector length. */
17570 if (dump_enabled_p ())
17571 dump_printf_loc (MSG_NOTE
, vect_location
,
17572 "Neoverse V1 estimate:\n");
17573 auto vf_factor
= m_ops
[1].vf_factor ();
17574 adjust_body_cost_sve (&m_ops
[1], scalar_cycles_per_iter
* vf_factor
,
17575 orig_body_cost
, &body_cost
, &should_disparage
);
17580 if (dump_enabled_p ())
17582 dump_printf_loc (MSG_NOTE
, vect_location
,
17583 "Vector issue estimate:\n");
17584 vector_ops
.dump ();
17588 /* Decide whether to stick to latency-based costs or whether to try to
17589 take issue rates into account. */
17590 unsigned int threshold
= aarch64_loop_vect_issue_rate_niters
;
17591 if (m_vec_flags
& VEC_ANY_SVE
)
17592 threshold
= CEIL (threshold
, aarch64_estimated_sve_vq ());
17594 if (m_num_vector_iterations
>= 1
17595 && m_num_vector_iterations
< threshold
)
17597 if (dump_enabled_p ())
17598 dump_printf_loc (MSG_NOTE
, vect_location
,
17599 "Low iteration count, so using pure latency"
17602 /* Increase the cost of the vector code if it looks like the scalar code
17603 could issue more quickly. These values are only rough estimates,
17604 so minor differences should only result in minor changes. */
17605 else if (scalar_cycles_per_iter
< vector_cycles_per_iter
)
17607 body_cost
= fractional_cost::scale (body_cost
, vector_cycles_per_iter
,
17608 scalar_cycles_per_iter
);
17609 if (dump_enabled_p ())
17610 dump_printf_loc (MSG_NOTE
, vect_location
,
17611 "Increasing body cost to %d because scalar code"
17612 " would issue more quickly\n", body_cost
);
17614 /* In general, it's expected that the proposed vector code would be able
17615 to issue more quickly than the original scalar code. This should
17616 already be reflected to some extent in the latency-based costs.
17618 However, the latency-based costs effectively assume that the scalar
17619 code and the vector code execute serially, which tends to underplay
17620 one important case: if the real (non-serialized) execution time of
17621 a scalar iteration is dominated by loop-carried dependencies,
17622 and if the vector code is able to reduce both the length of
17623 the loop-carried dependencies *and* the number of cycles needed
17624 to issue the code in general, we can be more confident that the
17625 vector code is an improvement, even if adding the other (non-loop-carried)
17626 latencies tends to hide this saving. We therefore reduce the cost of the
17627 vector loop body in proportion to the saving. */
17628 else if (scalar_ops
.reduction_latency
> vector_ops
.reduction_latency
17629 && scalar_ops
.reduction_latency
== scalar_cycles_per_iter
17630 && scalar_cycles_per_iter
> vector_cycles_per_iter
17631 && !should_disparage
)
17633 body_cost
= fractional_cost::scale (body_cost
, vector_cycles_per_iter
,
17634 scalar_cycles_per_iter
);
17635 if (dump_enabled_p ())
17636 dump_printf_loc (MSG_NOTE
, vect_location
,
17637 "Decreasing body cost to %d account for smaller"
17638 " reduction latency\n", body_cost
);
17645 aarch64_vector_costs::finish_cost (const vector_costs
*uncast_scalar_costs
)
17647 /* Record the issue information for any SVE WHILE instructions that the
17649 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
17650 if (!m_ops
.is_empty ()
17652 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
17654 unsigned int num_masks
= 0;
17655 rgroup_controls
*rgm
;
17656 unsigned int num_vectors_m1
;
17657 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
,
17658 num_vectors_m1
, rgm
)
17660 num_masks
+= num_vectors_m1
+ 1;
17661 for (auto &ops
: m_ops
)
17662 if (auto *issue
= ops
.sve_issue_info ())
17663 ops
.pred_ops
+= num_masks
* issue
->while_pred_ops
;
17667 = static_cast<const aarch64_vector_costs
*> (uncast_scalar_costs
);
17670 && aarch64_use_new_vector_costs_p ())
17672 m_costs
[vect_body
] = adjust_body_cost (loop_vinfo
, scalar_costs
,
17673 m_costs
[vect_body
]);
17674 m_suggested_unroll_factor
= determine_suggested_unroll_factor ();
17677 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17678 the scalar code in the event of a tie, since there is more chance
17679 of scalar code being optimized with surrounding operations.
17681 In addition, if the vector body is a simple store to a decl that
17682 is elsewhere loaded using vld1, strongly prefer the vector form,
17683 to the extent of giving the prologue a zero cost. See the comment
17684 above m_stores_to_vector_load_decl for details. */
17687 && m_stp_sequence_cost
!= ~0U)
17689 if (m_stores_to_vector_load_decl
)
17690 m_costs
[vect_prologue
] = 0;
17691 else if (m_stp_sequence_cost
>= scalar_costs
->m_stp_sequence_cost
)
17692 m_costs
[vect_body
] = 2 * scalar_costs
->total_cost ();
17695 vector_costs::finish_cost (scalar_costs
);
17699 aarch64_vector_costs::
17700 better_main_loop_than_p (const vector_costs
*uncast_other
) const
17702 auto other
= static_cast<const aarch64_vector_costs
*> (uncast_other
);
17704 auto this_loop_vinfo
= as_a
<loop_vec_info
> (this->m_vinfo
);
17705 auto other_loop_vinfo
= as_a
<loop_vec_info
> (other
->m_vinfo
);
17707 if (dump_enabled_p ())
17708 dump_printf_loc (MSG_NOTE
, vect_location
,
17709 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17710 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
17711 vect_vf_for_cost (this_loop_vinfo
),
17712 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
17713 vect_vf_for_cost (other_loop_vinfo
));
17715 /* Apply the unrolling heuristic described above
17716 m_unrolled_advsimd_niters. */
17717 if (bool (m_unrolled_advsimd_stmts
)
17718 != bool (other
->m_unrolled_advsimd_stmts
))
17720 bool this_prefer_unrolled
= this->prefer_unrolled_loop ();
17721 bool other_prefer_unrolled
= other
->prefer_unrolled_loop ();
17722 if (this_prefer_unrolled
!= other_prefer_unrolled
)
17724 if (dump_enabled_p ())
17725 dump_printf_loc (MSG_NOTE
, vect_location
,
17726 "Preferring Advanced SIMD loop because"
17727 " it can be unrolled\n");
17728 return other_prefer_unrolled
;
17732 for (unsigned int i
= 0; i
< m_ops
.length (); ++i
)
17734 if (dump_enabled_p ())
17737 dump_printf_loc (MSG_NOTE
, vect_location
,
17738 "Reconsidering with subtuning %d\n", i
);
17739 dump_printf_loc (MSG_NOTE
, vect_location
,
17740 "Issue info for %s loop:\n",
17741 GET_MODE_NAME (this_loop_vinfo
->vector_mode
));
17742 this->m_ops
[i
].dump ();
17743 dump_printf_loc (MSG_NOTE
, vect_location
,
17744 "Issue info for %s loop:\n",
17745 GET_MODE_NAME (other_loop_vinfo
->vector_mode
));
17746 other
->m_ops
[i
].dump ();
17749 auto this_estimated_vf
= (vect_vf_for_cost (this_loop_vinfo
)
17750 * this->m_ops
[i
].vf_factor ());
17751 auto other_estimated_vf
= (vect_vf_for_cost (other_loop_vinfo
)
17752 * other
->m_ops
[i
].vf_factor ());
17754 /* If it appears that one loop could process the same amount of data
17755 in fewer cycles, prefer that loop over the other one. */
17756 fractional_cost this_cost
17757 = this->m_ops
[i
].min_cycles_per_iter () * other_estimated_vf
;
17758 fractional_cost other_cost
17759 = other
->m_ops
[i
].min_cycles_per_iter () * this_estimated_vf
;
17760 if (dump_enabled_p ())
17762 dump_printf_loc (MSG_NOTE
, vect_location
,
17763 "Weighted cycles per iteration of %s loop ~= %f\n",
17764 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
17765 this_cost
.as_double ());
17766 dump_printf_loc (MSG_NOTE
, vect_location
,
17767 "Weighted cycles per iteration of %s loop ~= %f\n",
17768 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
17769 other_cost
.as_double ());
17771 if (this_cost
!= other_cost
)
17773 if (dump_enabled_p ())
17774 dump_printf_loc (MSG_NOTE
, vect_location
,
17775 "Preferring loop with lower cycles"
17776 " per iteration\n");
17777 return this_cost
< other_cost
;
17780 /* If the issue rate of SVE code is limited by predicate operations
17781 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17782 and if Advanced SIMD code could issue within the limit imposed
17783 by the predicate operations, the predicate operations are adding an
17784 overhead that the original code didn't have and so we should prefer
17785 the Advanced SIMD version. */
17786 auto better_pred_limit_p
= [](const aarch64_vec_op_count
&a
,
17787 const aarch64_vec_op_count
&b
) -> bool
17789 if (a
.pred_ops
== 0
17790 && (b
.min_pred_cycles_per_iter ()
17791 > b
.min_nonpred_cycles_per_iter ()))
17793 if (dump_enabled_p ())
17794 dump_printf_loc (MSG_NOTE
, vect_location
,
17795 "Preferring Advanced SIMD loop since"
17796 " SVE loop is predicate-limited\n");
17801 if (better_pred_limit_p (this->m_ops
[i
], other
->m_ops
[i
]))
17803 if (better_pred_limit_p (other
->m_ops
[i
], this->m_ops
[i
]))
17807 return vector_costs::better_main_loop_than_p (other
);
17810 static void initialize_aarch64_code_model (struct gcc_options
*);
17812 /* Parse the TO_PARSE string and put the architecture struct that it
17813 selects into RES and the architectural features into ISA_FLAGS.
17814 Return an aarch_parse_opt_result describing the parse result.
17815 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17816 When the TO_PARSE string contains an invalid extension,
17817 a copy of the string is created and stored to INVALID_EXTENSION. */
17819 static enum aarch_parse_opt_result
17820 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
17821 aarch64_feature_flags
*isa_flags
,
17822 std::string
*invalid_extension
)
17825 const struct processor
*arch
;
17828 ext
= strchr (to_parse
, '+');
17831 len
= ext
- to_parse
;
17833 len
= strlen (to_parse
);
17836 return AARCH_PARSE_MISSING_ARG
;
17839 /* Loop through the list of supported ARCHes to find a match. */
17840 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
17842 if (strlen (arch
->name
) == len
17843 && strncmp (arch
->name
, to_parse
, len
) == 0)
17845 auto isa_temp
= arch
->flags
;
17849 /* TO_PARSE string contains at least one extension. */
17850 enum aarch_parse_opt_result ext_res
17851 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
17853 if (ext_res
!= AARCH_PARSE_OK
)
17856 /* Extension parsing was successful. Confirm the result
17857 arch and ISA flags. */
17859 *isa_flags
= isa_temp
;
17860 return AARCH_PARSE_OK
;
17864 /* ARCH name not found in list. */
17865 return AARCH_PARSE_INVALID_ARG
;
17868 /* Parse the TO_PARSE string and put the result tuning in RES and the
17869 architecture flags in ISA_FLAGS. Return an aarch_parse_opt_result
17870 describing the parse result. If there is an error parsing, RES and
17871 ISA_FLAGS are left unchanged.
17872 When the TO_PARSE string contains an invalid extension,
17873 a copy of the string is created and stored to INVALID_EXTENSION. */
17875 static enum aarch_parse_opt_result
17876 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
17877 aarch64_feature_flags
*isa_flags
,
17878 std::string
*invalid_extension
)
17881 const struct processor
*cpu
;
17884 ext
= strchr (to_parse
, '+');
17887 len
= ext
- to_parse
;
17889 len
= strlen (to_parse
);
17892 return AARCH_PARSE_MISSING_ARG
;
17895 /* Loop through the list of supported CPUs to find a match. */
17896 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
17898 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
17900 auto isa_temp
= cpu
->flags
;
17904 /* TO_PARSE string contains at least one extension. */
17905 enum aarch_parse_opt_result ext_res
17906 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
17908 if (ext_res
!= AARCH_PARSE_OK
)
17911 /* Extension parsing was successfull. Confirm the result
17912 cpu and ISA flags. */
17914 *isa_flags
= isa_temp
;
17915 return AARCH_PARSE_OK
;
17919 /* CPU name not found in list. */
17920 return AARCH_PARSE_INVALID_ARG
;
17923 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17924 Return an aarch_parse_opt_result describing the parse result.
17925 If the parsing fails the RES does not change. */
17927 static enum aarch_parse_opt_result
17928 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
17930 const struct processor
*cpu
;
17932 /* Loop through the list of supported CPUs to find a match. */
17933 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
17935 if (strcmp (cpu
->name
, to_parse
) == 0)
17938 return AARCH_PARSE_OK
;
17942 /* CPU name not found in list. */
17943 return AARCH_PARSE_INVALID_ARG
;
17946 /* Parse TOKEN, which has length LENGTH to see if it is an option
17947 described in FLAG. If it is, return the index bit for that fusion type.
17948 If not, error (printing OPTION_NAME) and return zero. */
17950 static unsigned int
17951 aarch64_parse_one_option_token (const char *token
,
17953 const struct aarch64_flag_desc
*flag
,
17954 const char *option_name
)
17956 for (; flag
->name
!= NULL
; flag
++)
17958 if (length
== strlen (flag
->name
)
17959 && !strncmp (flag
->name
, token
, length
))
17963 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
17967 /* Parse OPTION which is a comma-separated list of flags to enable.
17968 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17969 default state we inherit from the CPU tuning structures. OPTION_NAME
17970 gives the top-level option we are parsing in the -moverride string,
17971 for use in error messages. */
17973 static unsigned int
17974 aarch64_parse_boolean_options (const char *option
,
17975 const struct aarch64_flag_desc
*flags
,
17976 unsigned int initial_state
,
17977 const char *option_name
)
17979 const char separator
= '.';
17980 const char* specs
= option
;
17981 const char* ntoken
= option
;
17982 unsigned int found_flags
= initial_state
;
17984 while ((ntoken
= strchr (specs
, separator
)))
17986 size_t token_length
= ntoken
- specs
;
17987 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
17991 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17992 in the token stream, reset the supported operations. So:
17994 adrp+add.cmp+branch.none.adrp+add
17996 would have the result of turning on only adrp+add fusion. */
18000 found_flags
|= token_ops
;
18004 /* We ended with a comma, print something. */
18007 error ("%qs string ill-formed", option_name
);
18011 /* We still have one more token to parse. */
18012 size_t token_length
= strlen (specs
);
18013 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
18020 found_flags
|= token_ops
;
18021 return found_flags
;
18024 /* Support for overriding instruction fusion. */
18027 aarch64_parse_fuse_string (const char *fuse_string
,
18028 struct tune_params
*tune
)
18030 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
18031 aarch64_fusible_pairs
,
18036 /* Support for overriding other tuning flags. */
18039 aarch64_parse_tune_string (const char *tune_string
,
18040 struct tune_params
*tune
)
18042 tune
->extra_tuning_flags
18043 = aarch64_parse_boolean_options (tune_string
,
18044 aarch64_tuning_flags
,
18045 tune
->extra_tuning_flags
,
18049 /* Parse the sve_width tuning moverride string in TUNE_STRING.
18050 Accept the valid SVE vector widths allowed by
18051 aarch64_sve_vector_bits_enum and use it to override sve_width
18055 aarch64_parse_sve_width_string (const char *tune_string
,
18056 struct tune_params
*tune
)
18060 int n
= sscanf (tune_string
, "%d", &width
);
18063 error ("invalid format for %<sve_width%>");
18075 error ("invalid %<sve_width%> value: %d", width
);
18077 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
18080 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
18081 we understand. If it is, extract the option string and handoff to
18082 the appropriate function. */
18085 aarch64_parse_one_override_token (const char* token
,
18087 struct tune_params
*tune
)
18089 const struct aarch64_tuning_override_function
*fn
18090 = aarch64_tuning_override_functions
;
18092 const char *option_part
= strchr (token
, '=');
18095 error ("tuning string missing in option (%s)", token
);
18099 /* Get the length of the option name. */
18100 length
= option_part
- token
;
18101 /* Skip the '=' to get to the option string. */
18104 for (; fn
->name
!= NULL
; fn
++)
18106 if (!strncmp (fn
->name
, token
, length
))
18108 fn
->parse_override (option_part
, tune
);
18113 error ("unknown tuning option (%s)",token
);
18117 /* A checking mechanism for the implementation of the tls size. */
18120 initialize_aarch64_tls_size (struct gcc_options
*opts
)
18122 if (aarch64_tls_size
== 0)
18123 aarch64_tls_size
= 24;
18125 switch (opts
->x_aarch64_cmodel_var
)
18127 case AARCH64_CMODEL_TINY
:
18128 /* Both the default and maximum TLS size allowed under tiny is 1M which
18129 needs two instructions to address, so we clamp the size to 24. */
18130 if (aarch64_tls_size
> 24)
18131 aarch64_tls_size
= 24;
18133 case AARCH64_CMODEL_SMALL
:
18134 /* The maximum TLS size allowed under small is 4G. */
18135 if (aarch64_tls_size
> 32)
18136 aarch64_tls_size
= 32;
18138 case AARCH64_CMODEL_LARGE
:
18139 /* The maximum TLS size allowed under large is 16E.
18140 FIXME: 16E should be 64bit, we only support 48bit offset now. */
18141 if (aarch64_tls_size
> 48)
18142 aarch64_tls_size
= 48;
18145 gcc_unreachable ();
18151 /* Return the CPU corresponding to the enum CPU. */
18153 static const struct processor
*
18154 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
18156 gcc_assert (cpu
!= aarch64_none
);
18158 return &all_cores
[cpu
];
18161 /* Return the architecture corresponding to the enum ARCH. */
18163 static const struct processor
*
18164 aarch64_get_arch (enum aarch64_arch arch
)
18166 gcc_assert (arch
!= aarch64_no_arch
);
18168 return &all_architectures
[arch
];
18171 /* Parse STRING looking for options in the format:
18172 string :: option:string
18173 option :: name=substring
18175 substring :: defined by option. */
18178 aarch64_parse_override_string (const char* input_string
,
18179 struct tune_params
* tune
)
18181 const char separator
= ':';
18182 size_t string_length
= strlen (input_string
) + 1;
18183 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
18184 char *string
= string_root
;
18185 strncpy (string
, input_string
, string_length
);
18186 string
[string_length
- 1] = '\0';
18188 char* ntoken
= string
;
18190 while ((ntoken
= strchr (string
, separator
)))
18192 size_t token_length
= ntoken
- string
;
18193 /* Make this substring look like a string. */
18195 aarch64_parse_one_override_token (string
, token_length
, tune
);
18199 /* One last option to parse. */
18200 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
18201 free (string_root
);
18204 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18205 are best for a generic target with the currently-enabled architecture
18208 aarch64_adjust_generic_arch_tuning (struct tune_params
¤t_tune
)
18210 /* Neoverse V1 is the only core that is known to benefit from
18211 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18212 point enabling it for SVE2 and above. */
18214 current_tune
.extra_tuning_flags
18215 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
;
18219 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
18221 /* PR 70044: We have to be careful about being called multiple times for the
18222 same function. This means all changes should be repeatable. */
18224 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18225 Disable the frame pointer flag so the mid-end will not use a frame
18226 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18227 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18228 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18229 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
18230 if (opts
->x_flag_omit_frame_pointer
== 0)
18231 opts
->x_flag_omit_frame_pointer
= 2;
18233 /* If not optimizing for size, set the default
18234 alignment to what the target wants. */
18235 if (!opts
->x_optimize_size
)
18237 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
18238 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
18239 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
18240 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
18241 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
18242 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
18245 /* We default to no pc-relative literal loads. */
18247 aarch64_pcrelative_literal_loads
= false;
18249 /* If -mpc-relative-literal-loads is set on the command line, this
18250 implies that the user asked for PC relative literal loads. */
18251 if (opts
->x_pcrelative_literal_loads
== 1)
18252 aarch64_pcrelative_literal_loads
= true;
18254 /* In the tiny memory model it makes no sense to disallow PC relative
18255 literal pool loads. */
18256 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
18257 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
18258 aarch64_pcrelative_literal_loads
= true;
18260 /* When enabling the lower precision Newton series for the square root, also
18261 enable it for the reciprocal square root, since the latter is an
18262 intermediary step for the former. */
18263 if (flag_mlow_precision_sqrt
)
18264 flag_mrecip_low_precision_sqrt
= true;
18267 /* 'Unpack' up the internal tuning structs and update the options
18268 in OPTS. The caller must have set up selected_tune and selected_arch
18269 as all the other target-specific codegen decisions are
18270 derived from them. */
18273 aarch64_override_options_internal (struct gcc_options
*opts
)
18275 const struct processor
*tune
= aarch64_get_tune_cpu (opts
->x_selected_tune
);
18276 aarch64_tune_flags
= tune
->flags
;
18277 aarch64_tune
= tune
->sched_core
;
18278 /* Make a copy of the tuning parameters attached to the core, which
18279 we may later overwrite. */
18280 aarch64_tune_params
= *(tune
->tune
);
18281 if (tune
->tune
== &generic_tunings
)
18282 aarch64_adjust_generic_arch_tuning (aarch64_tune_params
);
18284 if (opts
->x_aarch64_override_tune_string
)
18285 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
18286 &aarch64_tune_params
);
18288 if (opts
->x_aarch64_ldp_policy_param
)
18289 aarch64_tune_params
.ldp_policy_model
= opts
->x_aarch64_ldp_policy_param
;
18291 if (opts
->x_aarch64_stp_policy_param
)
18292 aarch64_tune_params
.stp_policy_model
= opts
->x_aarch64_stp_policy_param
;
18294 /* This target defaults to strict volatile bitfields. */
18295 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
18296 opts
->x_flag_strict_volatile_bitfields
= 1;
18298 if (aarch64_stack_protector_guard
== SSP_GLOBAL
18299 && opts
->x_aarch64_stack_protector_guard_offset_str
)
18301 error ("incompatible options %<-mstack-protector-guard=global%> and "
18302 "%<-mstack-protector-guard-offset=%s%>",
18303 aarch64_stack_protector_guard_offset_str
);
18306 if (aarch64_stack_protector_guard
== SSP_SYSREG
18307 && !(opts
->x_aarch64_stack_protector_guard_offset_str
18308 && opts
->x_aarch64_stack_protector_guard_reg_str
))
18310 error ("both %<-mstack-protector-guard-offset%> and "
18311 "%<-mstack-protector-guard-reg%> must be used "
18312 "with %<-mstack-protector-guard=sysreg%>");
18315 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
18317 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
18318 error ("specify a system register with a small string length");
18321 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
18324 const char *str
= aarch64_stack_protector_guard_offset_str
;
18326 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
18327 if (!*str
|| *end
|| errno
)
18328 error ("%qs is not a valid offset in %qs", str
,
18329 "-mstack-protector-guard-offset=");
18330 aarch64_stack_protector_guard_offset
= offs
;
18333 if ((flag_sanitize
& SANITIZE_SHADOW_CALL_STACK
)
18334 && !fixed_regs
[R18_REGNUM
])
18335 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18337 if ((opts
->x_aarch64_isa_flags
& (AARCH64_FL_SM_ON
| AARCH64_FL_ZA_ON
))
18338 && !(opts
->x_aarch64_isa_flags
& AARCH64_FL_SME
))
18340 if (opts
->x_aarch64_isa_flags
& AARCH64_FL_SM_ON
)
18341 error ("streaming functions require the ISA extension %qs", "sme");
18343 error ("functions with SME state require the ISA extension %qs",
18345 inform (input_location
, "you can enable %qs using the command-line"
18346 " option %<-march%>, or by using the %<target%>"
18347 " attribute or pragma", "sme");
18348 opts
->x_target_flags
&= ~MASK_GENERAL_REGS_ONLY
;
18349 auto new_flags
= (opts
->x_aarch64_asm_isa_flags
18350 | feature_deps::SME ().enable
);
18351 aarch64_set_asm_isa_flags (opts
, new_flags
);
18354 initialize_aarch64_code_model (opts
);
18355 initialize_aarch64_tls_size (opts
);
18356 aarch64_tpidr_register
= opts
->x_aarch64_tpidr_reg
;
18358 int queue_depth
= 0;
18359 switch (aarch64_tune_params
.autoprefetcher_model
)
18361 case tune_params::AUTOPREFETCHER_OFF
:
18364 case tune_params::AUTOPREFETCHER_WEAK
:
18367 case tune_params::AUTOPREFETCHER_STRONG
:
18368 queue_depth
= max_insn_queue_index
+ 1;
18371 gcc_unreachable ();
18374 /* We don't mind passing in global_options_set here as we don't use
18375 the *options_set structs anyway. */
18376 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18377 param_sched_autopref_queue_depth
, queue_depth
);
18379 /* Set up parameters to be used in prefetching algorithm. Do not
18380 override the defaults unless we are tuning for a core we have
18381 researched values for. */
18382 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
18383 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18384 param_simultaneous_prefetches
,
18385 aarch64_tune_params
.prefetch
->num_slots
);
18386 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
18387 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18388 param_l1_cache_size
,
18389 aarch64_tune_params
.prefetch
->l1_cache_size
);
18390 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
18391 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18392 param_l1_cache_line_size
,
18393 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18395 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
18397 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18398 param_destruct_interfere_size
,
18399 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18400 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18401 param_construct_interfere_size
,
18402 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18406 /* For a generic AArch64 target, cover the current range of cache line
18408 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18409 param_destruct_interfere_size
,
18411 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18412 param_construct_interfere_size
,
18416 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
18417 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18418 param_l2_cache_size
,
18419 aarch64_tune_params
.prefetch
->l2_cache_size
);
18420 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
18421 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18422 param_prefetch_dynamic_strides
, 0);
18423 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
18424 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18425 param_prefetch_minimum_stride
,
18426 aarch64_tune_params
.prefetch
->minimum_stride
);
18428 /* Use the alternative scheduling-pressure algorithm by default. */
18429 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18430 param_sched_pressure_algorithm
,
18431 SCHED_PRESSURE_MODEL
);
18433 /* Validate the guard size. */
18434 int guard_size
= param_stack_clash_protection_guard_size
;
18436 if (guard_size
!= 12 && guard_size
!= 16)
18437 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18438 "size. Given value %d (%llu KB) is out of range",
18439 guard_size
, (1ULL << guard_size
) / 1024ULL);
18441 /* Enforce that interval is the same size as size so the mid-end does the
18443 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18444 param_stack_clash_protection_probe_interval
,
18447 /* The maybe_set calls won't update the value if the user has explicitly set
18448 one. Which means we need to validate that probing interval and guard size
18451 = param_stack_clash_protection_probe_interval
;
18452 if (guard_size
!= probe_interval
)
18453 error ("stack clash guard size %<%d%> must be equal to probing interval "
18454 "%<%d%>", guard_size
, probe_interval
);
18456 /* Enable sw prefetching at specified optimization level for
18457 CPUS that have prefetch. Lower optimization level threshold by 1
18458 when profiling is enabled. */
18459 if (opts
->x_flag_prefetch_loop_arrays
< 0
18460 && !opts
->x_optimize_size
18461 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
18462 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
18463 opts
->x_flag_prefetch_loop_arrays
= 1;
18465 /* Avoid loop-dependant FMA chains. */
18466 if (aarch64_tune_params
.extra_tuning_flags
18467 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA
)
18468 SET_OPTION_IF_UNSET (opts
, &global_options_set
, param_avoid_fma_max_bits
,
18471 /* Consider fully pipelined FMA in reassociation. */
18472 if (aarch64_tune_params
.extra_tuning_flags
18473 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA
)
18474 SET_OPTION_IF_UNSET (opts
, &global_options_set
, param_fully_pipelined_fma
,
18477 aarch64_override_options_after_change_1 (opts
);
18480 /* Print a hint with a suggestion for a core or architecture name that
18481 most closely resembles what the user passed in STR. ARCH is true if
18482 the user is asking for an architecture name. ARCH is false if the user
18483 is asking for a core name. */
18486 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
18488 auto_vec
<const char *> candidates
;
18489 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
18490 for (; entry
->name
!= NULL
; entry
++)
18491 candidates
.safe_push (entry
->name
);
18493 #ifdef HAVE_LOCAL_CPU_DETECT
18494 /* Add also "native" as possible value. */
18496 candidates
.safe_push ("native");
18500 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
18502 inform (input_location
, "valid arguments are: %s;"
18503 " did you mean %qs?", s
, hint
);
18505 inform (input_location
, "valid arguments are: %s", s
);
18510 /* Print a hint with a suggestion for a core name that most closely resembles
18511 what the user passed in STR. */
18514 aarch64_print_hint_for_core (const char *str
)
18516 aarch64_print_hint_for_core_or_arch (str
, false);
18519 /* Print a hint with a suggestion for an architecture name that most closely
18520 resembles what the user passed in STR. */
18523 aarch64_print_hint_for_arch (const char *str
)
18525 aarch64_print_hint_for_core_or_arch (str
, true);
18529 /* Print a hint with a suggestion for an extension name
18530 that most closely resembles what the user passed in STR. */
18533 aarch64_print_hint_for_extensions (const std::string
&str
)
18535 auto_vec
<const char *> candidates
;
18536 aarch64_get_all_extension_candidates (&candidates
);
18538 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
18540 inform (input_location
, "valid arguments are: %s;"
18541 " did you mean %qs?", s
, hint
);
18543 inform (input_location
, "valid arguments are: %s", s
);
18548 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18549 specified in STR and throw errors if appropriate. Put the results if
18550 they are valid in RES and ISA_FLAGS. Return whether the option is
18554 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
18555 aarch64_feature_flags
*isa_flags
)
18557 std::string invalid_extension
;
18558 enum aarch_parse_opt_result parse_res
18559 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
18561 if (parse_res
== AARCH_PARSE_OK
)
18566 case AARCH_PARSE_MISSING_ARG
:
18567 error ("missing cpu name in %<-mcpu=%s%>", str
);
18569 case AARCH_PARSE_INVALID_ARG
:
18570 error ("unknown value %qs for %<-mcpu%>", str
);
18571 aarch64_print_hint_for_core (str
);
18572 /* A common user error is confusing -march and -mcpu.
18573 If the -mcpu string matches a known architecture then suggest
18575 parse_res
= aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
18576 if (parse_res
== AARCH_PARSE_OK
)
18577 inform (input_location
, "did you mean %<-march=%s%>?", str
);
18579 case AARCH_PARSE_INVALID_FEATURE
:
18580 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18581 invalid_extension
.c_str (), str
);
18582 aarch64_print_hint_for_extensions (invalid_extension
);
18585 gcc_unreachable ();
/* Straight line speculation indicators.  Bit flags so that "retbr" and
   "blr" can be requested independently or together ("all").  */
enum aarch64_sls_hardening_type
{
  SLS_NONE = 0,
  SLS_RETBR = 1,
  SLS_BLR = 2,
  SLS_ALL = 3,
};
/* Current SLS hardening selection, set from -mharden-sls=.  */
static enum aarch64_sls_hardening_type aarch64_sls_hardening;

/* Return whether we should mitigate Straight Line Speculation for the RET
   and BR instructions.  */
bool
aarch64_harden_sls_retbr_p (void)
{
  return aarch64_sls_hardening & SLS_RETBR;
}

/* Return whether we should mitigate Straight Line Speculation for the BLR
   instruction.  */
bool
aarch64_harden_sls_blr_p (void)
{
  return aarch64_sls_hardening & SLS_BLR;
}
18617 /* As of yet we only allow setting these options globally, in the future we may
18618 allow setting them per function. */
18620 aarch64_validate_sls_mitigation (const char *const_str
)
18622 char *token_save
= NULL
;
18625 if (strcmp (const_str
, "none") == 0)
18627 aarch64_sls_hardening
= SLS_NONE
;
18630 if (strcmp (const_str
, "all") == 0)
18632 aarch64_sls_hardening
= SLS_ALL
;
18636 char *str_root
= xstrdup (const_str
);
18637 str
= strtok_r (str_root
, ",", &token_save
);
18639 error ("invalid argument given to %<-mharden-sls=%>");
18641 int temp
= SLS_NONE
;
18644 if (strcmp (str
, "blr") == 0)
18646 else if (strcmp (str
, "retbr") == 0)
18648 else if (strcmp (str
, "none") == 0 || strcmp (str
, "all") == 0)
18650 error ("%qs must be by itself for %<-mharden-sls=%>", str
);
18655 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str
);
18658 str
= strtok_r (NULL
, ",", &token_save
);
18660 aarch64_sls_hardening
= (aarch64_sls_hardening_type
) temp
;
18664 /* Validate a command-line -march option. Parse the arch and extensions
18665 (if any) specified in STR and throw errors if appropriate. Put the
18666 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18667 option is valid. */
18670 aarch64_validate_march (const char *str
, const struct processor
**res
,
18671 aarch64_feature_flags
*isa_flags
)
18673 std::string invalid_extension
;
18674 enum aarch_parse_opt_result parse_res
18675 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
18677 if (parse_res
== AARCH_PARSE_OK
)
18682 case AARCH_PARSE_MISSING_ARG
:
18683 error ("missing arch name in %<-march=%s%>", str
);
18685 case AARCH_PARSE_INVALID_ARG
:
18686 error ("unknown value %qs for %<-march%>", str
);
18687 aarch64_print_hint_for_arch (str
);
18688 /* A common user error is confusing -march and -mcpu.
18689 If the -march string matches a known CPU suggest -mcpu. */
18690 parse_res
= aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
18691 if (parse_res
== AARCH_PARSE_OK
)
18692 inform (input_location
, "did you mean %<-mcpu=%s%>?", str
);
18694 case AARCH_PARSE_INVALID_FEATURE
:
18695 error ("invalid feature modifier %qs in %<-march=%s%>",
18696 invalid_extension
.c_str (), str
);
18697 aarch64_print_hint_for_extensions (invalid_extension
);
18700 gcc_unreachable ();
18706 /* Validate a command-line -mtune option. Parse the cpu
18707 specified in STR and throw errors if appropriate. Put the
18708 result, if it is valid, in RES. Return whether the option is
18712 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
18714 enum aarch_parse_opt_result parse_res
18715 = aarch64_parse_tune (str
, res
);
18717 if (parse_res
== AARCH_PARSE_OK
)
18722 case AARCH_PARSE_MISSING_ARG
:
18723 error ("missing cpu name in %<-mtune=%s%>", str
);
18725 case AARCH_PARSE_INVALID_ARG
:
18726 error ("unknown value %qs for %<-mtune%>", str
);
18727 aarch64_print_hint_for_core (str
);
18730 gcc_unreachable ();
18735 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18738 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
18740 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18741 on big-endian targets, so we would need to forbid subregs that convert
18742 from one to the other. By default a reinterpret sequence would then
18743 involve a store to memory in one mode and a load back in the other.
18744 Even if we optimize that sequence using reverse instructions,
18745 it would still be a significant potential overhead.
18747 For now, it seems better to generate length-agnostic code for that
18749 if (value
== SVE_SCALABLE
18750 || (value
== SVE_128
&& BYTES_BIG_ENDIAN
))
18751 return poly_uint16 (2, 2);
18753 return (int) value
/ 64;
18756 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18757 aarch64_isa_flags accordingly. */
18760 aarch64_set_asm_isa_flags (aarch64_feature_flags flags
)
18762 aarch64_set_asm_isa_flags (&global_options
, flags
);
18766 aarch64_handle_no_branch_protection (void)
18768 aarch_ra_sign_scope
= AARCH_FUNCTION_NONE
;
18769 aarch_enable_bti
= 0;
18773 aarch64_handle_standard_branch_protection (void)
18775 aarch_ra_sign_scope
= AARCH_FUNCTION_NON_LEAF
;
18776 aarch64_ra_sign_key
= AARCH64_KEY_A
;
18777 aarch_enable_bti
= 1;
18781 aarch64_handle_pac_ret_protection (void)
18783 aarch_ra_sign_scope
= AARCH_FUNCTION_NON_LEAF
;
18784 aarch64_ra_sign_key
= AARCH64_KEY_A
;
18788 aarch64_handle_pac_ret_leaf (void)
18790 aarch_ra_sign_scope
= AARCH_FUNCTION_ALL
;
18794 aarch64_handle_pac_ret_b_key (void)
18796 aarch64_ra_sign_key
= AARCH64_KEY_B
;
18800 aarch64_handle_bti_protection (void)
18802 aarch_enable_bti
= 1;
18805 static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes
[] = {
18806 { "leaf", false, aarch64_handle_pac_ret_leaf
, NULL
, 0 },
18807 { "b-key", false, aarch64_handle_pac_ret_b_key
, NULL
, 0 },
18808 { NULL
, false, NULL
, NULL
, 0 }
18811 static const struct aarch_branch_protect_type aarch64_branch_protect_types
[] =
18813 { "none", true, aarch64_handle_no_branch_protection
, NULL
, 0 },
18814 { "standard", true, aarch64_handle_standard_branch_protection
, NULL
, 0 },
18815 { "pac-ret", false, aarch64_handle_pac_ret_protection
,
18816 aarch64_pac_ret_subtypes
, ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
18817 { "bti", false, aarch64_handle_bti_protection
, NULL
, 0 },
18818 { NULL
, false, NULL
, NULL
, 0 }
18821 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18822 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18823 tuning structs. In particular it must set selected_tune and
18824 aarch64_asm_isa_flags that define the available ISA features and tuning
18825 decisions. It must also set selected_arch as this will be used to
18826 output the .arch asm tags for each function. */
18829 aarch64_override_options (void)
18831 aarch64_feature_flags cpu_isa
= 0;
18832 aarch64_feature_flags arch_isa
= 0;
18833 aarch64_set_asm_isa_flags (0);
18835 const struct processor
*cpu
= NULL
;
18836 const struct processor
*arch
= NULL
;
18837 const struct processor
*tune
= NULL
;
18839 if (aarch64_harden_sls_string
)
18840 aarch64_validate_sls_mitigation (aarch64_harden_sls_string
);
18842 if (aarch64_branch_protection_string
)
18843 aarch_validate_mbranch_protection (aarch64_branch_protect_types
,
18844 aarch64_branch_protection_string
,
18845 "-mbranch-protection=");
18847 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18848 If either of -march or -mtune is given, they override their
18849 respective component of -mcpu. */
18850 if (aarch64_cpu_string
)
18851 aarch64_validate_mcpu (aarch64_cpu_string
, &cpu
, &cpu_isa
);
18853 if (aarch64_arch_string
)
18854 aarch64_validate_march (aarch64_arch_string
, &arch
, &arch_isa
);
18856 if (aarch64_tune_string
)
18857 aarch64_validate_mtune (aarch64_tune_string
, &tune
);
18859 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18860 SUBTARGET_OVERRIDE_OPTIONS
;
18863 auto isa_mode
= AARCH64_FL_DEFAULT_ISA_MODE
;
18866 /* If both -mcpu and -march are specified, warn if they are not
18867 feature compatible. feature compatible means that the inclusion of the
18868 cpu features would end up disabling an achitecture feature. In
18869 otherwords the cpu features need to be a strict superset of the arch
18870 features and if so prefer the -march ISA flags. */
18871 auto full_arch_flags
= arch
->flags
| arch_isa
;
18872 auto full_cpu_flags
= cpu
->flags
| cpu_isa
;
18873 if (~full_cpu_flags
& full_arch_flags
)
18875 std::string ext_diff
18876 = aarch64_get_extension_string_for_isa_flags (full_arch_flags
,
18878 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18879 "and resulted in options %<%s%> being added",
18880 aarch64_cpu_string
,
18881 aarch64_arch_string
,
18882 ext_diff
.c_str ());
18885 selected_arch
= arch
->arch
;
18886 aarch64_set_asm_isa_flags (arch_isa
| isa_mode
);
18890 selected_arch
= cpu
->arch
;
18891 aarch64_set_asm_isa_flags (cpu_isa
| isa_mode
);
18895 cpu
= &all_cores
[arch
->ident
];
18896 selected_arch
= arch
->arch
;
18897 aarch64_set_asm_isa_flags (arch_isa
| isa_mode
);
18901 /* No -mcpu or -march specified, so use the default CPU. */
18902 cpu
= &all_cores
[TARGET_CPU_DEFAULT
];
18903 selected_arch
= cpu
->arch
;
18904 aarch64_set_asm_isa_flags (cpu
->flags
| isa_mode
);
18907 selected_tune
= tune
? tune
->ident
: cpu
->ident
;
18909 if (aarch_enable_bti
== 2)
18911 #ifdef TARGET_ENABLE_BTI
18912 aarch_enable_bti
= 1;
18914 aarch_enable_bti
= 0;
18918 /* Return address signing is currently not supported for ILP32 targets. For
18919 LP64 targets use the configured option in the absence of a command-line
18920 option for -mbranch-protection. */
18921 if (!TARGET_ILP32
&& aarch64_branch_protection_string
== NULL
)
18923 #ifdef TARGET_ENABLE_PAC_RET
18924 aarch_ra_sign_scope
= AARCH_FUNCTION_NON_LEAF
;
18926 aarch_ra_sign_scope
= AARCH_FUNCTION_NONE
;
18930 #ifndef HAVE_AS_MABI_OPTION
18931 /* The compiler may have been configured with 2.23.* binutils, which does
18932 not have support for ILP32. */
18934 error ("assembler does not support %<-mabi=ilp32%>");
18937 /* Convert -msve-vector-bits to a VG count. */
18938 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
18940 if (aarch_ra_sign_scope
!= AARCH_FUNCTION_NONE
&& TARGET_ILP32
)
18941 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18943 /* The pass to insert speculation tracking runs before
18944 shrink-wrapping and the latter does not know how to update the
18945 tracking status. So disable it in this case. */
18946 if (aarch64_track_speculation
)
18947 flag_shrink_wrap
= 0;
18949 aarch64_override_options_internal (&global_options
);
18951 /* Save these options as the default ones in case we push and pop them later
18952 while processing functions with potential target attributes. */
18953 target_option_default_node
= target_option_current_node
18954 = build_target_option_node (&global_options
, &global_options_set
);
18957 /* Implement targetm.override_options_after_change. */
18960 aarch64_override_options_after_change (void)
18962 aarch64_override_options_after_change_1 (&global_options
);
18965 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18967 aarch64_offload_options (void)
18970 return xstrdup ("-foffload-abi=ilp32");
18972 return xstrdup ("-foffload-abi=lp64");
18975 static struct machine_function
*
18976 aarch64_init_machine_status (void)
18978 struct machine_function
*machine
;
18979 machine
= ggc_cleared_alloc
<machine_function
> ();
18984 aarch64_init_expanders (void)
18986 init_machine_status
= aarch64_init_machine_status
;
18989 /* A checking mechanism for the implementation of the various code models. */
18991 initialize_aarch64_code_model (struct gcc_options
*opts
)
18993 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
18994 switch (opts
->x_aarch64_cmodel_var
)
18996 case AARCH64_CMODEL_TINY
:
18997 if (opts
->x_flag_pic
)
18998 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
19000 case AARCH64_CMODEL_SMALL
:
19001 if (opts
->x_flag_pic
)
19003 #ifdef HAVE_AS_SMALL_PIC_RELOCS
19004 aarch64_cmodel
= (flag_pic
== 2
19005 ? AARCH64_CMODEL_SMALL_PIC
19006 : AARCH64_CMODEL_SMALL_SPIC
);
19008 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
19012 case AARCH64_CMODEL_LARGE
:
19013 if (opts
->x_flag_pic
)
19014 sorry ("code model %qs with %<-f%s%>", "large",
19015 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
19016 if (opts
->x_aarch64_abi
== AARCH64_ABI_ILP32
)
19017 sorry ("code model %qs not supported in ilp32 mode", "large");
19019 case AARCH64_CMODEL_TINY_PIC
:
19020 case AARCH64_CMODEL_SMALL_PIC
:
19021 case AARCH64_CMODEL_SMALL_SPIC
:
19022 gcc_unreachable ();
19026 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
19027 using the information saved in PTR. */
19030 aarch64_option_restore (struct gcc_options
*opts
,
19031 struct gcc_options
* /* opts_set */,
19032 struct cl_target_option
* /* ptr */)
19034 aarch64_override_options_internal (opts
);
19037 /* Implement TARGET_OPTION_PRINT. */
19040 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
19042 const struct processor
*cpu
19043 = aarch64_get_tune_cpu (ptr
->x_selected_tune
);
19044 const struct processor
*arch
= aarch64_get_arch (ptr
->x_selected_arch
);
19045 std::string extension
19046 = aarch64_get_extension_string_for_isa_flags (ptr
->x_aarch64_asm_isa_flags
,
19049 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
19050 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
19051 arch
->name
, extension
.c_str ());
19054 static GTY(()) tree aarch64_previous_fndecl
;
19057 aarch64_reset_previous_fndecl (void)
19059 aarch64_previous_fndecl
= NULL
;
19062 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19063 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19064 make sure optab availability predicates are recomputed when necessary. */
19067 aarch64_save_restore_target_globals (tree new_tree
)
19069 if (TREE_TARGET_GLOBALS (new_tree
))
19070 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
19071 else if (new_tree
== target_option_default_node
)
19072 restore_target_globals (&default_target_globals
);
19074 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
19077 /* Return the target_option_node for FNDECL, or the current options
19078 if FNDECL is null. */
19081 aarch64_fndecl_options (tree fndecl
)
19084 return target_option_current_node
;
19086 if (tree options
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
))
19089 return target_option_default_node
;
19092 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
19093 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
19094 of the function, if such exists. This function may be called multiple
19095 times on a single function so use aarch64_previous_fndecl to avoid
19096 setting up identical state. */
19099 aarch64_set_current_function (tree fndecl
)
19101 tree old_tree
= aarch64_fndecl_options (aarch64_previous_fndecl
);
19102 tree new_tree
= aarch64_fndecl_options (fndecl
);
19104 auto new_isa_mode
= (fndecl
19105 ? aarch64_fndecl_isa_mode (fndecl
)
19106 : AARCH64_FL_DEFAULT_ISA_MODE
);
19107 auto isa_flags
= TREE_TARGET_OPTION (new_tree
)->x_aarch64_isa_flags
;
19109 static bool reported_zt0_p
;
19110 if (!reported_zt0_p
19111 && !(isa_flags
& AARCH64_FL_SME2
)
19113 && aarch64_fndecl_has_state (fndecl
, "zt0"))
19115 error ("functions with %qs state require the ISA extension %qs",
19117 inform (input_location
, "you can enable %qs using the command-line"
19118 " option %<-march%>, or by using the %<target%>"
19119 " attribute or pragma", "sme2");
19120 reported_zt0_p
= true;
19123 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
19124 the default have been handled by aarch64_save_restore_target_globals from
19125 aarch64_pragma_target_parse. */
19126 if (old_tree
== new_tree
19127 && (!fndecl
|| aarch64_previous_fndecl
)
19128 && (isa_flags
& AARCH64_FL_ISA_MODES
) == new_isa_mode
)
19130 gcc_assert (AARCH64_ISA_MODE
== new_isa_mode
);
19134 aarch64_previous_fndecl
= fndecl
;
19136 /* First set the target options. */
19137 cl_target_option_restore (&global_options
, &global_options_set
,
19138 TREE_TARGET_OPTION (new_tree
));
19140 /* The ISA mode can vary based on function type attributes and
19141 function declaration attributes. Make sure that the target
19142 options correctly reflect these attributes. */
19143 if ((isa_flags
& AARCH64_FL_ISA_MODES
) != new_isa_mode
)
19145 auto base_flags
= (aarch64_asm_isa_flags
& ~AARCH64_FL_ISA_MODES
);
19146 aarch64_set_asm_isa_flags (base_flags
| new_isa_mode
);
19148 aarch64_override_options_internal (&global_options
);
19149 new_tree
= build_target_option_node (&global_options
,
19150 &global_options_set
);
19151 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_tree
;
19153 tree new_optimize
= build_optimization_node (&global_options
,
19154 &global_options_set
);
19155 if (new_optimize
!= optimization_default_node
)
19156 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
19159 aarch64_save_restore_target_globals (new_tree
);
19161 gcc_assert (AARCH64_ISA_MODE
== new_isa_mode
);
19164 /* Enum describing the various ways we can handle attributes.
19165 In many cases we can reuse the generic option handling machinery. */
19167 enum aarch64_attr_opt_type
19169 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
19170 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
19171 aarch64_attr_enum
, /* Attribute sets an enum variable. */
19172 aarch64_attr_custom
/* Attribute requires a custom handling function. */
19175 /* All the information needed to handle a target attribute.
19176 NAME is the name of the attribute.
19177 ATTR_TYPE specifies the type of behavior of the attribute as described
19178 in the definition of enum aarch64_attr_opt_type.
19179 ALLOW_NEG is true if the attribute supports a "no-" form.
19180 HANDLER is the function that takes the attribute string as an argument
19181 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
19182 OPT_NUM is the enum specifying the option that the attribute modifies.
19183 This is needed for attributes that mirror the behavior of a command-line
19184 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
19185 aarch64_attr_enum. */
19187 struct aarch64_attribute_info
19190 enum aarch64_attr_opt_type attr_type
;
19192 bool (*handler
) (const char *);
19193 enum opt_code opt_num
;
19196 /* Handle the ARCH_STR argument to the arch= target attribute. */
19199 aarch64_handle_attr_arch (const char *str
)
19201 const struct processor
*tmp_arch
= NULL
;
19202 std::string invalid_extension
;
19203 aarch64_feature_flags tmp_flags
;
19204 enum aarch_parse_opt_result parse_res
19205 = aarch64_parse_arch (str
, &tmp_arch
, &tmp_flags
, &invalid_extension
);
19207 if (parse_res
== AARCH_PARSE_OK
)
19209 gcc_assert (tmp_arch
);
19210 selected_arch
= tmp_arch
->arch
;
19211 aarch64_set_asm_isa_flags (tmp_flags
| AARCH64_ISA_MODE
);
19217 case AARCH_PARSE_MISSING_ARG
:
19218 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19220 case AARCH_PARSE_INVALID_ARG
:
19221 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str
);
19222 aarch64_print_hint_for_arch (str
);
19224 case AARCH_PARSE_INVALID_FEATURE
:
19225 error ("invalid feature modifier %s of value %qs in "
19226 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19227 aarch64_print_hint_for_extensions (invalid_extension
);
19230 gcc_unreachable ();
19236 /* Handle the argument CPU_STR to the cpu= target attribute. */
19239 aarch64_handle_attr_cpu (const char *str
)
19241 const struct processor
*tmp_cpu
= NULL
;
19242 std::string invalid_extension
;
19243 aarch64_feature_flags tmp_flags
;
19244 enum aarch_parse_opt_result parse_res
19245 = aarch64_parse_cpu (str
, &tmp_cpu
, &tmp_flags
, &invalid_extension
);
19247 if (parse_res
== AARCH_PARSE_OK
)
19249 gcc_assert (tmp_cpu
);
19250 selected_tune
= tmp_cpu
->ident
;
19251 selected_arch
= tmp_cpu
->arch
;
19252 aarch64_set_asm_isa_flags (tmp_flags
| AARCH64_ISA_MODE
);
19258 case AARCH_PARSE_MISSING_ARG
:
19259 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19261 case AARCH_PARSE_INVALID_ARG
:
19262 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str
);
19263 aarch64_print_hint_for_core (str
);
19265 case AARCH_PARSE_INVALID_FEATURE
:
19266 error ("invalid feature modifier %qs of value %qs in "
19267 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19268 aarch64_print_hint_for_extensions (invalid_extension
);
19271 gcc_unreachable ();
19277 /* Handle the argument STR to the branch-protection= attribute. */
19280 aarch64_handle_attr_branch_protection (const char* str
)
19282 return aarch_validate_mbranch_protection (aarch64_branch_protect_types
, str
,
19283 "target(\"branch-protection=\")");
19286 /* Handle the argument STR to the tune= target attribute. */
19289 aarch64_handle_attr_tune (const char *str
)
19291 const struct processor
*tmp_tune
= NULL
;
19292 enum aarch_parse_opt_result parse_res
19293 = aarch64_parse_tune (str
, &tmp_tune
);
19295 if (parse_res
== AARCH_PARSE_OK
)
19297 gcc_assert (tmp_tune
);
19298 selected_tune
= tmp_tune
->ident
;
19304 case AARCH_PARSE_INVALID_ARG
:
19305 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str
);
19306 aarch64_print_hint_for_core (str
);
19309 gcc_unreachable ();
19315 /* Parse an architecture extensions target attribute string specified in STR.
19316 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19317 if successful. Update aarch64_isa_flags to reflect the ISA features
19321 aarch64_handle_attr_isa_flags (char *str
)
19323 enum aarch_parse_opt_result parse_res
;
19324 auto isa_flags
= aarch64_asm_isa_flags
;
19326 /* We allow "+nothing" in the beginning to clear out all architectural
19327 features if the user wants to handpick specific features. */
19328 if (strncmp ("+nothing", str
, 8) == 0)
19330 isa_flags
= AARCH64_ISA_MODE
;
19334 std::string invalid_extension
;
19335 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
19337 if (parse_res
== AARCH_PARSE_OK
)
19339 aarch64_set_asm_isa_flags (isa_flags
);
19345 case AARCH_PARSE_MISSING_ARG
:
19346 error ("missing value in %<target()%> pragma or attribute");
19349 case AARCH_PARSE_INVALID_FEATURE
:
19350 error ("invalid feature modifier %qs of value %qs in "
19351 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19355 gcc_unreachable ();
19361 /* The target attributes that we support. On top of these we also support just
19362 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19363 handled explicitly in aarch64_process_one_target_attr. */
19365 static const struct aarch64_attribute_info aarch64_attributes
[] =
19367 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
19368 OPT_mgeneral_regs_only
},
19369 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
19370 OPT_mfix_cortex_a53_835769
},
19371 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
19372 OPT_mfix_cortex_a53_843419
},
19373 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
19374 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
19375 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
19376 OPT_momit_leaf_frame_pointer
},
19377 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
19378 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
19380 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
19381 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
19383 { "branch-protection", aarch64_attr_custom
, false,
19384 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
19385 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
19386 OPT_msign_return_address_
},
19387 { "outline-atomics", aarch64_attr_bool
, true, NULL
,
19388 OPT_moutline_atomics
},
19389 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
19392 /* Parse ARG_STR which contains the definition of one target attribute.
19393 Show appropriate errors if any or return true if the attribute is valid. */
19396 aarch64_process_one_target_attr (char *arg_str
)
19398 bool invert
= false;
19400 size_t len
= strlen (arg_str
);
19404 error ("malformed %<target()%> pragma or attribute");
19408 auto_vec
<char, 32> buffer
;
19409 buffer
.safe_grow (len
+ 1);
19410 char *str_to_check
= buffer
.address ();
19411 memcpy (str_to_check
, arg_str
, len
+ 1);
19413 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19414 It is easier to detect and handle it explicitly here rather than going
19415 through the machinery for the rest of the target attributes in this
19417 if (*str_to_check
== '+')
19418 return aarch64_handle_attr_isa_flags (str_to_check
);
19420 if (len
> 3 && startswith (str_to_check
, "no-"))
19425 char *arg
= strchr (str_to_check
, '=');
19427 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19428 and point ARG to "foo". */
19434 const struct aarch64_attribute_info
*p_attr
;
19435 bool found
= false;
19436 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
19438 /* If the names don't match up, or the user has given an argument
19439 to an attribute that doesn't accept one, or didn't give an argument
19440 to an attribute that expects one, fail to match. */
19441 if (strcmp (str_to_check
, p_attr
->name
) != 0)
19445 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
19446 || p_attr
->attr_type
== aarch64_attr_enum
;
19448 if (attr_need_arg_p
^ (arg
!= NULL
))
19450 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
19454 /* If the name matches but the attribute does not allow "no-" versions
19455 then we can't match. */
19456 if (invert
&& !p_attr
->allow_neg
)
19458 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
19462 switch (p_attr
->attr_type
)
19464 /* Has a custom handler registered.
19465 For example, cpu=, arch=, tune=. */
19466 case aarch64_attr_custom
:
19467 gcc_assert (p_attr
->handler
);
19468 if (!p_attr
->handler (arg
))
19472 /* Either set or unset a boolean option. */
19473 case aarch64_attr_bool
:
19475 struct cl_decoded_option decoded
;
19477 generate_option (p_attr
->opt_num
, NULL
, !invert
,
19478 CL_TARGET
, &decoded
);
19479 aarch64_handle_option (&global_options
, &global_options_set
,
19480 &decoded
, input_location
);
19483 /* Set or unset a bit in the target_flags. aarch64_handle_option
19484 should know what mask to apply given the option number. */
19485 case aarch64_attr_mask
:
19487 struct cl_decoded_option decoded
;
19488 /* We only need to specify the option number.
19489 aarch64_handle_option will know which mask to apply. */
19490 decoded
.opt_index
= p_attr
->opt_num
;
19491 decoded
.value
= !invert
;
19492 aarch64_handle_option (&global_options
, &global_options_set
,
19493 &decoded
, input_location
);
19496 /* Use the option setting machinery to set an option to an enum. */
19497 case aarch64_attr_enum
:
19502 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
19503 &value
, CL_TARGET
);
19506 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
19507 NULL
, DK_UNSPECIFIED
, input_location
,
19512 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
19517 gcc_unreachable ();
19521 /* If we reached here we either have found an attribute and validated
19522 it or didn't match any. If we matched an attribute but its arguments
19523 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  STR is not modified (parameter is
   const-qualified; callers passing char * are unaffected).  */

static unsigned int
num_occurences_in_str (char c, const char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  Return true on success,
   false (after emitting a diagnostic) on any malformed input.  */

bool
aarch64_process_target_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      /* A TREE_LIST argument: process each value in the chain in turn,
	 failing fast on the first invalid entry.  */
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    {
	      if (!aarch64_process_target_attr (head))
		return false;
	    }
	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  /* Copy the attribute string into a mutable buffer, since strtok_r
     below modifies it in place.  */
  size_t len = strlen (TREE_STRING_POINTER (args));
  auto_vec<char, 32> buffer;
  buffer.safe_grow (len + 1);
  char *str_to_check = buffer.address ();
  memcpy (str_to_check, TREE_STRING_POINTER (args), len + 1);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok_r (str_to_check, ",", &str_to_check);
  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
	{
	  /* Check if token is possibly an arch extension without
	     leading '+'.  */
	  aarch64_feature_flags isa_temp = 0;
	  auto with_plus = std::string ("+") + token;
	  enum aarch_parse_opt_result ext_res
	    = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);

	  if (ext_res == AARCH_PARSE_OK)
	    error ("arch extension %<%s%> should be prefixed by %<+%>",
		   token);
	  else
	    error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
	  return false;
	}

      token = strtok_r (NULL, ",", &str_to_check);
    }

  /* An attribute count differing from commas+1 means an empty entry
     (e.g. "attr1,,attr2") slipped through strtok_r, which skips empty
     tokens.  */
  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
19625 static bool aarch64_process_target_version_attr (tree args
);
/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
   process attribute ((target ("..."))).  Returns true if the attribute
   was parsed and applied successfully.  */

static bool
aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
{
  struct cl_target_option cur_target;
  bool ret;
  tree old_optimize;
  tree new_target, new_optimize;
  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If what we're processing is the current pragma string then the
     target option node is already stored in target_option_current_node
     by aarch64_pragma_target_parse in aarch64-c.cc.  Use that to avoid
     having to re-parse the string.  This is especially useful to keep
     arm_neon.h compile times down since that header contains a lot
     of intrinsics enclosed in pragmas.  */
  if (!existing_target && args == current_target_pragma)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
      return true;
    }
  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  old_optimize
    = build_optimization_node (&global_options, &global_options_set);
  func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  /* If the function changed the optimization levels as well as setting
     target options, start with the optimizations specified.  */
  if (func_optimize && func_optimize != old_optimize)
    cl_optimization_restore (&global_options, &global_options_set,
			     TREE_OPTIMIZATION (func_optimize));

  /* Save the current target options to restore at the end.  */
  cl_target_option_save (&cur_target, &global_options, &global_options_set);

  /* If fndecl already has some target attributes applied to it, unpack
     them so that we add this attribute on top of them, rather than
     overwriting them.  */
  if (existing_target)
    {
      struct cl_target_option *existing_options
	= TREE_TARGET_OPTION (existing_target);

      if (existing_options)
	cl_target_option_restore (&global_options, &global_options_set,
				  existing_options);
    }
  else
    cl_target_option_restore (&global_options, &global_options_set,
			      TREE_TARGET_OPTION (target_option_current_node));

  ret = aarch64_process_target_attr (args);
  if (ret)
    {
      tree version_attr = lookup_attribute ("target_version",
					    DECL_ATTRIBUTES (fndecl));
      if (version_attr != NULL_TREE)
	{
	  /* Reapply any target_version attribute after target attribute.
	     This should be equivalent to applying the target_version once
	     after processing all target attributes.  */
	  tree version_args = TREE_VALUE (version_attr);
	  ret = aarch64_process_target_version_attr (version_args);
	}
    }

  /* Set up any additional state.  */
  if (ret)
    {
      aarch64_override_options_internal (&global_options);
      new_target = build_target_option_node (&global_options,
					     &global_options_set);
    }
  else
    new_target = NULL;

  new_optimize = build_optimization_node (&global_options,
					  &global_options_set);

  if (fndecl && ret)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

      if (old_optimize != new_optimize)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  /* Restore the global options that were in force on entry; the
     function-specific ones live on in the nodes attached to FNDECL.  */
  cl_target_option_restore (&global_options, &global_options_set, &cur_target);

  if (old_optimize != new_optimize)
    cl_optimization_restore (&global_options, &global_options_set,
			     TREE_OPTIMIZATION (old_optimize));
  return ret;
}
/* Mask type for function multiversioning (FMV) feature bits; one bit per
   FMV feature, as enumerated in aarch64-option-extensions.def.  */
typedef unsigned long long aarch64_fmv_feature_mask;

typedef struct
{
  const char *name;			/* FMV feature name string.  */
  aarch64_fmv_feature_mask feature_mask; /* Single bit identifying the feature.  */
  aarch64_feature_flags opt_flags;	/* ISA flags implied by the feature.  */
} aarch64_fmv_feature_datum;

#define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
  {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},

/* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
   feature_deps name.  */
#define FEAT_RDMA FEAT_RDM

/* FMV features are listed in priority order, to make it easier to sort target
   strings.  */
static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
#include "config/aarch64/aarch64-option-extensions.def"
};
/* Parse a function multiversioning feature string STR, as found in a
   target_version or target_clones attribute.

   If ISA_FLAGS is nonnull, then update it with the specified architecture
   features turned on.  If FEATURE_MASK is nonnull, then assign to it a bitmask
   representing the set of features explicitly specified in the feature string.
   Return an aarch_parse_opt_result describing the result.

   When the STR string contains an invalid or duplicate extension, a copy of
   the extension string is created and stored to INVALID_EXTENSION.  */

static enum aarch_parse_opt_result
aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
			    aarch64_fmv_feature_mask *feature_mask,
			    std::string *invalid_extension)
{
  if (feature_mask)
    *feature_mask = 0ULL;

  /* "default" names the version with no features; nothing to accumulate.  */
  if (strcmp (str, "default") == 0)
    return AARCH_PARSE_OK;

  while (str != NULL && *str != 0)
    {
      const char *ext;
      size_t len;

      /* Features are separated by '+'; LEN is the length of the current
	 feature name.  */
      ext = strchr (str, '+');

      if (ext != NULL)
	len = ext - str;
      else
	len = strlen (str);

      if (len == 0)
	return AARCH_PARSE_MISSING_ARG;

      int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
      int i;
      for (i = 0; i < num_features; i++)
	{
	  if (strlen (aarch64_fmv_feature_data[i].name) == len
	      && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
	    {
	      if (isa_flags)
		*isa_flags |= aarch64_fmv_feature_data[i].opt_flags;

	      if (feature_mask)
		{
		  auto old_feature_mask = *feature_mask;
		  *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
		  /* An unchanged mask means the bit was already set.  */
		  if (*feature_mask == old_feature_mask)
		    {
		      /* Duplicate feature.  */
		      if (invalid_extension)
			*invalid_extension = std::string (str, len);
		      return AARCH_PARSE_DUPLICATE_FEATURE;
		    }
		}
	      break;
	    }
	}

      if (i == num_features)
	{
	  /* Feature not found in list.  */
	  if (invalid_extension)
	    *invalid_extension = std::string (str, len);
	  return AARCH_PARSE_INVALID_FEATURE;
	}

      str = ext;
      if (str)
	/* Skip over the next '+'.  */
	str++;
    }

  return AARCH_PARSE_OK;
}
/* Parse the tree in ARGS that contains the target_version attribute
   information and update the global target options space.  Return true on
   success; emit a diagnostic and return false otherwise.  */

static bool
aarch64_process_target_version_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      /* target_version takes exactly one string value.  */
      if (TREE_CHAIN (args))
	{
	  error ("attribute %<target_version%> has multiple values");
	  return false;
	}
      args = TREE_VALUE (args);
    }

  if (!args || TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target_version%> argument not a string");
      return false;
    }

  const char *str = TREE_STRING_POINTER (args);

  enum aarch_parse_opt_result parse_res;
  auto isa_flags = aarch64_asm_isa_flags;

  std::string invalid_extension;
  parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
					  &invalid_extension);

  if (parse_res == AARCH_PARSE_OK)
    {
      aarch64_set_asm_isa_flags (isa_flags);
      return true;
    }

  switch (parse_res)
    {
    case AARCH_PARSE_MISSING_ARG:
      error ("missing value in %<target_version%> attribute");
      break;

    case AARCH_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs of value %qs in "
	     "%<target_version%> attribute", invalid_extension.c_str (),
	     str);
      break;

    case AARCH_PARSE_DUPLICATE_FEATURE:
      error ("duplicate feature modifier %qs of value %qs in "
	     "%<target_version%> attribute", invalid_extension.c_str (),
	     str);
      break;

    default:
      gcc_unreachable ();
    }

  return false;
}
/* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P.  This is used to
   process attribute ((target_version ("...")));  returns true if the
   attribute was parsed and applied successfully.  */

static bool
aarch64_option_valid_version_attribute_p (tree fndecl, tree,
					  tree args, int)
{
  struct cl_target_option cur_target;
  bool ret;
  tree new_target;
  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* Save the current target options to restore at the end.  */
  cl_target_option_save (&cur_target, &global_options, &global_options_set);

  /* If fndecl already has some target attributes applied to it, unpack
     them so that we add this attribute on top of them, rather than
     overwriting them.  */
  if (existing_target)
    {
      struct cl_target_option *existing_options
	= TREE_TARGET_OPTION (existing_target);

      if (existing_options)
	cl_target_option_restore (&global_options, &global_options_set,
				  existing_options);
    }
  else
    cl_target_option_restore (&global_options, &global_options_set,
			      TREE_TARGET_OPTION (target_option_current_node));

  ret = aarch64_process_target_version_attr (args);

  /* Set up any additional state.  */
  if (ret)
    {
      aarch64_override_options_internal (&global_options);
      new_target = build_target_option_node (&global_options,
					     &global_options_set);
    }
  else
    new_target = NULL;

  if (fndecl && ret)
    DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

  /* Restore the options in force on entry; the function-specific ones
     are preserved via DECL_FUNCTION_SPECIFIC_TARGET.  */
  cl_target_option_restore (&global_options, &global_options_set, &cur_target);

  return ret;
}
/* This parses the attribute arguments to target_version in DECL and the
   feature mask required to select those targets.  No adjustments are made to
   add or remove redundant feature requirements.  Returns 0 (the default
   version's mask) when DECL has no target_version attribute.  */

static aarch64_fmv_feature_mask
get_feature_mask_for_version (tree decl)
{
  tree version_attr = lookup_attribute ("target_version",
					DECL_ATTRIBUTES (decl));
  if (version_attr == NULL)
    return 0;

  const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
						    (version_attr)));
  enum aarch_parse_opt_result parse_res;
  aarch64_fmv_feature_mask feature_mask;

  /* Only the explicit feature mask is wanted here, not the ISA flags.  */
  parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
					  NULL);

  /* We should have detected any errors before getting here.  */
  gcc_assert (parse_res == AARCH_PARSE_OK);

  return feature_mask;
}
/* Compare priorities of two feature masks.  Return:
     1: mask1 is higher priority
    -1: mask2 is higher priority
     0: masks are equal.
   Masks with more features set win; ties are broken by the highest-priority
   differing feature (the feature table is in ascending priority order, so
   scan it from the end).  */

static int
compare_feature_masks (aarch64_fmv_feature_mask mask1,
		       aarch64_fmv_feature_mask mask2)
{
  /* More explicitly-required features means higher priority.  */
  int pop1 = popcount_hwi (mask1);
  int pop2 = popcount_hwi (mask2);
  if (pop1 > pop2)
    return 1;
  if (pop2 > pop1)
    return -1;

  auto diff_mask = mask1 ^ mask2;
  if (diff_mask == 0ULL)
    return 0;
  int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
  for (int i = num_features - 1; i >= 0; i--)
    {
      auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
      if (diff_mask & bit_mask)
	return (mask1 & bit_mask) ? 1 : -1;
    }
  gcc_unreachable ();
}
19993 /* Compare priorities of two version decls. */
19996 aarch64_compare_version_priority (tree decl1
, tree decl2
)
19998 auto mask1
= get_feature_mask_for_version (decl1
);
19999 auto mask2
= get_feature_mask_for_version (decl2
);
20001 return compare_feature_masks (mask1
, mask2
);
/* Build the struct __ifunc_arg_t type:

   struct __ifunc_arg_t
   {
     unsigned long _size; // Size of the struct, so it can grow.
     unsigned long _hwcap;
     unsigned long _hwcap2;
   };

   Returns a pointer-to-const type for the struct, as passed to ifunc
   resolvers by glibc on AArch64.  */

static tree
build_ifunc_arg_type ()
{
  tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
  tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
			    get_identifier ("_size"),
			    long_unsigned_type_node);
  tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
			    get_identifier ("_hwcap"),
			    long_unsigned_type_node);
  tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
			    get_identifier ("_hwcap2"),
			    long_unsigned_type_node);

  DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
  DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
  DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;

  /* Chain the three fields onto the record and lay it out.  */
  TYPE_FIELDS (ifunc_arg_type) = field1;
  DECL_CHAIN (field1) = field2;
  DECL_CHAIN (field2) = field3;

  layout_type (ifunc_arg_type);

  tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
  tree pointer_type = build_pointer_type (const_type);

  return pointer_type;
}
/* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
   suffixes to the assembler name of versioned functions.  Non-versioned
   decls are returned unchanged.  */

tree
aarch64_mangle_decl_assembler_name (tree decl, tree id)
{
  /* For function version, add the target suffix to the assembler name.  */
  if (TREE_CODE (decl) == FUNCTION_DECL
      && DECL_FUNCTION_VERSIONED (decl))
    {
      aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);

      std::string name = IDENTIFIER_POINTER (id);

      /* For the default version, append ".default".  */
      if (feature_mask == 0ULL)
	{
	  name += ".default";
	  return get_identifier (name.c_str());
	}

      name += "._";

      /* Append "M<feature>" for each feature bit, in table (priority)
	 order, giving the "._MfeatAMfeatB" mangling scheme.  */
      int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
      for (int i = 0; i < num_features; i++)
	{
	  if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
	    {
	      name += "M";
	      name += aarch64_fmv_feature_data[i].name;
	    }
	}

      /* Discard any cached RTL so it is regenerated with the new name.  */
      if (DECL_ASSEMBLER_NAME_SET_P (decl))
	SET_DECL_RTL (decl, NULL);

      id = get_identifier (name.c_str());
    }
  return id;
}
/* Return an identifier for the base assembler name of a versioned function.
   This is computed by taking the default version's assembler name, and
   stripping off the ".default" suffix if it's already been appended.
   SUFFIX is then appended (it may be the empty string).  */

static tree
get_suffixed_assembler_name (tree default_decl, const char *suffix)
{
  std::string name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl));

  auto size = name.size ();
  if (size >= 8 && name.compare (size - 8, 8, ".default") == 0)
    name.resize (size - 8);
  name += suffix;
  return get_identifier (name.c_str());
}
/* Make the resolver function decl to dispatch the versions of
   a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is
   ifunc alias that will point to the created resolver.  Create an
   empty basic block in the resolver and store the pointer in
   EMPTY_BB.  Return the decl of the resolver function.  */

static tree
make_resolver_func (const tree default_decl,
		    const tree ifunc_alias_decl,
		    basic_block *empty_bb)
{
  tree decl, type, t;

  /* Create resolver function name based on default_decl.  We need to remove an
     existing ".default" suffix if this has already been appended.  */
  tree decl_name = get_suffixed_assembler_name (default_decl, ".resolver");
  const char *resolver_name = IDENTIFIER_POINTER (decl_name);

  /* The resolver function should have signature
     (void *) resolver (uint64_t, const __ifunc_arg_t *) */
  type = build_function_type_list (ptr_type_node,
				   uint64_type_node,
				   build_ifunc_arg_type (),
				   NULL_TREE);

  decl = build_fn_decl (resolver_name, type);
  SET_DECL_ASSEMBLER_NAME (decl, decl_name);

  DECL_NAME (decl) = decl_name;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 1;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;

  /* Resolver is not external, body is generated.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_EXTERNAL (ifunc_alias_decl) = 0;

  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  DECL_STATIC_CONSTRUCTOR (decl) = 0;

  if (DECL_COMDAT_GROUP (default_decl)
      || TREE_PUBLIC (default_decl))
    {
      /* In this case, each translation unit with a call to this
	 versioned function will put out a resolver.  Ensure it
	 is comdat to keep just one copy.  */
      DECL_COMDAT (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }
  else
    TREE_PUBLIC (ifunc_alias_decl) = 0;

  /* Build result decl and add to function_decl.  */
  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
  DECL_CONTEXT (t) = decl;
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  /* Build parameter decls and add to function_decl.  */
  tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
			  get_identifier ("hwcap"),
			  uint64_type_node);
  tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
			  get_identifier ("arg"),
			  build_ifunc_arg_type());
  DECL_CONTEXT (arg1) = decl;
  DECL_CONTEXT (arg2) = decl;
  DECL_ARTIFICIAL (arg1) = 1;
  DECL_ARTIFICIAL (arg2) = 1;
  DECL_IGNORED_P (arg1) = 1;
  DECL_IGNORED_P (arg2) = 1;
  DECL_ARG_TYPE (arg1) = uint64_type_node;
  DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
  DECL_ARGUMENTS (decl) = arg1;
  TREE_CHAIN (arg1) = arg2;

  /* Give the resolver a lowered, empty body ready for the dispatch
     conditions to be inserted.  */
  gimplify_function_tree (decl);
  push_cfun (DECL_STRUCT_FUNCTION (decl));
  *empty_bb = init_lowered_empty_function (decl, false,
					   profile_count::uninitialized ());

  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));

  pop_cfun ();

  gcc_assert (ifunc_alias_decl != NULL);
  /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
  DECL_ATTRIBUTES (ifunc_alias_decl)
    = make_attribute ("ifunc", resolver_name,
		      DECL_ATTRIBUTES (ifunc_alias_decl));

  /* Create the alias for dispatch to resolver here.  */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
  return decl;
}
/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
   to return a pointer to VERSION_DECL if all feature bits specified in
   FEATURE_MASK are not set in MASK_VAR.  This function will be called during
   version dispatch to decide which function version to execute.  It returns
   the basic block at the end, to which more conditions can be added.
   NOTE: MASK_VAR holds the *inverted* CPU feature word (see
   dispatch_function_versions), so "mask_var & feature_mask == 0" means all
   required features are present.  */

static basic_block
add_condition_to_bb (tree function_decl, tree version_decl,
		     aarch64_fmv_feature_mask feature_mask,
		     tree mask_var, basic_block new_bb)
{
  gimple *return_stmt;
  tree convert_expr, result_var;
  gimple *convert_stmt;
  gimple *if_else_stmt;

  basic_block bb1, bb2, bb3;
  edge e12, e23;

  gimple_seq gseq;

  push_cfun (DECL_STRUCT_FUNCTION (function_decl));

  gcc_assert (new_bb != NULL);
  gseq = bb_seq (new_bb);

  /* Build "result_var = (void *) &version_decl; return result_var;".  */
  convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
			 build_fold_addr_expr (version_decl));
  result_var = create_tmp_var (ptr_type_node);
  convert_stmt = gimple_build_assign (result_var, convert_expr);
  return_stmt = gimple_build_return (result_var);

  if (feature_mask == 0ULL)
    {
      /* Default version.  Returned unconditionally; no new block needed.  */
      gimple_seq_add_stmt (&gseq, convert_stmt);
      gimple_seq_add_stmt (&gseq, return_stmt);
      set_bb_seq (new_bb, gseq);
      gimple_set_bb (convert_stmt, new_bb);
      gimple_set_bb (return_stmt, new_bb);
      pop_cfun ();
      return new_bb;
    }

  /* and_expr_var = mask_var & feature_mask.  */
  tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
  tree and_expr = build2 (BIT_AND_EXPR,
			  long_long_unsigned_type_node,
			  mask_var,
			  build_int_cst (long_long_unsigned_type_node,
					 feature_mask));
  gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
  gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
  gimple_set_bb (and_stmt, new_bb);
  gimple_seq_add_stmt (&gseq, and_stmt);

  /* if (and_expr_var == 0) -> return this version.  */
  tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
  if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
				    NULL_TREE, NULL_TREE);
  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
  gimple_set_bb (if_else_stmt, new_bb);
  gimple_seq_add_stmt (&gseq, if_else_stmt);

  gimple_seq_add_stmt (&gseq, convert_stmt);
  gimple_seq_add_stmt (&gseq, return_stmt);
  set_bb_seq (new_bb, gseq);

  /* Split the straight-line sequence into cond-bb / return-bb /
     fallthrough-bb and wire the edges up accordingly.  */
  bb1 = new_bb;
  e12 = split_block (bb1, if_else_stmt);
  bb2 = e12->dest;
  e12->flags &= ~EDGE_FALLTHRU;
  e12->flags |= EDGE_TRUE_VALUE;

  e23 = split_block (bb2, return_stmt);

  gimple_set_bb (convert_stmt, bb2);
  gimple_set_bb (return_stmt, bb2);

  bb3 = e23->dest;
  make_edge (bb1, bb3, EDGE_FALSE_VALUE);

  remove_edge (e23);
  make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);

  pop_cfun ();

  return bb3;
}
/* This function generates the dispatch function for
   multi-versioned functions.  DISPATCH_DECL is the function which will
   contain the dispatch logic.  FNDECLS are the function choices for
   dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
   in DISPATCH_DECL in which the dispatch code is generated.  */

static int
dispatch_function_versions (tree dispatch_decl,
			    void *fndecls_p,
			    basic_block *empty_bb)
{
  gimple *ifunc_cpu_init_stmt;
  gimple_seq gseq;
  vec<tree> *fndecls;

  gcc_assert (dispatch_decl != NULL
	      && fndecls_p != NULL
	      && empty_bb != NULL);

  push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));

  gseq = bb_seq (*empty_bb);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicity call __init_cpu_features_resolver here.  */
  tree init_fn_type = build_function_type_list (void_type_node,
						long_unsigned_type_node,
						build_ifunc_arg_type(),
						NULL);
  tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
  tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
				  init_fn_id, init_fn_type);
  DECL_EXTERNAL (init_fn_decl) = 1;
  /* Forward the resolver's own (hwcap, __ifunc_arg_t *) arguments.  */
  tree arg1 = DECL_ARGUMENTS (dispatch_decl);
  tree arg2 = TREE_CHAIN (arg1);
  ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);

  /* Build the struct type for __aarch64_cpu_features.  */
  tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
  tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
			    get_identifier ("features"),
			    long_long_unsigned_type_node);
  DECL_FIELD_CONTEXT (field1) = global_type;
  TYPE_FIELDS (global_type) = field1;
  layout_type (global_type);

  tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				get_identifier ("__aarch64_cpu_features"),
				global_type);
  DECL_EXTERNAL (global_var) = 1;
  tree mask_var = create_tmp_var (long_long_unsigned_type_node);

  /* mask_var = __aarch64_cpu_features.features;  */
  tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
				global_var, field1, NULL_TREE);
  gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
  gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
  gimple_set_bb (component_stmt, *empty_bb);
  gimple_seq_add_stmt (&gseq, component_stmt);

  /* mask_var = ~mask_var; so that add_condition_to_bb can test
     "required features missing" with a single AND.  */
  tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
  gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
  gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
  gimple_set_bb (not_stmt, *empty_bb);
  gimple_seq_add_stmt (&gseq, not_stmt);

  set_bb_seq (*empty_bb, gseq);

  pop_cfun ();

  /* fndecls_p is actually a vector.  */
  fndecls = static_cast<vec<tree> *> (fndecls_p);

  /* At least one more version other than the default.  */
  unsigned int num_versions = fndecls->length ();
  gcc_assert (num_versions >= 2);

  struct function_version_info
    {
      tree version_decl;
      aarch64_fmv_feature_mask feature_mask;
    } *function_versions;

  function_versions = (struct function_version_info *)
    XNEWVEC (struct function_version_info, (num_versions));

  unsigned int actual_versions = 0;

  for (tree version_decl : *fndecls)
    {
      aarch64_fmv_feature_mask feature_mask;
      /* Get attribute string, parse it and find the right features.  */
      feature_mask = get_feature_mask_for_version (version_decl);
      function_versions [actual_versions].version_decl = version_decl;
      function_versions [actual_versions].feature_mask = feature_mask;
      actual_versions++;
    }

  auto compare_feature_version_info = [](const void *p1, const void *p2) {
    const function_version_info v1 = *(const function_version_info *)p1;
    const function_version_info v2 = *(const function_version_info *)p2;
    return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
  };

  /* Sort the versions according to descending order of dispatch priority.  */
  qsort (function_versions, actual_versions,
	 sizeof (struct function_version_info), compare_feature_version_info);

  /* Emit one guarded return per version; the default version (mask 0)
     sorts last and terminates the chain unconditionally.  */
  for (unsigned int i = 0; i < actual_versions; ++i)
    *empty_bb = add_condition_to_bb (dispatch_decl,
				     function_versions[i].version_decl,
				     function_versions[i].feature_mask,
				     mask_var,
				     *empty_bb);

  free (function_versions);
  return 0;
}
/* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY.  Build the ifunc
   resolver body for the dispatcher cgraph node NODE_P and return the
   resolver's decl.  */

tree
aarch64_generate_version_dispatcher_body (void *node_p)
{
  tree resolver_decl;
  basic_block empty_bb;
  tree default_ver_decl;
  struct cgraph_node *versn;
  struct cgraph_node *node;

  struct cgraph_function_version_info *node_version_info = NULL;
  struct cgraph_function_version_info *versn_info = NULL;

  node = (cgraph_node *)node_p;

  node_version_info = node->function_version ();
  gcc_assert (node->dispatcher_function
	      && node_version_info != NULL);

  /* Only build the resolver once per dispatcher.  */
  if (node_version_info->dispatcher_resolver)
    return node_version_info->dispatcher_resolver;

  /* The first version in the chain corresponds to the default version.  */
  default_ver_decl = node_version_info->next->this_node->decl;

  /* node is going to be an alias, so remove the finalized bit.  */
  node->definition = false;

  resolver_decl = make_resolver_func (default_ver_decl,
				      node->decl, &empty_bb);

  node_version_info->dispatcher_resolver = resolver_decl;

  push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));

  auto_vec<tree, 2> fn_ver_vec;

  for (versn_info = node_version_info->next; versn_info;
       versn_info = versn_info->next)
    {
      versn = versn_info->this_node;
      /* Check for virtual functions here again, as by this time it should
	 have been determined if this function needs a vtable index or
	 not.  This happens for methods in derived classes that override
	 virtual methods in base classes but are not explicitly marked as
	 virtual.  */
      if (DECL_VINDEX (versn->decl))
	sorry ("virtual function multiversioning not supported");

      fn_ver_vec.safe_push (versn->decl);
    }

  dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
  cgraph_edge::rebuild_edges ();
  pop_cfun ();

  /* Fix up symbol names.  First we need to obtain the base name, which may
     have already been mangled.  */
  tree base_name = get_suffixed_assembler_name (default_ver_decl, "");

  /* We need to redo the version mangling on the non-default versions for the
     target_clones case.  Redoing the mangling for the target_version case is
     redundant but does no harm.  We need to skip the default version, because
     expand_clones will append ".default" later; fortunately that suffix is the
     one we want anyway.  */
  for (versn_info = node_version_info->next->next; versn_info;
       versn_info = versn_info->next)
    {
      tree version_decl = versn_info->this_node->decl;
      tree name = aarch64_mangle_decl_assembler_name (version_decl,
						      base_name);
      symtab->change_decl_assembler_name (version_decl, name);
    }

  /* We also need to use the base name for the ifunc declaration.  */
  symtab->change_decl_assembler_name (node->decl, base_name);

  return resolver_decl;
}
/* Make a dispatcher declaration for the multi-versioned function DECL.
   Calls to DECL function will be replaced with calls to the dispatcher
   by the front-end.  Returns the decl of the dispatcher function.  */

tree
aarch64_get_function_versions_dispatcher (void *decl)
{
  tree fn = (tree) decl;
  struct cgraph_node *node = NULL;
  struct cgraph_node *default_node = NULL;
  struct cgraph_function_version_info *node_v = NULL;
  struct cgraph_function_version_info *first_v = NULL;

  tree dispatch_decl = NULL;

  struct cgraph_function_version_info *default_version_info = NULL;

  gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));

  node = cgraph_node::get (fn);
  gcc_assert (node != NULL);

  node_v = node->function_version ();
  gcc_assert (node_v != NULL);

  /* Reuse an already-created dispatcher.  */
  if (node_v->dispatcher_resolver != NULL)
    return node_v->dispatcher_resolver;

  /* Find the default version and make it the first node.  */
  first_v = node_v;
  /* Go to the beginning of the chain.  */
  while (first_v->prev != NULL)
    first_v = first_v->prev;
  default_version_info = first_v;
  /* Scan forward for the version whose feature mask is 0 (the default).  */
  while (default_version_info != NULL)
    {
      if (get_feature_mask_for_version
	    (default_version_info->this_node->decl) == 0ULL)
	break;
      default_version_info = default_version_info->next;
    }

  /* If there is no default node, just return NULL.  */
  if (default_version_info == NULL)
    return NULL;

  /* Make default info the first node.  */
  if (first_v != default_version_info)
    {
      /* Unlink the default from its position and splice it in at the
	 head of the doubly-linked version chain.  */
      default_version_info->prev->next = default_version_info->next;
      if (default_version_info->next)
	default_version_info->next->prev = default_version_info->prev;
      first_v->prev = default_version_info;
      default_version_info->next = first_v;
      default_version_info->prev = NULL;
    }

  default_node = default_version_info->this_node;

  if (targetm.has_ifunc_p ())
    {
      struct cgraph_function_version_info *it_v = NULL;
      struct cgraph_node *dispatcher_node = NULL;
      struct cgraph_function_version_info *dispatcher_version_info = NULL;

      /* Right now, the dispatching is done via ifunc.  */
      dispatch_decl = make_dispatcher_decl (default_node->decl);
      TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);

      dispatcher_node = cgraph_node::get_create (dispatch_decl);
      gcc_assert (dispatcher_node != NULL);
      dispatcher_node->dispatcher_function = 1;
      dispatcher_version_info
	= dispatcher_node->insert_new_function_version ();
      dispatcher_version_info->next = default_version_info;
      dispatcher_node->definition = 1;

      /* Set the dispatcher for all the versions.  */
      it_v = default_version_info;
      while (it_v != NULL)
	{
	  it_v->dispatcher_resolver = dispatch_decl;
	  it_v = it_v->next;
	}
    }
  else
    {
      error_at (DECL_SOURCE_LOCATION (default_node->decl),
		"multiversioning needs %<ifunc%> which is not supported "
		"on this target");
    }

  return dispatch_decl;
}
20583 /* This function returns true if FN1 and FN2 are versions of the same function,
20584 that is, the target_version attributes of the function decls are different.
20585 This assumes that FN1 and FN2 have the same signature. */
20588 aarch64_common_function_versions (tree fn1
, tree fn2
)
20590 if (TREE_CODE (fn1
) != FUNCTION_DECL
20591 || TREE_CODE (fn2
) != FUNCTION_DECL
)
20594 return (aarch64_compare_version_priority (fn1
, fn2
) != 0);
20597 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20598 rather than an opt-in list. */
20601 aarch64_function_attribute_inlinable_p (const_tree fndecl
)
20603 /* A function that has local SME state cannot be inlined into its caller,
20604 since we only support managing PSTATE.ZA switches at function scope. */
20605 return (!aarch64_fndecl_has_new_state (fndecl
, "za")
20606 && !aarch64_fndecl_has_new_state (fndecl
, "zt0"));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  Returns true if inlining
   is allowed with respect to this one option.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
/* Bit allocations for ipa_fn_summary::target_info.  */

/* Set if the function contains a stmt that relies on the function's
   choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
   Not meaningful for streaming-compatible functions.  */
constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;

/* Set if the function clobbers ZA and ZT0.  Not meaningful for functions
   that have ZA state.  (NOTE(review): tail of this comment was truncated
   in the extracted source — confirm against upstream.)  */
constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20642 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20645 aarch64_need_ipa_fn_target_info (const_tree
, unsigned int &)
20647 /* We could in principle skip this for streaming-compatible functions
20648 that have ZA state, but that's a rare combination. */
20652 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20655 aarch64_update_ipa_fn_target_info (unsigned int &info
, const gimple
*stmt
)
20657 if (auto *ga
= dyn_cast
<const gasm
*> (stmt
))
20659 /* We don't know what the asm does, so conservatively assume that
20660 it requires the function's current SM mode. */
20661 info
|= AARCH64_IPA_SM_FIXED
;
20662 for (unsigned int i
= 0; i
< gimple_asm_nclobbers (ga
); ++i
)
20664 tree op
= gimple_asm_clobber_op (ga
, i
);
20665 const char *clobber
= TREE_STRING_POINTER (TREE_VALUE (op
));
20666 if (strcmp (clobber
, "za") == 0)
20667 info
|= AARCH64_IPA_CLOBBERS_ZA
;
20668 if (strcmp (clobber
, "zt0") == 0)
20669 info
|= AARCH64_IPA_CLOBBERS_ZT0
;
20672 if (auto *call
= dyn_cast
<const gcall
*> (stmt
))
20674 if (gimple_call_builtin_p (call
, BUILT_IN_MD
))
20676 /* The attributes on AArch64 builtins are supposed to be accurate.
20677 If the function isn't marked streaming-compatible then it
20678 needs whichever SM mode it selects. */
20679 tree decl
= gimple_call_fndecl (call
);
20680 if (aarch64_fndecl_pstate_sm (decl
) != 0)
20681 info
|= AARCH64_IPA_SM_FIXED
;
20687 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20688 to inline CALLEE into CALLER based on target-specific info.
20689 Make sure that the caller and callee have compatible architectural
20690 features. Then go through the other possible target attributes
20691 and see if they can block inlining. Try not to reject always_inline
20692 callees unless they are incompatible architecturally. */
20695 aarch64_can_inline_p (tree caller
, tree callee
)
20697 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
20698 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
20700 struct cl_target_option
*caller_opts
20701 = TREE_TARGET_OPTION (caller_tree
? caller_tree
20702 : target_option_default_node
);
20704 struct cl_target_option
*callee_opts
20705 = TREE_TARGET_OPTION (callee_tree
? callee_tree
20706 : target_option_default_node
);
20708 /* Callee's ISA flags should be a subset of the caller's. */
20709 auto caller_asm_isa
= (caller_opts
->x_aarch64_asm_isa_flags
20710 & ~AARCH64_FL_ISA_MODES
);
20711 auto callee_asm_isa
= (callee_opts
->x_aarch64_asm_isa_flags
20712 & ~AARCH64_FL_ISA_MODES
);
20713 if (callee_asm_isa
& ~caller_asm_isa
)
20716 auto caller_isa
= (caller_opts
->x_aarch64_isa_flags
20717 & ~AARCH64_FL_ISA_MODES
);
20718 auto callee_isa
= (callee_opts
->x_aarch64_isa_flags
20719 & ~AARCH64_FL_ISA_MODES
);
20720 if (callee_isa
& ~caller_isa
)
20723 /* Return true if the callee might have target_info property PROPERTY.
20724 The answer must be true unless we have positive proof to the contrary. */
20725 auto callee_has_property
= [&](unsigned int property
)
20727 if (ipa_fn_summaries
)
20728 if (auto *summary
= ipa_fn_summaries
->get (cgraph_node::get (callee
)))
20729 if (!(summary
->target_info
& property
))
20734 /* Streaming-compatible code can be inlined into functions with any
20735 PSTATE.SM mode. Otherwise the caller and callee must agree on
20736 PSTATE.SM mode, unless we can prove that the callee is naturally
20737 streaming-compatible. */
20738 auto caller_sm
= (caller_opts
->x_aarch64_isa_flags
& AARCH64_FL_SM_STATE
);
20739 auto callee_sm
= (callee_opts
->x_aarch64_isa_flags
& AARCH64_FL_SM_STATE
);
20741 && caller_sm
!= callee_sm
20742 && callee_has_property (AARCH64_IPA_SM_FIXED
))
20745 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20746 functions from being inlined into others. We also need to prevent
20747 inlining of shared-ZA functions into functions without ZA state,
20748 since this is an error condition.
20750 The only other problematic case for ZA is inlining a function that
20751 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20752 auto caller_za
= (caller_opts
->x_aarch64_isa_flags
& AARCH64_FL_ZA_ON
);
20753 auto callee_za
= (callee_opts
->x_aarch64_isa_flags
& AARCH64_FL_ZA_ON
);
20754 if (!caller_za
&& callee_za
)
20757 && aarch64_fndecl_has_state (caller
, "za")
20758 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA
))
20761 && aarch64_fndecl_has_state (caller
, "zt0")
20762 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0
))
20765 /* Allow non-strict aligned functions inlining into strict
20767 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
20768 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
20769 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
20770 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
20773 bool always_inline
= lookup_attribute ("always_inline",
20774 DECL_ATTRIBUTES (callee
));
20776 /* If the architectural features match up and the callee is always_inline
20777 then the other attributes don't matter. */
20781 if (caller_opts
->x_aarch64_cmodel_var
20782 != callee_opts
->x_aarch64_cmodel_var
)
20785 if (caller_opts
->x_aarch64_tls_dialect
20786 != callee_opts
->x_aarch64_tls_dialect
)
20789 /* Honour explicit requests to workaround errata. */
20790 if (!aarch64_tribools_ok_for_inlining_p (
20791 caller_opts
->x_aarch64_fix_a53_err835769
,
20792 callee_opts
->x_aarch64_fix_a53_err835769
,
20793 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
20796 if (!aarch64_tribools_ok_for_inlining_p (
20797 caller_opts
->x_aarch64_fix_a53_err843419
,
20798 callee_opts
->x_aarch64_fix_a53_err843419
,
20799 2, TARGET_FIX_ERR_A53_843419
))
20802 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20803 caller and calle and they don't match up, reject inlining. */
20804 if (!aarch64_tribools_ok_for_inlining_p (
20805 caller_opts
->x_flag_omit_leaf_frame_pointer
,
20806 callee_opts
->x_flag_omit_leaf_frame_pointer
,
20810 /* If the callee has specific tuning overrides, respect them. */
20811 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
20812 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
20815 /* If the user specified tuning override strings for the
20816 caller and callee and they don't match up, reject inlining.
20817 We just do a string compare here, we don't analyze the meaning
20818 of the string, as it would be too costly for little gain. */
20819 if (callee_opts
->x_aarch64_override_tune_string
20820 && caller_opts
->x_aarch64_override_tune_string
20821 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
20822 caller_opts
->x_aarch64_override_tune_string
) != 0))
20828 /* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't
20832 aarch64_tlsdesc_abi_id ()
20834 predefined_function_abi
&tlsdesc_abi
= function_abis
[ARM_PCS_TLSDESC
];
20835 if (!tlsdesc_abi
.initialized_p ())
20837 HARD_REG_SET full_reg_clobbers
;
20838 CLEAR_HARD_REG_SET (full_reg_clobbers
);
20839 SET_HARD_REG_BIT (full_reg_clobbers
, R0_REGNUM
);
20840 SET_HARD_REG_BIT (full_reg_clobbers
, CC_REGNUM
);
20841 for (int regno
= P0_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
20842 SET_HARD_REG_BIT (full_reg_clobbers
, regno
);
20843 tlsdesc_abi
.initialize (ARM_PCS_TLSDESC
, full_reg_clobbers
);
20845 return ARM_PCS_TLSDESC
;
20848 /* Return true if SYMBOL_REF X binds locally. */
20851 aarch64_symbol_binds_local_p (const_rtx x
)
20853 return (SYMBOL_REF_DECL (x
)
20854 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
20855 : SYMBOL_REF_LOCAL_P (x
));
20858 /* Return true if SYMBOL_REF X is thread local */
20860 aarch64_tls_symbol_p (rtx x
)
20862 if (! TARGET_HAVE_TLS
)
20865 x
= strip_salt (x
);
20866 if (!SYMBOL_REF_P (x
))
20869 return SYMBOL_REF_TLS_MODEL (x
) != 0;
20872 /* Classify a TLS symbol into one of the TLS kinds. */
20873 enum aarch64_symbol_type
20874 aarch64_classify_tls_symbol (rtx x
)
20876 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
20880 case TLS_MODEL_GLOBAL_DYNAMIC
:
20881 case TLS_MODEL_LOCAL_DYNAMIC
:
20882 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
20884 case TLS_MODEL_INITIAL_EXEC
:
20885 switch (aarch64_cmodel
)
20887 case AARCH64_CMODEL_TINY
:
20888 case AARCH64_CMODEL_TINY_PIC
:
20889 return SYMBOL_TINY_TLSIE
;
20891 return SYMBOL_SMALL_TLSIE
;
20894 case TLS_MODEL_LOCAL_EXEC
:
20895 if (aarch64_tls_size
== 12)
20896 return SYMBOL_TLSLE12
;
20897 else if (aarch64_tls_size
== 24)
20898 return SYMBOL_TLSLE24
;
20899 else if (aarch64_tls_size
== 32)
20900 return SYMBOL_TLSLE32
;
20901 else if (aarch64_tls_size
== 48)
20902 return SYMBOL_TLSLE48
;
20904 gcc_unreachable ();
20906 case TLS_MODEL_EMULATED
:
20907 case TLS_MODEL_NONE
:
20908 return SYMBOL_FORCE_TO_MEM
;
20911 gcc_unreachable ();
20915 /* Return the correct method for accessing X + OFFSET, where X is either
20916 a SYMBOL_REF or LABEL_REF. */
20918 enum aarch64_symbol_type
20919 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
20921 x
= strip_salt (x
);
20923 if (LABEL_REF_P (x
))
20925 switch (aarch64_cmodel
)
20927 case AARCH64_CMODEL_LARGE
:
20928 return SYMBOL_FORCE_TO_MEM
;
20930 case AARCH64_CMODEL_TINY_PIC
:
20931 case AARCH64_CMODEL_TINY
:
20932 return SYMBOL_TINY_ABSOLUTE
;
20934 case AARCH64_CMODEL_SMALL_SPIC
:
20935 case AARCH64_CMODEL_SMALL_PIC
:
20936 case AARCH64_CMODEL_SMALL
:
20937 return SYMBOL_SMALL_ABSOLUTE
;
20940 gcc_unreachable ();
20944 if (SYMBOL_REF_P (x
))
20946 if (aarch64_tls_symbol_p (x
))
20947 return aarch64_classify_tls_symbol (x
);
20949 switch (aarch64_cmodel
)
20951 case AARCH64_CMODEL_TINY_PIC
:
20952 case AARCH64_CMODEL_TINY
:
20953 /* With -fPIC non-local symbols use the GOT. For orthogonality
20954 always use the GOT for extern weak symbols. */
20955 if ((flag_pic
|| SYMBOL_REF_WEAK (x
))
20956 && !aarch64_symbol_binds_local_p (x
))
20957 return SYMBOL_TINY_GOT
;
20959 /* When we retrieve symbol + offset address, we have to make sure
20960 the offset does not cause overflow of the final address. But
20961 we have no way of knowing the address of symbol at compile time
20962 so we can't accurately say if the distance between the PC and
20963 symbol + offset is outside the addressible range of +/-1MB in the
20964 TINY code model. So we limit the maximum offset to +/-64KB and
20965 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
20966 If offset_within_block_p is true we allow larger offsets. */
20967 if (!(IN_RANGE (offset
, -0x10000, 0x10000)
20968 || offset_within_block_p (x
, offset
)))
20969 return SYMBOL_FORCE_TO_MEM
;
20971 return SYMBOL_TINY_ABSOLUTE
;
20974 case AARCH64_CMODEL_SMALL_SPIC
:
20975 case AARCH64_CMODEL_SMALL_PIC
:
20976 case AARCH64_CMODEL_SMALL
:
20977 if ((flag_pic
|| SYMBOL_REF_WEAK (x
))
20978 && !aarch64_symbol_binds_local_p (x
))
20979 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
20980 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
;
20982 /* Same reasoning as the tiny code model, but the offset cap here is
20983 1MB, allowing +/-3.9GB for the offset to the symbol. */
20984 if (!(IN_RANGE (offset
, -0x100000, 0x100000)
20985 || offset_within_block_p (x
, offset
)))
20986 return SYMBOL_FORCE_TO_MEM
;
20988 return SYMBOL_SMALL_ABSOLUTE
;
20990 case AARCH64_CMODEL_LARGE
:
20991 /* This is alright even in PIC code as the constant
20992 pool reference is always PC relative and within
20993 the same translation unit. */
20994 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
20995 return SYMBOL_SMALL_ABSOLUTE
;
20997 return SYMBOL_FORCE_TO_MEM
;
21000 gcc_unreachable ();
21004 /* By default push everything into the constant pool. */
21005 return SYMBOL_FORCE_TO_MEM
;
21009 aarch64_constant_address_p (rtx x
)
21011 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
21015 aarch64_legitimate_pic_operand_p (rtx x
)
21018 x
= strip_offset_and_salt (x
, &offset
);
21019 if (SYMBOL_REF_P (x
))
21025 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
21026 that should be rematerialized rather than spilled. */
21029 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
21031 /* Support CSE and rematerialization of common constants. */
21032 if (CONST_INT_P (x
)
21033 || CONST_DOUBLE_P (x
))
21036 /* Only accept variable-length vector constants if they can be
21039 ??? It would be possible (but complex) to handle rematerialization
21040 of other constants via secondary reloads. */
21041 if (!GET_MODE_SIZE (mode
).is_constant ())
21042 return aarch64_simd_valid_immediate (x
, NULL
);
21044 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21045 least be forced to memory and loaded from there. */
21046 if (CONST_VECTOR_P (x
))
21047 return !targetm
.cannot_force_const_mem (mode
, x
);
21049 /* Do not allow vector struct mode constants for Advanced SIMD.
21050 We could support 0 and -1 easily, but they need support in
21051 aarch64-simd.md. */
21052 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
21053 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
21056 if (GET_CODE (x
) == HIGH
)
21059 /* Accept polynomial constants that can be calculated by using the
21060 destination of a move as the sole temporary. Constants that
21061 require a second temporary cannot be rematerialized (they can't be
21062 forced to memory and also aren't legitimate constants). */
21064 if (poly_int_rtx_p (x
, &offset
))
21065 return aarch64_offset_temporaries (false, offset
) <= 1;
21067 /* If an offset is being added to something else, we need to allow the
21068 base to be moved into the destination register, meaning that there
21069 are no free temporaries for the offset. */
21070 x
= strip_offset_and_salt (x
, &offset
);
21071 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
21074 /* Do not allow const (plus (anchor_symbol, const_int)). */
21075 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
21078 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21079 so spilling them is better than rematerialization. */
21080 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
21083 /* Label references are always constant. */
21084 if (LABEL_REF_P (x
))
21091 aarch64_load_tp (rtx target
)
21094 || GET_MODE (target
) != Pmode
21095 || !register_operand (target
, Pmode
))
21096 target
= gen_reg_rtx (Pmode
);
21098 /* Can return in any reg. */
21099 emit_insn (gen_aarch64_load_tp_hard (target
));
21103 /* On AAPCS systems, this is the "struct __va_list". */
21104 static GTY(()) tree va_list_type
;
21106 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
21107 Return the type to use as __builtin_va_list.
21109 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
21121 aarch64_build_builtin_va_list (void)
21124 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
21126 /* Create the type. */
21127 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
21128 /* Give it the required name. */
21129 va_list_name
= build_decl (BUILTINS_LOCATION
,
21131 get_identifier ("__va_list"),
21133 DECL_ARTIFICIAL (va_list_name
) = 1;
21134 TYPE_NAME (va_list_type
) = va_list_name
;
21135 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
21137 /* Create the fields. */
21138 f_stack
= build_decl (BUILTINS_LOCATION
,
21139 FIELD_DECL
, get_identifier ("__stack"),
21141 f_grtop
= build_decl (BUILTINS_LOCATION
,
21142 FIELD_DECL
, get_identifier ("__gr_top"),
21144 f_vrtop
= build_decl (BUILTINS_LOCATION
,
21145 FIELD_DECL
, get_identifier ("__vr_top"),
21147 f_groff
= build_decl (BUILTINS_LOCATION
,
21148 FIELD_DECL
, get_identifier ("__gr_offs"),
21149 integer_type_node
);
21150 f_vroff
= build_decl (BUILTINS_LOCATION
,
21151 FIELD_DECL
, get_identifier ("__vr_offs"),
21152 integer_type_node
);
21154 /* Tell tree-stdarg pass about our internal offset fields.
21155 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
21156 purpose to identify whether the code is updating va_list internal
21157 offset fields through irregular way. */
21158 va_list_gpr_counter_field
= f_groff
;
21159 va_list_fpr_counter_field
= f_vroff
;
21161 DECL_ARTIFICIAL (f_stack
) = 1;
21162 DECL_ARTIFICIAL (f_grtop
) = 1;
21163 DECL_ARTIFICIAL (f_vrtop
) = 1;
21164 DECL_ARTIFICIAL (f_groff
) = 1;
21165 DECL_ARTIFICIAL (f_vroff
) = 1;
21167 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
21168 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
21169 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
21170 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
21171 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
21173 TYPE_FIELDS (va_list_type
) = f_stack
;
21174 DECL_CHAIN (f_stack
) = f_grtop
;
21175 DECL_CHAIN (f_grtop
) = f_vrtop
;
21176 DECL_CHAIN (f_vrtop
) = f_groff
;
21177 DECL_CHAIN (f_groff
) = f_vroff
;
21179 /* Compute its layout. */
21180 layout_type (va_list_type
);
21182 return va_list_type
;
21185 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21187 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
21189 const CUMULATIVE_ARGS
*cum
;
21190 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
21191 tree stack
, grtop
, vrtop
, groff
, vroff
;
21193 int gr_save_area_size
= cfun
->va_list_gpr_size
;
21194 int vr_save_area_size
= cfun
->va_list_fpr_size
;
21197 cum
= &crtl
->args
.info
;
21198 if (cfun
->va_list_gpr_size
)
21199 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
21200 cfun
->va_list_gpr_size
);
21201 if (cfun
->va_list_fpr_size
)
21202 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
21203 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
21207 gcc_assert (cum
->aapcs_nvrn
== 0);
21208 vr_save_area_size
= 0;
21211 f_stack
= TYPE_FIELDS (va_list_type_node
);
21212 f_grtop
= DECL_CHAIN (f_stack
);
21213 f_vrtop
= DECL_CHAIN (f_grtop
);
21214 f_groff
= DECL_CHAIN (f_vrtop
);
21215 f_vroff
= DECL_CHAIN (f_groff
);
21217 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
21219 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
21221 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
21223 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
21225 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
21228 /* Emit code to initialize STACK, which points to the next varargs stack
21229 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21230 by named arguments. STACK is 8-byte aligned. */
21231 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
21232 if (cum
->aapcs_stack_size
> 0)
21233 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
21234 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
21235 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21237 /* Emit code to initialize GRTOP, the top of the GR save area.
21238 virtual_incoming_args_rtx should have been 16 byte aligned. */
21239 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
21240 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
21241 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21243 /* Emit code to initialize VRTOP, the top of the VR save area.
21244 This address is gr_save_area_bytes below GRTOP, rounded
21245 down to the next 16-byte boundary. */
21246 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
21247 vr_offset
= ROUND_UP (gr_save_area_size
,
21248 STACK_BOUNDARY
/ BITS_PER_UNIT
);
21251 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
21252 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
21253 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21255 /* Emit code to initialize GROFF, the offset from GRTOP of the
21256 next GPR argument. */
21257 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
21258 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
21259 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21261 /* Likewise emit code to initialize VROFF, the offset from FTOP
21262 of the next VR argument. */
21263 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
21264 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
21265 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21268 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21271 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
21272 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
21276 bool is_ha
; /* is HFA or HVA. */
21277 bool dw_align
; /* double-word align. */
21278 machine_mode ag_mode
= VOIDmode
;
21282 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
21283 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
21284 HOST_WIDE_INT size
, rsize
, adjust
, align
;
21285 tree t
, u
, cond1
, cond2
;
21287 indirect_p
= pass_va_arg_by_reference (type
);
21289 type
= build_pointer_type (type
);
21291 mode
= TYPE_MODE (type
);
21293 f_stack
= TYPE_FIELDS (va_list_type_node
);
21294 f_grtop
= DECL_CHAIN (f_stack
);
21295 f_vrtop
= DECL_CHAIN (f_grtop
);
21296 f_groff
= DECL_CHAIN (f_vrtop
);
21297 f_vroff
= DECL_CHAIN (f_groff
);
21299 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
21300 f_stack
, NULL_TREE
);
21301 size
= int_size_in_bytes (type
);
21303 unsigned int abi_break_gcc_9
;
21304 unsigned int abi_break_gcc_13
;
21305 unsigned int abi_break_gcc_14
;
21307 = aarch64_function_arg_alignment (mode
, type
, &abi_break_gcc_9
,
21308 &abi_break_gcc_13
, &abi_break_gcc_14
)
21313 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &nregs
,
21316 /* No frontends can create types with variable-sized modes, so we
21317 shouldn't be asked to pass or return them. */
21318 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
21320 /* TYPE passed in fp/simd registers. */
21322 aarch64_err_no_fpadvsimd (mode
);
21324 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
21325 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
21326 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
21327 unshare_expr (valist
), f_vroff
, NULL_TREE
);
21329 rsize
= nregs
* UNITS_PER_VREG
;
21333 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
21334 adjust
= UNITS_PER_VREG
- ag_size
;
21336 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21337 && size
< UNITS_PER_VREG
)
21339 adjust
= UNITS_PER_VREG
- size
;
21344 /* TYPE passed in general registers. */
21345 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
21346 unshare_expr (valist
), f_grtop
, NULL_TREE
);
21347 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
21348 unshare_expr (valist
), f_groff
, NULL_TREE
);
21349 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
21350 nregs
= rsize
/ UNITS_PER_WORD
;
21353 && abi_break_gcc_13
21355 && !bitint_or_aggr_of_bitint_p (type
))
21356 inform (input_location
, "parameter passing for argument of type "
21357 "%qT changed in GCC 13.1", type
);
21360 && abi_break_gcc_14
21361 && (abi_break_gcc_14
> 8 * BITS_PER_UNIT
) != (align
> 8)
21362 && !bitint_or_aggr_of_bitint_p (type
))
21363 inform (input_location
, "parameter passing for argument of type "
21364 "%qT changed in GCC 14.1", type
);
21368 if (abi_break_gcc_9
21370 && !bitint_or_aggr_of_bitint_p (type
))
21371 inform (input_location
, "parameter passing for argument of type "
21372 "%qT changed in GCC 9.1", type
);
21376 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21377 && size
< UNITS_PER_WORD
)
21379 adjust
= UNITS_PER_WORD
- size
;
21383 /* Get a local temporary for the field value. */
21384 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
21386 /* Emit code to branch if off >= 0. */
21387 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
21388 build_int_cst (TREE_TYPE (off
), 0));
21389 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
21393 /* Emit: offs = (offs + 15) & -16. */
21394 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
21395 build_int_cst (TREE_TYPE (off
), 15));
21396 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
21397 build_int_cst (TREE_TYPE (off
), -16));
21398 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
21403 /* Update ap.__[g|v]r_offs */
21404 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
21405 build_int_cst (TREE_TYPE (off
), rsize
));
21406 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
21410 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
21412 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21413 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
21414 build_int_cst (TREE_TYPE (f_off
), 0));
21415 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
21417 /* String up: make sure the assignment happens before the use. */
21418 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
21419 COND_EXPR_ELSE (cond1
) = t
;
21421 /* Prepare the trees handling the argument that is passed on the stack;
21422 the top level node will store in ON_STACK. */
21423 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
21426 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21427 t
= fold_build_pointer_plus_hwi (arg
, 15);
21428 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
21429 build_int_cst (TREE_TYPE (t
), -16));
21430 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
21434 /* Advance ap.__stack */
21435 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
21436 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
21437 build_int_cst (TREE_TYPE (t
), -8));
21438 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
21439 /* String up roundup and advance. */
21441 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
21442 /* String up with arg */
21443 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
21444 /* Big-endianness related address adjustment. */
21445 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21446 && size
< UNITS_PER_WORD
)
21448 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
21449 size_int (UNITS_PER_WORD
- size
));
21450 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
21453 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
21454 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
21456 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21459 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
21460 build_int_cst (TREE_TYPE (off
), adjust
));
21462 t
= fold_convert (sizetype
, t
);
21463 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
21467 /* type ha; // treat as "struct {ftype field[n];}"
21468 ... [computing offs]
21469 for (i = 0; i <nregs; ++i, offs += 16)
21470 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21473 tree tmp_ha
, field_t
, field_ptr_t
;
21475 /* Declare a local variable. */
21476 tmp_ha
= create_tmp_var_raw (type
, "ha");
21477 gimple_add_tmp_var (tmp_ha
);
21479 /* Establish the base type. */
21483 field_t
= float_type_node
;
21484 field_ptr_t
= float_ptr_type_node
;
21487 field_t
= double_type_node
;
21488 field_ptr_t
= double_ptr_type_node
;
21491 field_t
= long_double_type_node
;
21492 field_ptr_t
= long_double_ptr_type_node
;
21495 field_t
= dfloat32_type_node
;
21496 field_ptr_t
= build_pointer_type (dfloat32_type_node
);
21499 field_t
= dfloat64_type_node
;
21500 field_ptr_t
= build_pointer_type (dfloat64_type_node
);
21503 field_t
= dfloat128_type_node
;
21504 field_ptr_t
= build_pointer_type (dfloat128_type_node
);
21507 field_t
= aarch64_fp16_type_node
;
21508 field_ptr_t
= aarch64_fp16_ptr_type_node
;
21511 field_t
= bfloat16_type_node
;
21512 field_ptr_t
= aarch64_bf16_ptr_type_node
;
21517 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
21518 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
21519 field_ptr_t
= build_pointer_type (field_t
);
21526 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
21527 TREE_ADDRESSABLE (tmp_ha
) = 1;
21528 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
21530 t
= fold_convert (field_ptr_t
, addr
);
21531 t
= build2 (MODIFY_EXPR
, field_t
,
21532 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
21533 build1 (INDIRECT_REF
, field_t
, t
));
21535 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21536 for (i
= 1; i
< nregs
; ++i
)
21538 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
21539 u
= fold_convert (field_ptr_t
, addr
);
21540 u
= build2 (MODIFY_EXPR
, field_t
,
21541 build2 (MEM_REF
, field_t
, tmp_ha
,
21542 build_int_cst (field_ptr_t
,
21544 int_size_in_bytes (field_t
)))),
21545 build1 (INDIRECT_REF
, field_t
, u
));
21546 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
21549 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
21550 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
21553 COND_EXPR_ELSE (cond2
) = t
;
21554 addr
= fold_convert (build_pointer_type (type
), cond1
);
21555 addr
= build_va_arg_indirect_ref (addr
);
21558 addr
= build_va_arg_indirect_ref (addr
);
21563 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21566 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
21567 const function_arg_info
&arg
,
21568 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
21570 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
21571 CUMULATIVE_ARGS local_cum
;
21572 int gr_saved
= cfun
->va_list_gpr_size
;
21573 int vr_saved
= cfun
->va_list_fpr_size
;
21575 /* The caller has advanced CUM up to, but not beyond, the last named
21576 argument. Advance a local copy of CUM past the last "real" named
21577 argument, to find out how many registers are left over. */
21579 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl
)))
21580 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
21582 /* Found out how many registers we need to save.
21583 Honor tree-stdvar analysis results. */
21584 if (cfun
->va_list_gpr_size
)
21585 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
21586 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
21587 if (cfun
->va_list_fpr_size
)
21588 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
21589 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
21593 gcc_assert (local_cum
.aapcs_nvrn
== 0);
21603 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21604 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
21605 - gr_saved
* UNITS_PER_WORD
);
21606 mem
= gen_frame_mem (BLKmode
, ptr
);
21607 set_mem_alias_set (mem
, get_varargs_alias_set ());
21609 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
21614 /* We can't use move_block_from_reg, because it will use
21615 the wrong mode, storing D regs only. */
21616 machine_mode mode
= TImode
;
21617 int off
, i
, vr_start
;
21619 /* Set OFF to the offset from virtual_incoming_args_rtx of
21620 the first vector register. The VR save area lies below
21621 the GR one, and is aligned to 16 bytes. */
21622 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
21623 STACK_BOUNDARY
/ BITS_PER_UNIT
);
21624 off
-= vr_saved
* UNITS_PER_VREG
;
21626 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
21627 for (i
= 0; i
< vr_saved
; ++i
)
21631 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
21632 mem
= gen_frame_mem (mode
, ptr
);
21633 set_mem_alias_set (mem
, get_varargs_alias_set ());
21634 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
21635 off
+= UNITS_PER_VREG
;
21640 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21641 any complication of having crtl->args.pretend_args_size changed. */
21642 cfun
->machine
->frame
.saved_varargs_size
21643 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
21644 STACK_BOUNDARY
/ BITS_PER_UNIT
)
21645 + vr_saved
* UNITS_PER_VREG
);
21649 aarch64_conditional_register_usage (void)
21654 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
21657 call_used_regs
[i
] = 1;
21658 CLEAR_HARD_REG_BIT (operand_reg_set
, i
);
21662 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
21665 call_used_regs
[i
] = 1;
21668 /* Only allow these registers to be accessed via special patterns. */
21669 CLEAR_HARD_REG_BIT (operand_reg_set
, VG_REGNUM
);
21670 CLEAR_HARD_REG_BIT (operand_reg_set
, FFR_REGNUM
);
21671 CLEAR_HARD_REG_BIT (operand_reg_set
, FFRT_REGNUM
);
21672 for (int i
= FIRST_FAKE_REGNUM
; i
<= LAST_FAKE_REGNUM
; ++i
)
21673 CLEAR_HARD_REG_BIT (operand_reg_set
, i
);
21675 /* When tracking speculation, we need a couple of call-clobbered registers
21676 to track the speculation state. It would be nice to just use
21677 IP0 and IP1, but currently there are numerous places that just
21678 assume these registers are free for other uses (eg pointer
21679 authentication). */
21680 if (aarch64_track_speculation
)
21682 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
21683 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
21684 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
21685 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
21689 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21692 aarch64_member_type_forces_blk (const_tree field_or_array
, machine_mode mode
)
21694 /* For records we're passed a FIELD_DECL, for arrays we're passed
21695 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21696 const_tree type
= TREE_TYPE (field_or_array
);
21698 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21699 For structures, the "multiple" case is indicated by MODE being
21701 unsigned int num_zr
, num_pr
;
21702 if (aarch64_sve::builtin_type_p (type
, &num_zr
, &num_pr
) && num_pr
> 2)
21704 if (TREE_CODE (field_or_array
) == ARRAY_TYPE
)
21705 return !simple_cst_equal (TYPE_SIZE (field_or_array
),
21707 return mode
== VOIDmode
;
21710 return default_member_type_forces_blk (field_or_array
, mode
);
21713 /* Bitmasks that indicate whether earlier versions of GCC would have
21714 taken a different path through the ABI logic. This should result in
21715 a -Wpsabi warning if the earlier path led to a different ABI decision.
21717 WARN_PSABI_EMPTY_CXX17_BASE
21718 Indicates that the type includes an artificial empty C++17 base field
21719 that, prior to GCC 10.1, would prevent the type from being treated as
21720 a HFA or HVA. See PR94383 for details.
21722 WARN_PSABI_NO_UNIQUE_ADDRESS
21723 Indicates that the type includes an empty [[no_unique_address]] field
21724 that, prior to GCC 10.1, would prevent the type from being treated as
21726 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE
= 1U << 0;
21727 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS
= 1U << 1;
21728 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD
= 1U << 2;
21730 /* Walk down the type tree of TYPE counting consecutive base elements.
21731 If *MODEP is VOIDmode, then set it to the first valid floating point
21732 type. If a non-floating point type is found, or if a floating point
21733 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21734 otherwise return the count in the sub-tree.
21736 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21737 function has changed its behavior relative to earlier versions of GCC.
21738 Normally the argument should be nonnull and point to a zero-initialized
21739 variable. The function then records whether the ABI decision might
21740 be affected by a known fix to the ABI logic, setting the associated
21741 WARN_PSABI_* bits if so.
21743 When the argument is instead a null pointer, the function tries to
21744 simulate the behavior of GCC before all such ABI fixes were made.
21745 This is useful to check whether the function returns something
21746 different after the ABI fixes. */
21748 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
,
21749 unsigned int *warn_psabi_flags
)
21752 HOST_WIDE_INT size
;
21754 if (aarch64_sve::builtin_type_p (type
))
21757 switch (TREE_CODE (type
))
21760 mode
= TYPE_MODE (type
);
21761 if (mode
!= DFmode
&& mode
!= SFmode
21762 && mode
!= TFmode
&& mode
!= HFmode
21763 && mode
!= SDmode
&& mode
!= DDmode
&& mode
!= TDmode
)
21766 if (*modep
== VOIDmode
)
21769 if (*modep
== mode
)
21775 mode
= TYPE_MODE (TREE_TYPE (type
));
21776 if (mode
!= DFmode
&& mode
!= SFmode
21777 && mode
!= TFmode
&& mode
!= HFmode
)
21780 if (*modep
== VOIDmode
)
21783 if (*modep
== mode
)
21789 /* Use V2SImode and V4SImode as representatives of all 64-bit
21790 and 128-bit vector types. */
21791 size
= int_size_in_bytes (type
);
21804 if (*modep
== VOIDmode
)
21807 /* Vector modes are considered to be opaque: two vectors are
21808 equivalent for the purposes of being homogeneous aggregates
21809 if they are the same size. */
21810 if (*modep
== mode
)
21818 tree index
= TYPE_DOMAIN (type
);
21820 /* Can't handle incomplete types nor sizes that are not
21822 if (!COMPLETE_TYPE_P (type
)
21823 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21826 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
,
21830 || !TYPE_MAX_VALUE (index
)
21831 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
21832 || !TYPE_MIN_VALUE (index
)
21833 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
21837 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
21838 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
21840 /* There must be no padding. */
21841 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
21842 count
* GET_MODE_BITSIZE (*modep
)))
21854 /* Can't handle incomplete types nor sizes that are not
21856 if (!COMPLETE_TYPE_P (type
)
21857 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21860 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
21862 if (TREE_CODE (field
) != FIELD_DECL
)
21865 if (DECL_FIELD_ABI_IGNORED (field
))
21867 /* See whether this is something that earlier versions of
21868 GCC failed to ignore. */
21870 if (lookup_attribute ("no_unique_address",
21871 DECL_ATTRIBUTES (field
)))
21872 flag
= WARN_PSABI_NO_UNIQUE_ADDRESS
;
21873 else if (cxx17_empty_base_field_p (field
))
21874 flag
= WARN_PSABI_EMPTY_CXX17_BASE
;
21876 /* No compatibility problem. */
21879 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
21880 if (warn_psabi_flags
)
21882 *warn_psabi_flags
|= flag
;
21886 /* A zero-width bitfield may affect layout in some
21887 circumstances, but adds no members. The determination
21888 of whether or not a type is an HFA is performed after
21889 layout is complete, so if the type still looks like an
21890 HFA afterwards, it is still classed as one. This is
21891 potentially an ABI break for the hard-float ABI. */
21892 else if (DECL_BIT_FIELD (field
)
21893 && integer_zerop (DECL_SIZE (field
)))
21895 /* Prior to GCC-12 these fields were striped early,
21896 hiding them from the back-end entirely and
21897 resulting in the correct behaviour for argument
21898 passing. Simulate that old behaviour without
21899 generating a warning. */
21900 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field
))
21902 if (warn_psabi_flags
)
21904 *warn_psabi_flags
|= WARN_PSABI_ZERO_WIDTH_BITFIELD
;
21909 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
21913 count
+= sub_count
;
21916 /* There must be no padding. */
21917 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
21918 count
* GET_MODE_BITSIZE (*modep
)))
21925 case QUAL_UNION_TYPE
:
21927 /* These aren't very interesting except in a degenerate case. */
21932 /* Can't handle incomplete types nor sizes that are not
21934 if (!COMPLETE_TYPE_P (type
)
21935 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21938 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
21940 if (TREE_CODE (field
) != FIELD_DECL
)
21943 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
21947 count
= count
> sub_count
? count
: sub_count
;
21950 /* There must be no padding. */
21951 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
21952 count
* GET_MODE_BITSIZE (*modep
)))
21965 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
21966 type as described in AAPCS64 \S 4.1.2.
21968 See the comment above aarch64_composite_type_p for the notes on MODE. */
21971 aarch64_short_vector_p (const_tree type
,
21974 poly_int64 size
= -1;
21976 if (type
&& VECTOR_TYPE_P (type
))
21978 if (aarch64_sve::builtin_type_p (type
))
21980 size
= int_size_in_bytes (type
);
21982 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
21983 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
21985 /* The containing "else if" is too loose: it means that we look at TYPE
21986 if the type is a vector type (good), but that we otherwise ignore TYPE
21987 and look only at the mode. This is wrong because the type describes
21988 the language-level information whereas the mode is purely an internal
21989 GCC concept. We can therefore reach here for types that are not
21990 vectors in the AAPCS64 sense.
21992 We can't "fix" that for the traditional Advanced SIMD vector modes
21993 without breaking backwards compatibility. However, there's no such
21994 baggage for the structure modes, which were introduced in GCC 12. */
21995 if (aarch64_advsimd_struct_mode_p (mode
))
21998 /* For similar reasons, rely only on the type, not the mode, when
21999 processing SVE types. */
22000 if (type
&& aarch64_some_values_include_pst_objects_p (type
))
22001 /* Leave later code to report an error if SVE is disabled. */
22002 gcc_assert (!TARGET_SVE
|| aarch64_sve_mode_p (mode
));
22004 size
= GET_MODE_SIZE (mode
);
22006 if (known_eq (size
, 8) || known_eq (size
, 16))
22008 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
22009 they are being treated as scalable AAPCS64 types. */
22010 gcc_assert (!aarch64_sve_mode_p (mode
)
22011 && !aarch64_advsimd_struct_mode_p (mode
));
22017 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
22018 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
22019 array types. The C99 floating-point complex types are also considered
22020 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
22021 types, which are GCC extensions and out of the scope of AAPCS64, are
22022 treated as composite types here as well.
22024 Note that MODE itself is not sufficient in determining whether a type
22025 is such a composite type or not. This is because
22026 stor-layout.cc:compute_record_mode may have already changed the MODE
22027 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
22028 structure with only one field may have its MODE set to the mode of the
22029 field. Also an integer mode whose size matches the size of the
22030 RECORD_TYPE type may be used to substitute the original mode
22031 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
22032 solely relied on. */
22035 aarch64_composite_type_p (const_tree type
,
22038 if (aarch64_short_vector_p (type
, mode
))
22041 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
22045 && TREE_CODE (type
) == BITINT_TYPE
22046 && int_size_in_bytes (type
) > 16)
22049 if (mode
== BLKmode
22050 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
22051 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
22057 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22058 shall be passed or returned in simd/fp register(s) (providing these
22059 parameter passing registers are available).
22061 Upon successful return, *COUNT returns the number of needed registers,
22062 *BASE_MODE returns the mode of the individual register and when IS_HA
22063 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22064 floating-point aggregate or a homogeneous short-vector aggregate.
22066 SILENT_P is true if the function should refrain from reporting any
22067 diagnostics. This should only be used if the caller is certain that
22068 any ABI decisions would eventually come through this function with
22069 SILENT_P set to false. */
22072 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
22074 machine_mode
*base_mode
,
22079 if (is_ha
!= NULL
) *is_ha
= false;
22081 machine_mode new_mode
= VOIDmode
;
22082 bool composite_p
= aarch64_composite_type_p (type
, mode
);
22085 && (GET_MODE_CLASS (mode
) == MODE_FLOAT
22086 || GET_MODE_CLASS (mode
) == MODE_DECIMAL_FLOAT
))
22087 || aarch64_short_vector_p (type
, mode
))
22092 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
22094 if (is_ha
!= NULL
) *is_ha
= true;
22096 new_mode
= GET_MODE_INNER (mode
);
22098 else if (type
&& composite_p
)
22100 unsigned int warn_psabi_flags
= 0;
22101 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
,
22102 &warn_psabi_flags
);
22103 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
22105 static unsigned last_reported_type_uid
;
22106 unsigned uid
= TYPE_UID (TYPE_MAIN_VARIANT (type
));
22110 && warn_psabi_flags
22111 && uid
!= last_reported_type_uid
22112 && ((alt
= aapcs_vfp_sub_candidate (type
, &new_mode
, NULL
))
22116 = CHANGES_ROOT_URL
"gcc-10/changes.html#empty_base";
22118 = CHANGES_ROOT_URL
"gcc-12/changes.html#zero_width_bitfields";
22119 gcc_assert (alt
== -1);
22120 last_reported_type_uid
= uid
;
22121 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22123 if (warn_psabi_flags
& WARN_PSABI_NO_UNIQUE_ADDRESS
)
22124 inform (input_location
, "parameter passing for argument of "
22125 "type %qT with %<[[no_unique_address]]%> members "
22126 "changed %{in GCC 10.1%}",
22127 TYPE_MAIN_VARIANT (type
), url10
);
22128 else if (warn_psabi_flags
& WARN_PSABI_EMPTY_CXX17_BASE
)
22129 inform (input_location
, "parameter passing for argument of "
22130 "type %qT when C++17 is enabled changed to match "
22131 "C++14 %{in GCC 10.1%}",
22132 TYPE_MAIN_VARIANT (type
), url10
);
22133 else if (warn_psabi_flags
& WARN_PSABI_ZERO_WIDTH_BITFIELD
)
22134 inform (input_location
, "parameter passing for argument of "
22135 "type %qT changed %{in GCC 12.1%}",
22136 TYPE_MAIN_VARIANT (type
), url12
);
22139 if (is_ha
!= NULL
) *is_ha
= true;
22148 gcc_assert (!aarch64_sve_mode_p (new_mode
));
22149 *base_mode
= new_mode
;
22153 /* Implement TARGET_STRUCT_VALUE_RTX. */
22156 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
22157 int incoming ATTRIBUTE_UNUSED
)
22159 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
22162 /* Implements target hook vector_mode_supported_p. */
22164 aarch64_vector_mode_supported_p (machine_mode mode
)
22166 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
22167 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
22170 /* Implements target hook vector_mode_supported_any_target_p. */
22172 aarch64_vector_mode_supported_any_target_p (machine_mode mode
)
22174 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
, true);
22175 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
22178 /* Return the full-width SVE vector mode for element mode MODE, if one
22181 aarch64_full_sve_mode (scalar_mode mode
)
22200 return VNx16QImode
;
22202 return opt_machine_mode ();
22206 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
22209 aarch64_vq_mode (scalar_mode mode
)
22230 return opt_machine_mode ();
22234 /* Return appropriate SIMD container
22235 for MODE within a vector of WIDTH bits. */
22236 static machine_mode
22237 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
22240 && maybe_ne (width
, 128)
22241 && known_eq (width
, BITS_PER_SVE_VECTOR
))
22242 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
22244 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
22245 if (TARGET_BASE_SIMD
)
22247 if (known_eq (width
, 128))
22248 return aarch64_vq_mode (mode
).else_mode (word_mode
);
22271 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22272 and return whether the SVE mode should be preferred over the
22273 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22275 aarch64_cmp_autovec_modes (machine_mode sve_m
, machine_mode asimd_m
)
22277 /* Take into account the aarch64-autovec-preference param if non-zero. */
22278 bool only_asimd_p
= aarch64_autovec_preference
== 1;
22279 bool only_sve_p
= aarch64_autovec_preference
== 2;
22286 /* The preference in case of a tie in costs. */
22287 bool prefer_asimd
= aarch64_autovec_preference
== 3;
22288 bool prefer_sve
= aarch64_autovec_preference
== 4;
22290 poly_int64 nunits_sve
= GET_MODE_NUNITS (sve_m
);
22291 poly_int64 nunits_asimd
= GET_MODE_NUNITS (asimd_m
);
22292 /* If the CPU information does not have an SVE width registered use the
22293 generic poly_int comparison that prefers SVE. If a preference is
22294 explicitly requested avoid this path. */
22295 if (aarch64_tune_params
.sve_width
== SVE_SCALABLE
22298 return maybe_gt (nunits_sve
, nunits_asimd
);
22300 /* Otherwise estimate the runtime width of the modes involved. */
22301 HOST_WIDE_INT est_sve
= estimated_poly_value (nunits_sve
);
22302 HOST_WIDE_INT est_asimd
= estimated_poly_value (nunits_asimd
);
22304 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22305 is clearly wider. */
22307 return est_sve
>= est_asimd
;
22308 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22309 is clearly wider. */
22311 return est_sve
> est_asimd
;
22313 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22314 return est_sve
> est_asimd
;
22317 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22318 static machine_mode
22319 aarch64_preferred_simd_mode (scalar_mode mode
)
22321 /* Take into account explicit auto-vectorization ISA preferences through
22322 aarch64_cmp_autovec_modes. */
22323 if (TARGET_SVE
&& aarch64_cmp_autovec_modes (VNx16QImode
, V16QImode
))
22324 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
22326 return aarch64_vq_mode (mode
).else_mode (word_mode
);
22330 /* Return a list of possible vector sizes for the vectorizer
22331 to iterate over. */
22332 static unsigned int
22333 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
22335 static const machine_mode sve_modes
[] = {
22336 /* Try using full vectors for all element types. */
22339 /* Try using 16-bit containers for 8-bit elements and full vectors
22340 for wider elements. */
22343 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22344 full vectors for wider elements. */
22347 /* Try using 64-bit containers for all element types. */
22351 static const machine_mode advsimd_modes
[] = {
22352 /* Try using 128-bit vectors for all element types. */
22355 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22356 for wider elements. */
22359 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22360 for wider elements.
22362 TODO: We could support a limited form of V4QImode too, so that
22363 we use 32-bit vectors for 8-bit elements. */
22366 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22367 for 64-bit elements.
22369 TODO: We could similarly support limited forms of V2QImode and V2HImode
22374 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22377 - If we can't use N-byte Advanced SIMD vectors then the placement
22378 doesn't matter; we'll just continue as though the Advanced SIMD
22379 entry didn't exist.
22381 - If an SVE main loop with N bytes ends up being cheaper than an
22382 Advanced SIMD main loop with N bytes then by default we'll replace
22383 the Advanced SIMD version with the SVE one.
22385 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22386 than an SVE main loop with N bytes then by default we'll try to
22387 use the SVE loop to vectorize the epilogue instead. */
22389 bool only_asimd_p
= aarch64_autovec_preference
== 1;
22390 bool only_sve_p
= aarch64_autovec_preference
== 2;
22392 unsigned int sve_i
= (TARGET_SVE
&& !only_asimd_p
) ? 0 : ARRAY_SIZE (sve_modes
);
22393 unsigned int advsimd_i
= 0;
22395 while (!only_sve_p
&& advsimd_i
< ARRAY_SIZE (advsimd_modes
))
22397 if (sve_i
< ARRAY_SIZE (sve_modes
)
22398 && aarch64_cmp_autovec_modes (sve_modes
[sve_i
],
22399 advsimd_modes
[advsimd_i
]))
22400 modes
->safe_push (sve_modes
[sve_i
++]);
22402 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
22404 while (sve_i
< ARRAY_SIZE (sve_modes
))
22405 modes
->safe_push (sve_modes
[sve_i
++]);
22407 unsigned int flags
= 0;
22408 if (aarch64_vect_compare_costs
)
22409 flags
|= VECT_COMPARE_COSTS
;
22413 /* Implement TARGET_MANGLE_TYPE. */
22415 static const char *
22416 aarch64_mangle_type (const_tree type
)
22418 /* The AArch64 ABI documents say that "__va_list" has to be
22419 mangled as if it is in the "std" namespace. */
22420 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
22421 return "St9__va_list";
22423 /* Half-precision floating point types. */
22424 if (SCALAR_FLOAT_TYPE_P (type
) && TYPE_PRECISION (type
) == 16)
22426 if (TYPE_MAIN_VARIANT (type
) == float16_type_node
)
22428 if (TYPE_MODE (type
) == BFmode
)
22434 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22436 if (TYPE_NAME (type
) != NULL
)
22439 if ((res
= aarch64_general_mangle_builtin_type (type
))
22440 || (res
= aarch64_sve::mangle_builtin_type (type
)))
22444 /* Use the default mangling. */
22448 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22451 aarch64_verify_type_context (location_t loc
, type_context_kind context
,
22452 const_tree type
, bool silent_p
)
22454 return aarch64_sve::verify_type_context (loc
, context
, type
, silent_p
);
22457 /* Find the first rtx_insn before insn that will generate an assembly
22461 aarch64_prev_real_insn (rtx_insn
*insn
)
22468 insn
= prev_real_insn (insn
);
22470 while (insn
&& recog_memoized (insn
) < 0);
22476 is_madd_op (enum attr_type t1
)
22479 /* A number of these may be AArch32 only. */
22480 enum attr_type mlatypes
[] = {
22481 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
22482 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
22483 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
22486 for (i
= 0; i
< ARRAY_SIZE (mlatypes
); i
++)
22488 if (t1
== mlatypes
[i
])
22495 /* Check if there is a register dependency between a load and the insn
22496 for which we hold recog_data. */
22499 dep_between_memop_and_curr (rtx memop
)
22504 gcc_assert (GET_CODE (memop
) == SET
);
22506 if (!REG_P (SET_DEST (memop
)))
22509 load_reg
= SET_DEST (memop
);
22510 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
22512 rtx operand
= recog_data
.operand
[opno
];
22513 if (REG_P (operand
)
22514 && reg_overlap_mentioned_p (load_reg
, operand
))
22522 /* When working around the Cortex-A53 erratum 835769,
22523 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22524 instruction and has a preceding memory instruction such that a NOP
22525 should be inserted between them. */
22528 aarch64_madd_needs_nop (rtx_insn
* insn
)
22530 enum attr_type attr_type
;
22534 if (!TARGET_FIX_ERR_A53_835769
)
22537 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
22540 attr_type
= get_attr_type (insn
);
22541 if (!is_madd_op (attr_type
))
22544 prev
= aarch64_prev_real_insn (insn
);
22545 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22546 Restore recog state to INSN to avoid state corruption. */
22547 extract_constrain_insn_cached (insn
);
22549 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
22552 body
= single_set (prev
);
22554 /* If the previous insn is a memory op and there is no dependency between
22555 it and the DImode madd, emit a NOP between them. If body is NULL then we
22556 have a complex memory operation, probably a load/store pair.
22557 Be conservative for now and emit a NOP. */
22558 if (GET_MODE (recog_data
.operand
[0]) == DImode
22559 && (!body
|| !dep_between_memop_and_curr (body
)))
22567 /* Implement FINAL_PRESCAN_INSN. */
22570 aarch64_final_prescan_insn (rtx_insn
*insn
)
22572 if (aarch64_madd_needs_nop (insn
))
22573 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
22577 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22581 aarch64_sve_index_immediate_p (rtx base_or_step
)
22583 return (CONST_INT_P (base_or_step
)
22584 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
22587 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22588 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22591 aarch64_sve_arith_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
22593 rtx elt
= unwrap_const_vec_duplicate (x
);
22594 if (!CONST_INT_P (elt
))
22597 HOST_WIDE_INT val
= INTVAL (elt
);
22600 val
&= GET_MODE_MASK (GET_MODE_INNER (mode
));
22603 return IN_RANGE (val
, 0, 0xff);
22604 return IN_RANGE (val
, 0, 0xff00);
22607 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22608 instructions when applied to mode MODE. Negate X first if NEGATE_P
22612 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
22614 if (!aarch64_sve_arith_immediate_p (mode
, x
, negate_p
))
22617 /* After the optional negation, the immediate must be nonnegative.
22618 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22619 instead of SQADD Zn.B, Zn.B, #129. */
22620 rtx elt
= unwrap_const_vec_duplicate (x
);
22621 return negate_p
== (INTVAL (elt
) < 0);
22624 /* Return true if X is a valid immediate operand for an SVE logical
22625 instruction such as AND. */
22628 aarch64_sve_bitmask_immediate_p (rtx x
)
22632 return (const_vec_duplicate_p (x
, &elt
)
22633 && CONST_INT_P (elt
)
22634 && aarch64_bitmask_imm (INTVAL (elt
),
22635 GET_MODE_INNER (GET_MODE (x
))));
22638 /* Return true if X is a valid immediate for the SVE DUP and CPY
22642 aarch64_sve_dup_immediate_p (rtx x
)
22644 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
22645 if (!CONST_INT_P (x
))
22648 HOST_WIDE_INT val
= INTVAL (x
);
22650 return IN_RANGE (val
, -0x80, 0x7f);
22651 return IN_RANGE (val
, -0x8000, 0x7f00);
22654 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22655 SIGNED_P says whether the operand is signed rather than unsigned. */
22658 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
22660 x
= unwrap_const_vec_duplicate (x
);
22661 return (CONST_INT_P (x
)
22663 ? IN_RANGE (INTVAL (x
), -16, 15)
22664 : IN_RANGE (INTVAL (x
), 0, 127)));
22667 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22668 instruction. Negate X first if NEGATE_P is true. */
22671 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
22676 if (!const_vec_duplicate_p (x
, &elt
)
22677 || !CONST_DOUBLE_P (elt
))
22680 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
22683 r
= real_value_negate (&r
);
22685 if (real_equal (&r
, &dconst1
))
22687 if (real_equal (&r
, &dconsthalf
))
22692 /* Return true if X is a valid immediate operand for an SVE FMUL
22696 aarch64_sve_float_mul_immediate_p (rtx x
)
22700 return (const_vec_duplicate_p (x
, &elt
)
22701 && CONST_DOUBLE_P (elt
)
22702 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
22703 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
22706 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22707 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22708 is nonnull, use it to describe valid immediates. */
22710 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
22711 simd_immediate_info
*info
,
22712 enum simd_immediate_check which
,
22713 simd_immediate_info::insn_type insn
)
22715 /* Try a 4-byte immediate with LSL. */
22716 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
22717 if ((val32
& (0xff << shift
)) == val32
)
22720 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
22721 simd_immediate_info::LSL
, shift
);
22725 /* Try a 2-byte immediate with LSL. */
22726 unsigned int imm16
= val32
& 0xffff;
22727 if (imm16
== (val32
>> 16))
22728 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
22729 if ((imm16
& (0xff << shift
)) == imm16
)
22732 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
22733 simd_immediate_info::LSL
, shift
);
22737 /* Try a 4-byte immediate with MSL, except for cases that MVN
22739 if (which
== AARCH64_CHECK_MOV
)
22740 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
22742 unsigned int low
= (1 << shift
) - 1;
22743 if (((val32
& (0xff << shift
)) | low
) == val32
)
22746 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
22747 simd_immediate_info::MSL
, shift
);
22755 /* Return true if replicating VAL64 is a valid immediate for the
22756 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22757 use it to describe valid immediates. */
22759 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
22760 simd_immediate_info
*info
,
22761 enum simd_immediate_check which
)
22763 unsigned int val32
= val64
& 0xffffffff;
22764 unsigned int val16
= val64
& 0xffff;
22765 unsigned int val8
= val64
& 0xff;
22767 if (val32
== (val64
>> 32))
22769 if ((which
& AARCH64_CHECK_ORR
) != 0
22770 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
22771 simd_immediate_info::MOV
))
22774 if ((which
& AARCH64_CHECK_BIC
) != 0
22775 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
22776 simd_immediate_info::MVN
))
22779 /* Try using a replicated byte. */
22780 if (which
== AARCH64_CHECK_MOV
22781 && val16
== (val32
>> 16)
22782 && val8
== (val16
>> 8))
22785 *info
= simd_immediate_info (QImode
, val8
);
22790 /* Try using a bit-to-bytemask. */
22791 if (which
== AARCH64_CHECK_MOV
)
22794 for (i
= 0; i
< 64; i
+= 8)
22796 unsigned char byte
= (val64
>> i
) & 0xff;
22797 if (byte
!= 0 && byte
!= 0xff)
22803 *info
= simd_immediate_info (DImode
, val64
);
22810 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22811 instruction. If INFO is nonnull, use it to describe valid immediates. */
22814 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
22815 simd_immediate_info
*info
)
22817 scalar_int_mode mode
= DImode
;
22818 unsigned int val32
= val64
& 0xffffffff;
22819 if (val32
== (val64
>> 32))
22822 unsigned int val16
= val32
& 0xffff;
22823 if (val16
== (val32
>> 16))
22826 unsigned int val8
= val16
& 0xff;
22827 if (val8
== (val16
>> 8))
22831 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
22832 if (IN_RANGE (val
, -0x80, 0x7f))
22834 /* DUP with no shift. */
22836 *info
= simd_immediate_info (mode
, val
);
22839 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
22841 /* DUP with LSL #8. */
22843 *info
= simd_immediate_info (mode
, val
);
22846 if (aarch64_bitmask_imm (val64
, mode
))
22850 *info
= simd_immediate_info (mode
, val
);
22856 /* Return true if X is an UNSPEC_PTRUE constant of the form:
22858 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
22860 where PATTERN is the svpattern as a CONST_INT and where ZERO
22861 is a zero constant of the required PTRUE mode (which can have
22862 fewer elements than X's mode, if zero bits are significant).
22864 If so, and if INFO is nonnull, describe the immediate in INFO. */
22866 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
22868 if (GET_CODE (x
) != CONST
)
22872 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
22877 aarch64_svpattern pattern
22878 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
22879 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
22880 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
22881 *info
= simd_immediate_info (int_mode
, pattern
);
22886 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
22887 it to describe valid immediates. */
22890 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
22892 if (aarch64_sve_ptrue_svpattern_p (x
, info
))
22895 if (x
== CONST0_RTX (GET_MODE (x
)))
22898 *info
= simd_immediate_info (DImode
, 0);
22902 /* Analyze the value as a VNx16BImode. This should be relatively
22903 efficient, since rtx_vector_builder has enough built-in capacity
22904 to store all VLA predicate constants without needing the heap. */
22905 rtx_vector_builder builder
;
22906 if (!aarch64_get_sve_pred_bits (builder
, x
))
22909 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
22910 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
22912 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
22913 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
22914 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
22918 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
22919 *info
= simd_immediate_info (int_mode
, pattern
);
22927 /* Return true if OP is a valid SIMD immediate for the operation
22928 described by WHICH. If INFO is nonnull, use it to describe valid
22931 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
22932 enum simd_immediate_check which
)
22934 machine_mode mode
= GET_MODE (op
);
22935 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
22936 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
22939 if ((vec_flags
& VEC_ADVSIMD
) && !TARGET_SIMD
)
22942 if (vec_flags
== (VEC_SVE_PRED
| VEC_STRUCT
))
22943 return op
== CONST0_RTX (mode
) || op
== CONSTM1_RTX (mode
);
22945 if (vec_flags
& VEC_SVE_PRED
)
22946 return aarch64_sve_pred_valid_immediate (op
, info
);
22948 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
22950 unsigned int n_elts
;
22951 if (CONST_VECTOR_P (op
)
22952 && CONST_VECTOR_DUPLICATE_P (op
))
22953 n_elts
= CONST_VECTOR_NPATTERNS (op
);
22954 else if ((vec_flags
& VEC_SVE_DATA
)
22955 && const_vec_series_p (op
, &base
, &step
))
22957 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
22958 if (!aarch64_sve_index_immediate_p (base
)
22959 || !aarch64_sve_index_immediate_p (step
))
22964 /* Get the corresponding container mode. E.g. an INDEX on V2SI
22965 should yield two integer values per 128-bit block, meaning
22966 that we need to treat it in the same way as V2DI and then
22967 ignore the upper 32 bits of each element. */
22968 elt_mode
= aarch64_sve_container_int_mode (mode
);
22969 *info
= simd_immediate_info (elt_mode
, base
, step
);
22973 else if (CONST_VECTOR_P (op
)
22974 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
22975 /* N_ELTS set above. */;
22979 scalar_float_mode elt_float_mode
;
22981 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
22983 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
22984 if (aarch64_float_const_zero_rtx_p (elt
)
22985 || aarch64_float_const_representable_p (elt
))
22988 *info
= simd_immediate_info (elt_float_mode
, elt
);
22993 /* If all elements in an SVE vector have the same value, we have a free
22994 choice between using the element mode and using the container mode.
22995 Using the element mode means that unused parts of the vector are
22996 duplicates of the used elements, while using the container mode means
22997 that the unused parts are an extension of the used elements. Using the
22998 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
22999 for its container mode VNx4SI while 0x00000101 isn't.
23001 If not all elements in an SVE vector have the same value, we need the
23002 transition from one element to the next to occur at container boundaries.
23003 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
23004 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
23005 scalar_int_mode elt_int_mode
;
23006 if ((vec_flags
& VEC_SVE_DATA
) && n_elts
> 1)
23007 elt_int_mode
= aarch64_sve_container_int_mode (mode
);
23009 elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
23011 unsigned int elt_size
= GET_MODE_SIZE (elt_int_mode
);
23015 /* Expand the vector constant out into a byte vector, with the least
23016 significant byte of the register first. */
23017 auto_vec
<unsigned char, 16> bytes
;
23018 bytes
.reserve (n_elts
* elt_size
);
23019 for (unsigned int i
= 0; i
< n_elts
; i
++)
23021 /* The vector is provided in gcc endian-neutral fashion.
23022 For aarch64_be Advanced SIMD, it must be laid out in the vector
23023 register in reverse order. */
23024 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
23025 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
23027 if (elt_mode
!= elt_int_mode
)
23028 elt
= gen_lowpart (elt_int_mode
, elt
);
23030 if (!CONST_INT_P (elt
))
23033 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
23034 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
23036 bytes
.quick_push (elt_val
& 0xff);
23037 elt_val
>>= BITS_PER_UNIT
;
23041 /* The immediate must repeat every eight bytes. */
23042 unsigned int nbytes
= bytes
.length ();
23043 for (unsigned i
= 8; i
< nbytes
; ++i
)
23044 if (bytes
[i
] != bytes
[i
- 8])
23047 /* Get the repeating 8-byte value as an integer. No endian correction
23048 is needed here because bytes is already in lsb-first order. */
23049 unsigned HOST_WIDE_INT val64
= 0;
23050 for (unsigned int i
= 0; i
< 8; i
++)
23051 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
23052 << (i
* BITS_PER_UNIT
));
23054 if (vec_flags
& VEC_SVE_DATA
)
23055 return aarch64_sve_valid_immediate (val64
, info
);
23057 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
23060 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
23061 has a step in the range of INDEX. Return the index expression if so,
23062 otherwise return null. */
23064 aarch64_check_zero_based_sve_index_immediate (rtx x
)
23067 if (const_vec_series_p (x
, &base
, &step
)
23068 && base
== const0_rtx
23069 && aarch64_sve_index_immediate_p (step
))
23074 /* Check of immediate shift constants are within range. */
23076 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
23078 x
= unwrap_const_vec_duplicate (x
);
23079 if (!CONST_INT_P (x
))
23081 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
23083 return IN_RANGE (INTVAL (x
), 0, bit_width
- 1);
23085 return IN_RANGE (INTVAL (x
), 1, bit_width
);
23088 /* Return the bitmask CONST_INT to select the bits required by a zero extract
23089 operation of width WIDTH at bit position POS. */
23092 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
23094 gcc_assert (CONST_INT_P (width
));
23095 gcc_assert (CONST_INT_P (pos
));
23097 unsigned HOST_WIDE_INT mask
23098 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
23099 return GEN_INT (mask
<< UINTVAL (pos
));
23103 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
23105 if (GET_CODE (x
) == HIGH
23106 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
23109 if (CONST_INT_P (x
))
23112 if (VECTOR_MODE_P (GET_MODE (x
)))
23114 /* Require predicate constants to be VNx16BI before RA, so that we
23115 force everything to have a canonical form. */
23116 if (!lra_in_progress
23117 && !reload_completed
23118 && aarch64_sve_pred_mode_p (GET_MODE (x
))
23119 && known_eq (GET_MODE_SIZE (GET_MODE (x
)), BYTES_PER_SVE_PRED
)
23120 && GET_MODE (x
) != VNx16BImode
)
23123 return aarch64_simd_valid_immediate (x
, NULL
);
23126 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
23127 x
= strip_salt (x
);
23129 /* GOT accesses are valid moves. */
23130 if (SYMBOL_REF_P (x
)
23131 && aarch64_classify_symbolic_expression (x
) == SYMBOL_SMALL_GOT_4G
)
23134 if (SYMBOL_REF_P (x
) && mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
23138 && (aarch64_sve_cnt_immediate_p (x
)
23139 || aarch64_sve_rdvl_immediate_p (x
)))
23142 if (aarch64_rdsvl_immediate_p (x
))
23145 return aarch64_classify_symbolic_expression (x
)
23146 == SYMBOL_TINY_ABSOLUTE
;
23149 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
23150 caches instructions that set up such registers, so that they can be
23151 reused by future calls. */
23154 aarch64_get_shareable_reg (rtx_insn
**cached_insn
, rtx value
)
23156 rtx_insn
*insn
= *cached_insn
;
23157 if (insn
&& INSN_P (insn
) && !insn
->deleted ())
23159 rtx pat
= PATTERN (insn
);
23160 if (GET_CODE (pat
) == SET
)
23162 rtx dest
= SET_DEST (pat
);
23164 && !HARD_REGISTER_P (dest
)
23165 && rtx_equal_p (SET_SRC (pat
), value
))
23169 rtx reg
= gen_reg_rtx (GET_MODE (value
));
23170 *cached_insn
= emit_insn_before (gen_rtx_SET (reg
, value
),
23171 function_beg_insn
);
23175 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
23176 the constant creation. */
23179 aarch64_gen_shareable_zero (machine_mode mode
)
23181 rtx reg
= aarch64_get_shareable_reg (&cfun
->machine
->advsimd_zero_insn
,
23182 CONST0_RTX (V4SImode
));
23183 return lowpart_subreg (mode
, reg
, GET_MODE (reg
));
23186 /* INSN is some form of extension or shift that can be split into a
23187 permutation involving a shared zero. Return true if we should
23188 perform such a split.
23190 ??? For now, make sure that the split instruction executes more
23191 frequently than the zero that feeds it. In future it would be good
23192 to split without that restriction and instead recombine shared zeros
23193 if they turn out not to be worthwhile. This would allow splits in
23194 single-block functions and would also cope more naturally with
23195 rematerialization. The downside of not doing this is that we lose the
23196 optimizations for vector epilogues as well. */
23199 aarch64_split_simd_shift_p (rtx_insn
*insn
)
23201 return (can_create_pseudo_p ()
23202 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn
))
23203 && (ENTRY_BLOCK_PTR_FOR_FN (cfun
)->count
23204 < BLOCK_FOR_INSN (insn
)->count
));
23207 /* Return a const_int vector of VAL. */
23209 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
23211 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
23212 return gen_const_vec_duplicate (mode
, c
);
23215 /* Check OP is a legal scalar immediate for the MOVI instruction. */
23218 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
23220 machine_mode vmode
;
23222 vmode
= aarch64_simd_container_mode (mode
, 64);
23223 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
23224 return aarch64_simd_valid_immediate (op_v
, NULL
);
23227 /* Construct and return a PARALLEL RTX vector with elements numbering the
23228 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
23229 the vector - from the perspective of the architecture. This does not
23230 line up with GCC's perspective on lane numbers, so we end up with
23231 different masks depending on our target endian-ness. The diagram
23232 below may help. We must draw the distinction when building masks
23233 which select one half of the vector. An instruction selecting
23234 architectural low-lanes for a big-endian target, must be described using
23235 a mask selecting GCC high-lanes.
23237 Big-Endian Little-Endian
23239 GCC 0 1 2 3 3 2 1 0
23240 | x | x | x | x | | x | x | x | x |
23241 Architecture 3 2 1 0 3 2 1 0
23243 Low Mask: { 2, 3 } { 0, 1 }
23244 High Mask: { 0, 1 } { 2, 3 }
23246 MODE Is the mode of the vector and NUNITS is the number of units in it. */
23249 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
23251 rtvec v
= rtvec_alloc (nunits
/ 2);
23252 int high_base
= nunits
/ 2;
23258 if (BYTES_BIG_ENDIAN
)
23259 base
= high
? low_base
: high_base
;
23261 base
= high
? high_base
: low_base
;
23263 for (i
= 0; i
< nunits
/ 2; i
++)
23264 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
23266 t1
= gen_rtx_PARALLEL (mode
, v
);
23270 /* Check OP for validity as a PARALLEL RTX vector with elements
23271 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
23272 from the perspective of the architecture. See the diagram above
23273 aarch64_simd_vect_par_cnst_half for more details. */
23276 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
23280 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
23283 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
23284 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
23285 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
23288 if (count_op
!= count_ideal
)
23291 for (i
= 0; i
< count_ideal
; i
++)
23293 rtx elt_op
= XVECEXP (op
, 0, i
);
23294 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
23296 if (!CONST_INT_P (elt_op
)
23297 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
23303 /* Return a PARALLEL containing NELTS elements, with element I equal
23304 to BASE + I * STEP. */
23307 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
23309 rtvec vec
= rtvec_alloc (nelts
);
23310 for (unsigned int i
= 0; i
< nelts
; ++i
)
23311 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
23312 return gen_rtx_PARALLEL (VOIDmode
, vec
);
23315 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23316 series with step STEP. */
23319 aarch64_stepped_int_parallel_p (rtx op
, int step
)
23321 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
23324 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
23325 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
23326 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
23327 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
23333 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23334 sequence of strided registers, with the stride being equal STRIDE.
23335 The operands are already known to be FPRs. */
23337 aarch64_strided_registers_p (rtx
*operands
, unsigned int num_operands
,
23338 unsigned int stride
)
23340 for (unsigned int i
= 1; i
< num_operands
; ++i
)
23341 if (REGNO (operands
[i
]) != REGNO (operands
[0]) + i
* stride
)
23346 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23347 HIGH (exclusive). */
23349 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
23352 HOST_WIDE_INT lane
;
23353 gcc_assert (CONST_INT_P (operand
));
23354 lane
= INTVAL (operand
);
23356 if (lane
< low
|| lane
>= high
)
23359 error_at (EXPR_LOCATION (exp
), "lane %wd out of range %wd - %wd",
23360 lane
, low
, high
- 1);
23362 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
23366 /* Peform endian correction on lane number N, which indexes a vector
23367 of mode MODE, and return the result as an SImode rtx. */
23370 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
23372 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
23375 /* Return TRUE if OP is a valid vector addressing mode. */
23378 aarch64_simd_mem_operand_p (rtx op
)
23380 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
23381 || REG_P (XEXP (op
, 0)));
23384 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23387 aarch64_sve_ld1r_operand_p (rtx op
)
23389 struct aarch64_address_info addr
;
23393 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
23394 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
23395 && addr
.type
== ADDRESS_REG_IMM
23396 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
23399 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23400 where the size of the read data is specified by `mode` and the size of the
23401 vector elements are specified by `elem_mode`. */
23403 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op
, machine_mode mode
,
23404 scalar_mode elem_mode
)
23406 struct aarch64_address_info addr
;
23408 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
23411 if (addr
.type
== ADDRESS_REG_IMM
)
23412 return offset_4bit_signed_scaled_p (mode
, addr
.const_offset
);
23414 if (addr
.type
== ADDRESS_REG_REG
)
23415 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
23420 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23422 aarch64_sve_ld1rq_operand_p (rtx op
)
23424 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, TImode
,
23425 GET_MODE_INNER (GET_MODE (op
)));
23428 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23429 accessing a vector where the element size is specified by `elem_mode`. */
23431 aarch64_sve_ld1ro_operand_p (rtx op
, scalar_mode elem_mode
)
23433 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, OImode
, elem_mode
);
23436 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23438 aarch64_sve_ldff1_operand_p (rtx op
)
23443 struct aarch64_address_info addr
;
23444 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
), false))
23447 if (addr
.type
== ADDRESS_REG_IMM
)
23448 return known_eq (addr
.const_offset
, 0);
23450 return addr
.type
== ADDRESS_REG_REG
;
23453 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23455 aarch64_sve_ldnf1_operand_p (rtx op
)
23457 struct aarch64_address_info addr
;
23460 && aarch64_classify_address (&addr
, XEXP (op
, 0),
23461 GET_MODE (op
), false)
23462 && addr
.type
== ADDRESS_REG_IMM
);
23465 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23466 The conditions for STR are the same. */
23468 aarch64_sve_ldr_operand_p (rtx op
)
23470 struct aarch64_address_info addr
;
23473 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
23474 false, ADDR_QUERY_ANY
)
23475 && addr
.type
== ADDRESS_REG_IMM
);
23478 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23479 addressing memory of mode MODE. */
23481 aarch64_sve_prefetch_operand_p (rtx op
, machine_mode mode
)
23483 struct aarch64_address_info addr
;
23484 if (!aarch64_classify_address (&addr
, op
, mode
, false, ADDR_QUERY_ANY
))
23487 if (addr
.type
== ADDRESS_REG_IMM
)
23488 return offset_6bit_signed_scaled_p (mode
, addr
.const_offset
);
23490 return addr
.type
== ADDRESS_REG_REG
;
23493 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23494 We need to be able to access the individual pieces, so the range
23495 is different from LD[234] and ST[234]. */
23497 aarch64_sve_struct_memory_operand_p (rtx op
)
23502 machine_mode mode
= GET_MODE (op
);
23503 struct aarch64_address_info addr
;
23504 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
23506 || addr
.type
!= ADDRESS_REG_IMM
)
23509 poly_int64 first
= addr
.const_offset
;
23510 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
23511 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
23512 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
23515 /* Return true if OFFSET is a constant integer and if VNUM is
23516 OFFSET * the number of bytes in an SVE vector. This is the requirement
23517 that exists in SME LDR and STR instructions, where the VL offset must
23518 equal the ZA slice offset. */
23520 aarch64_sme_ldr_vnum_offset_p (rtx offset
, rtx vnum
)
23522 if (!CONST_INT_P (offset
) || !IN_RANGE (INTVAL (offset
), 0, 15))
23525 if (TARGET_STREAMING
)
23527 poly_int64 const_vnum
;
23528 return (poly_int_rtx_p (vnum
, &const_vnum
)
23529 && known_eq (const_vnum
,
23530 INTVAL (offset
) * BYTES_PER_SVE_VECTOR
));
23534 HOST_WIDE_INT factor
;
23535 return (aarch64_sme_vq_unspec_p (vnum
, &factor
)
23536 && factor
== INTVAL (offset
) * 16);
23540 /* Emit a register copy from operand to operand, taking care not to
23541 early-clobber source registers in the process.
23543 COUNT is the number of components into which the copy needs to be
23546 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
23547 unsigned int count
)
23550 int rdest
= REGNO (operands
[0]);
23551 int rsrc
= REGNO (operands
[1]);
23553 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
23555 for (i
= 0; i
< count
; i
++)
23556 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
23557 gen_rtx_REG (mode
, rsrc
+ i
));
23559 for (i
= 0; i
< count
; i
++)
23560 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
23561 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
23564 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23565 one of VSTRUCT modes: OI, CI, or XI. */
23567 aarch64_simd_attr_length_rglist (machine_mode mode
)
23569 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23570 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
23573 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23574 alignment of a vector to 128 bits. SVE predicates have an alignment of
23576 static HOST_WIDE_INT
23577 aarch64_simd_vector_alignment (const_tree type
)
23579 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23580 be set for non-predicate vectors of booleans. Modes are the most
23581 direct way we have of identifying real SVE predicate types. */
23582 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
23584 widest_int min_size
23585 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type
)));
23586 return wi::umin (min_size
, 128).to_uhwi ();
23589 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23591 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
23593 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
23595 /* If the length of the vector is a fixed power of 2, try to align
23596 to that length, otherwise don't try to align at all. */
23597 HOST_WIDE_INT result
;
23598 if (!GET_MODE_BITSIZE (TYPE_MODE (type
)).is_constant (&result
)
23599 || !pow2p_hwi (result
))
23600 result
= TYPE_ALIGN (TREE_TYPE (type
));
23603 return TYPE_ALIGN (type
);
23606 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23608 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
23613 /* For fixed-length vectors, check that the vectorizer will aim for
23614 full-vector alignment. This isn't true for generic GCC vectors
23615 that are wider than the ABI maximum of 128 bits. */
23616 poly_uint64 preferred_alignment
=
23617 aarch64_vectorize_preferred_vector_alignment (type
);
23618 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
23619 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
23620 preferred_alignment
))
23623 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23627 /* Return true if the vector misalignment factor is supported by the
23630 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
23631 const_tree type
, int misalignment
,
23634 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
23636 /* Return if movmisalign pattern is not supported for this mode. */
23637 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
23640 /* Misalignment factor is unknown at compile time. */
23641 if (misalignment
== -1)
23644 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
23648 /* If VALS is a vector constant that can be loaded into a register
23649 using DUP, generate instructions to do so and return an RTX to
23650 assign to the register. Otherwise return NULL_RTX. */
23652 aarch64_simd_dup_constant (rtx vals
)
23654 machine_mode mode
= GET_MODE (vals
);
23655 machine_mode inner_mode
= GET_MODE_INNER (mode
);
23658 if (!const_vec_duplicate_p (vals
, &x
))
23661 /* We can load this constant by using DUP and a constant in a
23662 single ARM register. This will be cheaper than a vector
23664 x
= force_reg (inner_mode
, x
);
23665 return gen_vec_duplicate (mode
, x
);
23669 /* Generate code to load VALS, which is a PARALLEL containing only
23670 constants (for vec_init) or CONST_VECTOR, efficiently into a
23671 register. Returns an RTX to copy into the register, or NULL_RTX
23672 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23674 aarch64_simd_make_constant (rtx vals
)
23676 machine_mode mode
= GET_MODE (vals
);
23678 rtx const_vec
= NULL_RTX
;
23682 if (CONST_VECTOR_P (vals
))
23684 else if (GET_CODE (vals
) == PARALLEL
)
23686 /* A CONST_VECTOR must contain only CONST_INTs and
23687 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
23688 Only store valid constants in a CONST_VECTOR. */
23689 int n_elts
= XVECLEN (vals
, 0);
23690 for (i
= 0; i
< n_elts
; ++i
)
23692 rtx x
= XVECEXP (vals
, 0, i
);
23693 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
23696 if (n_const
== n_elts
)
23697 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
23700 gcc_unreachable ();
23702 if (const_vec
!= NULL_RTX
23703 && aarch64_simd_valid_immediate (const_vec
, NULL
))
23704 /* Load using MOVI/MVNI. */
23706 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
23707 /* Loaded using DUP. */
23709 else if (const_vec
!= NULL_RTX
)
23710 /* Load from constant pool. We cannot take advantage of single-cycle
23711 LD1 because we need a PC-relative addressing mode. */
23714 /* A PARALLEL containing something not valid inside CONST_VECTOR.
23715 We cannot construct an initializer. */
23719 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23720 The caller has already tried a divide-and-conquer approach, so do
23721 not consider that case here. */
23724 aarch64_expand_vector_init_fallback (rtx target
, rtx vals
)
23726 machine_mode mode
= GET_MODE (target
);
23727 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
23728 /* The number of vector elements. */
23729 int n_elts
= XVECLEN (vals
, 0);
23730 /* The number of vector elements which are not constant. */
23732 rtx any_const
= NULL_RTX
;
23733 /* The first element of vals. */
23734 rtx v0
= XVECEXP (vals
, 0, 0);
23735 bool all_same
= true;
23737 /* This is a special vec_init<M><N> where N is not an element mode but a
23738 vector mode with half the elements of M. We expect to find two entries
23739 of mode N in VALS and we must put their concatentation into TARGET. */
23740 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
23742 machine_mode narrow_mode
= GET_MODE (XVECEXP (vals
, 0, 0));
23743 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
23744 && known_eq (GET_MODE_SIZE (mode
),
23745 2 * GET_MODE_SIZE (narrow_mode
)));
23746 emit_insn (gen_aarch64_vec_concat (narrow_mode
, target
,
23747 XVECEXP (vals
, 0, 0),
23748 XVECEXP (vals
, 0, 1)));
23752 /* Count the number of variable elements to initialise. */
23753 for (int i
= 0; i
< n_elts
; ++i
)
23755 rtx x
= XVECEXP (vals
, 0, i
);
23756 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
23761 all_same
&= rtx_equal_p (x
, v0
);
23764 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23765 how best to handle this. */
23768 rtx constant
= aarch64_simd_make_constant (vals
);
23769 if (constant
!= NULL_RTX
)
23771 emit_move_insn (target
, constant
);
23776 /* Splat a single non-constant element if we can. */
23779 rtx x
= force_reg (inner_mode
, v0
);
23780 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
23784 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
23785 gcc_assert (icode
!= CODE_FOR_nothing
);
23787 /* If there are only variable elements, try to optimize
23788 the insertion using dup for the most common element
23789 followed by insertions. */
23791 /* The algorithm will fill matches[*][0] with the earliest matching element,
23792 and matches[X][1] with the count of duplicate elements (if X is the
23793 earliest element which has duplicates). */
23795 if (n_var
>= n_elts
- 1 && n_elts
<= 16)
23797 int matches
[16][2] = {0};
23798 for (int i
= 0; i
< n_elts
; i
++)
23800 for (int j
= 0; j
<= i
; j
++)
23802 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
23810 int maxelement
= 0;
23812 rtx const_elem
= NULL_RTX
;
23813 int const_elem_pos
= 0;
23815 for (int i
= 0; i
< n_elts
; i
++)
23817 if (matches
[i
][1] > maxv
)
23820 maxv
= matches
[i
][1];
23822 if (CONST_INT_P (XVECEXP (vals
, 0, i
))
23823 || CONST_DOUBLE_P (XVECEXP (vals
, 0, i
)))
23825 const_elem_pos
= i
;
23826 const_elem
= XVECEXP (vals
, 0, i
);
23830 /* Create a duplicate of the most common element, unless all elements
23831 are equally useless to us, in which case just immediately set the
23832 vector register using the first element. */
23836 /* For vectors of two 64-bit elements, we can do even better. */
23838 && (inner_mode
== E_DImode
23839 || inner_mode
== E_DFmode
))
23842 rtx x0
= XVECEXP (vals
, 0, 0);
23843 rtx x1
= XVECEXP (vals
, 0, 1);
23844 /* Combine can pick up this case, but handling it directly
23845 here leaves clearer RTL.
23847 This is load_pair_lanes<mode>, and also gives us a clean-up
23848 for store_pair_lanes<mode>. */
23849 if (memory_operand (x0
, inner_mode
)
23850 && memory_operand (x1
, inner_mode
)
23851 && aarch64_mergeable_load_pair_p (mode
, x0
, x1
))
23854 if (inner_mode
== DFmode
)
23855 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
23857 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
23862 /* The subreg-move sequence below will move into lane zero of the
23863 vector register. For big-endian we want that position to hold
23864 the last element of VALS. */
23865 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
23867 /* If we have a single constant element, use that for duplicating
23871 maxelement
= const_elem_pos
;
23872 aarch64_emit_move (target
, gen_vec_duplicate (mode
, const_elem
));
23876 rtx x
= force_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
23877 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
23882 rtx x
= force_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
23883 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
23886 /* Insert the rest. */
23887 for (int i
= 0; i
< n_elts
; i
++)
23889 rtx x
= XVECEXP (vals
, 0, i
);
23890 if (matches
[i
][0] == maxelement
)
23892 x
= force_reg (inner_mode
, x
);
23893 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
23898 /* Initialise a vector which is part-variable. We want to first try
23899 to build those lanes which are constant in the most efficient way we
23901 if (n_var
!= n_elts
)
23903 rtx copy
= copy_rtx (vals
);
23905 /* Load constant part of vector. We really don't care what goes into the
23906 parts we will overwrite, but we're more likely to be able to load the
23907 constant efficiently if it has fewer, larger, repeating parts
23908 (see aarch64_simd_valid_immediate). */
23909 for (int i
= 0; i
< n_elts
; i
++)
23911 rtx x
= XVECEXP (vals
, 0, i
);
23912 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
23914 rtx subst
= any_const
;
23915 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
23917 /* Look in the copied vector, as more elements are const. */
23918 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
23919 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
23925 XVECEXP (copy
, 0, i
) = subst
;
23927 aarch64_expand_vector_init_fallback (target
, copy
);
23930 /* Insert the variable lanes directly. */
23931 for (int i
= 0; i
< n_elts
; i
++)
23933 rtx x
= XVECEXP (vals
, 0, i
);
23934 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
23936 x
= force_reg (inner_mode
, x
);
23937 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
23941 /* Return even or odd half of VALS depending on EVEN_P. */
23944 aarch64_unzip_vector_init (machine_mode mode
, rtx vals
, bool even_p
)
23946 int n
= XVECLEN (vals
, 0);
23947 machine_mode new_mode
23948 = aarch64_simd_container_mode (GET_MODE_INNER (mode
),
23949 GET_MODE_BITSIZE (mode
).to_constant () / 2);
23950 rtvec vec
= rtvec_alloc (n
/ 2);
23951 for (int i
= 0; i
< n
/ 2; i
++)
23952 RTVEC_ELT (vec
, i
) = (even_p
) ? XVECEXP (vals
, 0, 2 * i
)
23953 : XVECEXP (vals
, 0, 2 * i
+ 1);
23954 return gen_rtx_PARALLEL (new_mode
, vec
);
23957 /* Return true if SET is a scalar move. */
23960 scalar_move_insn_p (rtx set
)
23962 rtx src
= SET_SRC (set
);
23963 rtx dest
= SET_DEST (set
);
23964 return (is_a
<scalar_mode
> (GET_MODE (dest
))
23965 && aarch64_mov_operand (src
, GET_MODE (dest
)));
23968 /* Similar to seq_cost, but ignore cost for scalar moves. */
23971 seq_cost_ignoring_scalar_moves (const rtx_insn
*seq
, bool speed
)
23975 for (; seq
; seq
= NEXT_INSN (seq
))
23976 if (NONDEBUG_INSN_P (seq
))
23978 if (rtx set
= single_set (seq
))
23980 if (!scalar_move_insn_p (set
))
23981 cost
+= set_rtx_cost (set
, speed
);
23985 int this_cost
= insn_cost (CONST_CAST_RTX_INSN (seq
), speed
);
23996 /* Expand a vector initialization sequence, such that TARGET is
23997 initialized to contain VALS. */
24000 aarch64_expand_vector_init (rtx target
, rtx vals
)
24002 /* Try decomposing the initializer into even and odd halves and
24003 then ZIP them together. Use the resulting sequence if it is
24004 strictly cheaper than loading VALS directly.
24006 Prefer the fallback sequence in the event of a tie, since it
24007 will tend to use fewer registers. */
24009 machine_mode mode
= GET_MODE (target
);
24010 int n_elts
= XVECLEN (vals
, 0);
24013 || maybe_ne (GET_MODE_BITSIZE (mode
), 128))
24015 aarch64_expand_vector_init_fallback (target
, vals
);
24022 for (int i
= 0; i
< 2; i
++)
24025 rtx new_vals
= aarch64_unzip_vector_init (mode
, vals
, i
== 0);
24026 rtx tmp_reg
= gen_reg_rtx (GET_MODE (new_vals
));
24027 aarch64_expand_vector_init (tmp_reg
, new_vals
);
24028 halves
[i
] = gen_rtx_SUBREG (mode
, tmp_reg
, 0);
24029 rtx_insn
*rec_seq
= get_insns ();
24031 costs
[i
] = seq_cost_ignoring_scalar_moves (rec_seq
, !optimize_size
);
24032 emit_insn (rec_seq
);
24035 rtvec v
= gen_rtvec (2, halves
[0], halves
[1]);
24036 rtx_insn
*zip1_insn
24037 = emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
24038 unsigned seq_total_cost
24039 = (!optimize_size
) ? std::max (costs
[0], costs
[1]) : costs
[0] + costs
[1];
24040 seq_total_cost
+= insn_cost (zip1_insn
, !optimize_size
);
24042 rtx_insn
*seq
= get_insns ();
24046 aarch64_expand_vector_init_fallback (target
, vals
);
24047 rtx_insn
*fallback_seq
= get_insns ();
24048 unsigned fallback_seq_cost
24049 = seq_cost_ignoring_scalar_moves (fallback_seq
, !optimize_size
);
24052 emit_insn (seq_total_cost
< fallback_seq_cost
? seq
: fallback_seq
);
24055 /* Emit RTL corresponding to:
24056 insr TARGET, ELEM. */
24059 emit_insr (rtx target
, rtx elem
)
24061 machine_mode mode
= GET_MODE (target
);
24062 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24063 elem
= force_reg (elem_mode
, elem
);
24065 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
24066 gcc_assert (icode
!= CODE_FOR_nothing
);
24067 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
24070 /* Subroutine of aarch64_sve_expand_vector_init for handling
24071 trailing constants.
24072 This function works as follows:
24073 (a) Create a new vector consisting of trailing constants.
24074 (b) Initialize TARGET with the constant vector using emit_move_insn.
24075 (c) Insert remaining elements in TARGET using insr.
24076 NELTS is the total number of elements in original vector while
24077 while NELTS_REQD is the number of elements that are actually
24080 ??? The heuristic used is to do above only if number of constants
24081 is at least half the total number of elements. May need fine tuning. */
24084 aarch64_sve_expand_vector_init_handle_trailing_constants
24085 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
24087 machine_mode mode
= GET_MODE (target
);
24088 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24089 int n_trailing_constants
= 0;
24091 for (int i
= nelts_reqd
- 1;
24092 i
>= 0 && valid_for_const_vector_p (elem_mode
, builder
.elt (i
));
24094 n_trailing_constants
++;
24096 if (n_trailing_constants
>= nelts_reqd
/ 2)
24098 /* Try to use the natural pattern of BUILDER to extend the trailing
24099 constant elements to a full vector. Replace any variables in the
24100 extra elements with zeros.
24102 ??? It would be better if the builders supported "don't care"
24103 elements, with the builder filling in whichever elements
24104 give the most compact encoding. */
24105 rtx_vector_builder
v (mode
, nelts
, 1);
24106 for (int i
= 0; i
< nelts
; i
++)
24108 rtx x
= builder
.elt (i
+ nelts_reqd
- n_trailing_constants
);
24109 if (!valid_for_const_vector_p (elem_mode
, x
))
24110 x
= CONST0_RTX (elem_mode
);
24113 rtx const_vec
= v
.build ();
24114 emit_move_insn (target
, const_vec
);
24116 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
24117 emit_insr (target
, builder
.elt (i
));
24125 /* Subroutine of aarch64_sve_expand_vector_init.
24127 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24128 (b) Skip trailing elements from BUILDER, which are the same as
24129 element NELTS_REQD - 1.
24130 (c) Insert earlier elements in reverse order in TARGET using insr. */
24133 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
24134 const rtx_vector_builder
&builder
,
24137 machine_mode mode
= GET_MODE (target
);
24138 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24140 struct expand_operand ops
[2];
24141 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
24142 gcc_assert (icode
!= CODE_FOR_nothing
);
24144 create_output_operand (&ops
[0], target
, mode
);
24145 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
24146 expand_insn (icode
, 2, ops
);
24148 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
24149 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
24150 emit_insr (target
, builder
.elt (i
));
24153 /* Subroutine of aarch64_sve_expand_vector_init to handle case
24154 when all trailing elements of builder are same.
24155 This works as follows:
24156 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24157 (b) Insert remaining elements in TARGET using insr.
24159 ??? The heuristic used is to do above if number of same trailing elements
24160 is at least 3/4 of total number of elements, loosely based on
24161 heuristic from mostly_zeros_p. May need fine-tuning. */
24164 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24165 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
24167 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
24168 if (ndups
>= (3 * nelts_reqd
) / 4)
24170 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
24171 nelts_reqd
- ndups
+ 1);
24178 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24179 of elements in BUILDER.
24181 The function tries to initialize TARGET from BUILDER if it fits one
24182 of the special cases outlined below.
24184 Failing that, the function divides BUILDER into two sub-vectors:
24185 v_even = even elements of BUILDER;
24186 v_odd = odd elements of BUILDER;
24188 and recursively calls itself with v_even and v_odd.
24190 if (recursive call succeeded for v_even or v_odd)
24191 TARGET = zip (v_even, v_odd)
24193 The function returns true if it managed to build TARGET from BUILDER
24194 with one of the special cases, false otherwise.
24196 Example: {a, 1, b, 2, c, 3, d, 4}
24198 The vector gets divided into:
24199 v_even = {a, b, c, d}
24200 v_odd = {1, 2, 3, 4}
24202 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24203 initialize tmp2 from constant vector v_odd using emit_move_insn.
24205 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24206 4 elements, so we construct tmp1 from v_even using insr:
24213 TARGET = zip (tmp1, tmp2)
24214 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24217 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
24218 int nelts
, int nelts_reqd
)
24220 machine_mode mode
= GET_MODE (target
);
24222 /* Case 1: Vector contains trailing constants. */
24224 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24225 (target
, builder
, nelts
, nelts_reqd
))
24228 /* Case 2: Vector contains leading constants. */
24230 rtx_vector_builder
rev_builder (mode
, nelts_reqd
, 1);
24231 for (int i
= 0; i
< nelts_reqd
; i
++)
24232 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
24233 rev_builder
.finalize ();
24235 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24236 (target
, rev_builder
, nelts
, nelts_reqd
))
24238 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
24242 /* Case 3: Vector contains trailing same element. */
24244 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24245 (target
, builder
, nelts_reqd
))
24248 /* Case 4: Vector contains leading same element. */
24250 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24251 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
24253 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
24257 /* Avoid recursing below 4-elements.
24258 ??? The threshold 4 may need fine-tuning. */
24260 if (nelts_reqd
<= 4)
24263 rtx_vector_builder
v_even (mode
, nelts
, 1);
24264 rtx_vector_builder
v_odd (mode
, nelts
, 1);
24266 for (int i
= 0; i
< nelts
* 2; i
+= 2)
24268 v_even
.quick_push (builder
.elt (i
));
24269 v_odd
.quick_push (builder
.elt (i
+ 1));
24272 v_even
.finalize ();
24275 rtx tmp1
= gen_reg_rtx (mode
);
24276 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
24277 nelts
, nelts_reqd
/ 2);
24279 rtx tmp2
= gen_reg_rtx (mode
);
24280 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
24281 nelts
, nelts_reqd
/ 2);
24283 if (!did_even_p
&& !did_odd_p
)
24286 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24287 special cases and zip v_even, v_odd. */
24290 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
24293 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
24295 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
24296 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
24300 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24303 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
24305 machine_mode mode
= GET_MODE (target
);
24306 int nelts
= XVECLEN (vals
, 0);
24308 rtx_vector_builder
v (mode
, nelts
, 1);
24309 for (int i
= 0; i
< nelts
; i
++)
24310 v
.quick_push (XVECEXP (vals
, 0, i
));
24313 /* If neither sub-vectors of v could be initialized specially,
24314 then use INSR to insert all elements from v into TARGET.
24315 ??? This might not be optimal for vectors with large
24316 initializers like 16-element or above.
24317 For nelts < 4, it probably isn't useful to handle specially. */
24320 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
24321 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
24324 /* Check whether VALUE is a vector constant in which every element
24325 is either a power of 2 or a negated power of 2. If so, return
24326 a constant vector of log2s, and flip CODE between PLUS and MINUS
24327 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24330 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
24332 if (!CONST_VECTOR_P (value
))
24335 rtx_vector_builder builder
;
24336 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
24339 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
24340 /* 1 if the result of the multiplication must be negated,
24341 0 if it mustn't, or -1 if we don't yet care. */
24343 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
24344 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
24346 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
24347 if (!CONST_SCALAR_INT_P (elt
))
24349 rtx_mode_t
val (elt
, int_mode
);
24350 wide_int pow2
= wi::neg (val
);
24353 /* It matters whether we negate or not. Make that choice,
24354 and make sure that it's consistent with previous elements. */
24355 if (negate
== !wi::neg_p (val
))
24357 negate
= wi::neg_p (val
);
24361 /* POW2 is now the value that we want to be a power of 2. */
24362 int shift
= wi::exact_log2 (pow2
);
24365 builder
.quick_push (gen_int_mode (shift
, int_mode
));
24368 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24370 else if (negate
== 1)
24371 code
= code
== PLUS
? MINUS
: PLUS
;
24372 return builder
.build ();
24375 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24376 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24377 operands array, in the same order as for fma_optab. Return true if
24378 the function emitted all the necessary instructions, false if the caller
24379 should generate the pattern normally with the new OPERANDS array. */
24382 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
24384 machine_mode mode
= GET_MODE (operands
[0]);
24385 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
24387 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
24388 NULL_RTX
, true, OPTAB_DIRECT
);
24389 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
24390 operands
[3], product
, operands
[0], true,
24394 operands
[2] = force_reg (mode
, operands
[2]);
24398 /* Likewise, but for a conditional pattern. */
24401 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
24403 machine_mode mode
= GET_MODE (operands
[0]);
24404 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
24406 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
24407 NULL_RTX
, true, OPTAB_DIRECT
);
24408 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
24409 operands
[4], product
, operands
[5]));
24412 operands
[3] = force_reg (mode
, operands
[3]);
24416 static unsigned HOST_WIDE_INT
24417 aarch64_shift_truncation_mask (machine_mode mode
)
24419 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
24421 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
24424 /* Select a format to encode pointers in exception handling data. */
24426 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
24429 switch (aarch64_cmodel
)
24431 case AARCH64_CMODEL_TINY
:
24432 case AARCH64_CMODEL_TINY_PIC
:
24433 case AARCH64_CMODEL_SMALL
:
24434 case AARCH64_CMODEL_SMALL_PIC
:
24435 case AARCH64_CMODEL_SMALL_SPIC
:
24436 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24438 type
= DW_EH_PE_sdata4
;
24441 /* No assumptions here. 8-byte relocs required. */
24442 type
= DW_EH_PE_sdata8
;
24445 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
24448 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24451 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
24453 if (TREE_CODE (decl
) == FUNCTION_DECL
)
24455 arm_pcs pcs
= (arm_pcs
) fndecl_abi (decl
).id ();
24456 if (pcs
== ARM_PCS_SIMD
|| pcs
== ARM_PCS_SVE
)
24458 fprintf (stream
, "\t.variant_pcs\t");
24459 assemble_name (stream
, name
);
24460 fprintf (stream
, "\n");
/* The last .arch and .tune assembly strings that we printed.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;
24469 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24470 by the function fndecl. */
24473 aarch64_declare_function_name (FILE *stream
, const char* name
,
24476 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
24478 struct cl_target_option
*targ_options
;
24480 targ_options
= TREE_TARGET_OPTION (target_parts
);
24482 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
24483 gcc_assert (targ_options
);
24485 const struct processor
*this_arch
24486 = aarch64_get_arch (targ_options
->x_selected_arch
);
24488 auto isa_flags
= targ_options
->x_aarch64_asm_isa_flags
;
24489 std::string extension
24490 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
24492 /* Only update the assembler .arch string if it is distinct from the last
24493 such string we printed. */
24494 std::string to_print
= this_arch
->name
+ extension
;
24495 if (to_print
!= aarch64_last_printed_arch_string
)
24497 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
24498 aarch64_last_printed_arch_string
= to_print
;
24501 /* Print the cpu name we're tuning for in the comments, might be
24502 useful to readers of the generated asm. Do it only when it changes
24503 from function to function and verbose assembly is requested. */
24504 const struct processor
*this_tune
24505 = aarch64_get_tune_cpu (targ_options
->x_selected_tune
);
24507 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
24509 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
24511 aarch64_last_printed_tune_string
= this_tune
->name
;
24514 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
24516 /* Don't forget the type directive for ELF. */
24517 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
24518 ASM_OUTPUT_FUNCTION_LABEL (stream
, name
, fndecl
);
24520 cfun
->machine
->label_is_assembled
= true;
24523 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24526 aarch64_print_patchable_function_entry (FILE *file
,
24527 unsigned HOST_WIDE_INT patch_area_size
,
24530 if (!cfun
->machine
->label_is_assembled
)
24532 /* Emit the patching area before the entry label, if any. */
24533 default_print_patchable_function_entry (file
, patch_area_size
,
24538 rtx pa
= gen_patchable_area (GEN_INT (patch_area_size
),
24539 GEN_INT (record_p
));
24540 basic_block bb
= ENTRY_BLOCK_PTR_FOR_FN (cfun
)->next_bb
;
24542 if (!aarch_bti_enabled ()
24543 || cgraph_node::get (cfun
->decl
)->only_called_directly_p ())
24545 /* Emit the patchable_area at the beginning of the function. */
24546 rtx_insn
*insn
= emit_insn_before (pa
, BB_HEAD (bb
));
24547 INSN_ADDRESSES_NEW (insn
, -1);
24551 rtx_insn
*insn
= next_real_nondebug_insn (get_insns ());
24554 || GET_CODE (PATTERN (insn
)) != UNSPEC_VOLATILE
24555 || XINT (PATTERN (insn
), 1) != UNSPECV_BTI_C
)
24557 /* Emit a BTI_C. */
24558 insn
= emit_insn_before (gen_bti_c (), BB_HEAD (bb
));
24561 /* Emit the patchable_area after BTI_C. */
24562 insn
= emit_insn_after (pa
, insn
);
24563 INSN_ADDRESSES_NEW (insn
, -1);
24566 /* Output patchable area. */
24569 aarch64_output_patchable_area (unsigned int patch_area_size
, bool record_p
)
24571 default_print_patchable_function_entry (asm_out_file
, patch_area_size
,
24575 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24578 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
24580 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
24581 const char *value
= IDENTIFIER_POINTER (target
);
24582 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
24583 ASM_OUTPUT_DEF (stream
, name
, value
);
24586 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24587 function symbol references. */
24590 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
24592 default_elf_asm_output_external (stream
, decl
, name
);
24593 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
24596 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24597 Used to output the .cfi_b_key_frame directive when signing the current
24598 function with the B key. */
24601 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
24603 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
24604 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
24605 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
24608 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24611 aarch64_start_file (void)
24613 struct cl_target_option
*default_options
24614 = TREE_TARGET_OPTION (target_option_default_node
);
24616 const struct processor
*default_arch
24617 = aarch64_get_arch (default_options
->x_selected_arch
);
24618 auto default_isa_flags
= default_options
->x_aarch64_asm_isa_flags
;
24619 std::string extension
24620 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
24621 default_arch
->flags
);
24623 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
24624 aarch64_last_printed_tune_string
= "";
24625 asm_fprintf (asm_out_file
, "\t.arch %s\n",
24626 aarch64_last_printed_arch_string
.c_str ());
24628 default_file_start ();
24631 /* Emit load exclusive. */
24634 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
24635 rtx mem
, rtx model_rtx
)
24637 if (mode
== TImode
)
24638 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
24639 gen_highpart (DImode
, rval
),
24642 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
24645 /* Emit store exclusive. */
24648 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
24649 rtx mem
, rtx rval
, rtx model_rtx
)
24651 if (mode
== TImode
)
24652 emit_insn (gen_aarch64_store_exclusive_pair
24653 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
24654 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
24656 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
24659 /* Mark the previous jump instruction as unlikely. */
24662 aarch64_emit_unlikely_jump (rtx insn
)
24664 rtx_insn
*jump
= emit_jump_insn (insn
);
24665 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
24668 /* We store the names of the various atomic helpers in a 5x5 array.
24669 Return the libcall function given MODE, MODEL and NAMES. */
24672 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
24673 const atomic_ool_names
*names
)
24675 memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
24676 int mode_idx
, model_idx
;
24696 gcc_unreachable ();
24701 case MEMMODEL_RELAXED
:
24704 case MEMMODEL_CONSUME
:
24705 case MEMMODEL_ACQUIRE
:
24708 case MEMMODEL_RELEASE
:
24711 case MEMMODEL_ACQ_REL
:
24712 case MEMMODEL_SEQ_CST
:
24715 case MEMMODEL_SYNC_ACQUIRE
:
24716 case MEMMODEL_SYNC_RELEASE
:
24717 case MEMMODEL_SYNC_SEQ_CST
:
24721 gcc_unreachable ();
24724 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
24725 VISIBILITY_HIDDEN
);
24728 #define DEF0(B, N) \
24729 { "__aarch64_" #B #N "_relax", \
24730 "__aarch64_" #B #N "_acq", \
24731 "__aarch64_" #B #N "_rel", \
24732 "__aarch64_" #B #N "_acq_rel", \
24733 "__aarch64_" #B #N "_sync" }
24735 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24736 { NULL, NULL, NULL, NULL }
24737 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24739 static const atomic_ool_names aarch64_ool_cas_names
= { { DEF5(cas
) } };
24740 const atomic_ool_names aarch64_ool_swp_names
= { { DEF4(swp
) } };
24741 const atomic_ool_names aarch64_ool_ldadd_names
= { { DEF4(ldadd
) } };
24742 const atomic_ool_names aarch64_ool_ldset_names
= { { DEF4(ldset
) } };
24743 const atomic_ool_names aarch64_ool_ldclr_names
= { { DEF4(ldclr
) } };
24744 const atomic_ool_names aarch64_ool_ldeor_names
= { { DEF4(ldeor
) } };
24750 /* Expand a compare and swap pattern. */
24753 aarch64_expand_compare_and_swap (rtx operands
[])
24755 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
24756 machine_mode mode
, r_mode
;
24758 bval
= operands
[0];
24759 rval
= operands
[1];
24761 oldval
= operands
[3];
24762 newval
= operands
[4];
24763 is_weak
= operands
[5];
24764 mod_s
= operands
[6];
24765 mod_f
= operands
[7];
24766 mode
= GET_MODE (mem
);
24768 /* Normally the succ memory model must be stronger than fail, but in the
24769 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24770 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24771 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
24772 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
24773 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
24776 if (mode
== QImode
|| mode
== HImode
)
24779 rval
= gen_reg_rtx (r_mode
);
24784 /* The CAS insn requires oldval and rval overlap, but we need to
24785 have a copy of oldval saved across the operation to tell if
24786 the operation is successful. */
24787 if (reg_overlap_mentioned_p (rval
, oldval
))
24788 rval
= copy_to_mode_reg (r_mode
, oldval
);
24790 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
24791 if (mode
== TImode
)
24792 newval
= force_reg (mode
, newval
);
24794 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
24796 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
24798 else if (TARGET_OUTLINE_ATOMICS
)
24800 /* Oldval must satisfy compare afterward. */
24801 if (!aarch64_plus_operand (oldval
, mode
))
24802 oldval
= force_reg (mode
, oldval
);
24803 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
24804 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
24805 oldval
, mode
, newval
, mode
,
24806 XEXP (mem
, 0), Pmode
);
24807 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
24811 /* The oldval predicate varies by mode. Test it and force to reg. */
24812 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
24813 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
24814 oldval
= force_reg (mode
, oldval
);
24816 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
24817 is_weak
, mod_s
, mod_f
));
24818 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
24821 if (r_mode
!= mode
)
24822 rval
= gen_lowpart (mode
, rval
);
24823 emit_move_insn (operands
[1], rval
);
24825 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
24826 emit_insn (gen_rtx_SET (bval
, x
));
24829 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
24830 sequence implementing an atomic operation. */
24833 aarch64_emit_post_barrier (enum memmodel model
)
24835 const enum memmodel base_model
= memmodel_base (model
);
24837 if (is_mm_sync (model
)
24838 && (base_model
== MEMMODEL_ACQUIRE
24839 || base_model
== MEMMODEL_ACQ_REL
24840 || base_model
== MEMMODEL_SEQ_CST
))
24842 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
24846 /* Split a compare and swap pattern. */
24849 aarch64_split_compare_and_swap (rtx operands
[])
24851 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24852 gcc_assert (epilogue_completed
);
24854 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
24857 rtx_code_label
*label1
, *label2
;
24858 enum memmodel model
;
24860 rval
= operands
[0];
24862 oldval
= operands
[2];
24863 newval
= operands
[3];
24864 model_rtx
= operands
[5];
24865 scratch
= operands
[7];
24866 mode
= GET_MODE (mem
);
24867 model
= memmodel_from_int (INTVAL (model_rtx
));
24868 is_weak
= operands
[4] != const0_rtx
&& mode
!= TImode
;
24870 /* When OLDVAL is zero and we want the strong version we can emit a tighter
24873 LD[A]XR rval, [mem]
24875 ST[L]XR scratch, newval, [mem]
24876 CBNZ scratch, .label1
24879 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
24880 oldval
== const0_rtx
&& mode
!= TImode
);
24885 label1
= gen_label_rtx ();
24886 emit_label (label1
);
24888 label2
= gen_label_rtx ();
24890 /* The initial load can be relaxed for a __sync operation since a final
24891 barrier will be emitted to stop code hoisting. */
24892 if (is_mm_sync (model
))
24893 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
24895 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
24898 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
24901 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
24902 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
24904 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
24905 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
24906 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
24908 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
24912 x
= aarch64_gen_compare_zero_and_branch (NE
, scratch
, label1
);
24913 aarch64_emit_unlikely_jump (x
);
24916 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
24918 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
24919 store the returned value and loop if the STLXP fails. */
24920 if (mode
== TImode
)
24922 rtx_code_label
*label3
= gen_label_rtx ();
24923 emit_jump_insn (gen_rtx_SET (pc_rtx
, gen_rtx_LABEL_REF (Pmode
, label3
)));
24926 emit_label (label2
);
24927 aarch64_emit_store_exclusive (mode
, scratch
, mem
, rval
, model_rtx
);
24929 x
= aarch64_gen_compare_zero_and_branch (NE
, scratch
, label1
);
24930 aarch64_emit_unlikely_jump (x
);
24935 emit_label (label2
);
24937 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
24938 to set the condition flags. If this is not used it will be removed by
24941 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
24943 /* Emit any final barrier needed for a __sync operation. */
24944 if (is_mm_sync (model
))
24945 aarch64_emit_post_barrier (model
);
24948 /* Split an atomic operation. */
24951 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
24952 rtx value
, rtx model_rtx
, rtx cond
)
24954 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24955 gcc_assert (epilogue_completed
);
24957 machine_mode mode
= GET_MODE (mem
);
24958 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
24959 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
24960 const bool is_sync
= is_mm_sync (model
);
24961 rtx_code_label
*label
;
24964 /* Split the atomic operation into a sequence. */
24965 label
= gen_label_rtx ();
24966 emit_label (label
);
24969 new_out
= gen_lowpart (wmode
, new_out
);
24971 old_out
= gen_lowpart (wmode
, old_out
);
24974 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
24976 /* The initial load can be relaxed for a __sync operation since a final
24977 barrier will be emitted to stop code hoisting. */
24979 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
24980 GEN_INT (MEMMODEL_RELAXED
));
24982 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
24991 x
= gen_rtx_AND (wmode
, old_out
, value
);
24992 emit_insn (gen_rtx_SET (new_out
, x
));
24993 x
= gen_rtx_NOT (wmode
, new_out
);
24994 emit_insn (gen_rtx_SET (new_out
, x
));
24998 if (CONST_INT_P (value
))
25000 value
= GEN_INT (-UINTVAL (value
));
25003 /* Fall through. */
25006 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
25007 emit_insn (gen_rtx_SET (new_out
, x
));
25011 aarch64_emit_store_exclusive (mode
, cond
, mem
,
25012 gen_lowpart (mode
, new_out
), model_rtx
);
25014 x
= aarch64_gen_compare_zero_and_branch (NE
, cond
, label
);
25015 aarch64_emit_unlikely_jump (x
);
25017 /* Emit any final barrier needed for a __sync operation. */
25019 aarch64_emit_post_barrier (model
);
/* Register AArch64-specific soft-float library functions.  The HFmode
   optabs are set to NULL so the compiler falls back to widening to
   SFmode; only the SF<->HF conversion helpers get real libfuncs.
   NOTE(review): the return-type line and closing brace of this function
   were lost in extraction of this chunk.  */
25023 aarch64_init_libfuncs (void)
25025 /* Half-precision float operations. The compiler handles all operations
25026 with NULL libfuncs by converting to SFmode. */
/* Conversions between HFmode and SFmode use the __gnu_* IEEE helpers.  */
25029 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
25030 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
/* Arithmetic: NULL means "convert to SFmode and use its libfunc".  */
25033 set_optab_libfunc (add_optab
, HFmode
, NULL
);
25034 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
25035 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
25036 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
25037 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
/* Comparisons likewise fall back to SFmode.  */
25040 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
25041 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
25042 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
25043 set_optab_libfunc (le_optab
, HFmode
, NULL
);
25044 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
25045 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
25046 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
25049 /* Target hook for c_mode_for_suffix. */
/* Map a C literal suffix character to a machine mode.
   NOTE(review): the function body was dropped by the extraction; only the
   signature is visible here.  */
25050 static machine_mode
25051 aarch64_c_mode_for_suffix (char suffix
)
25059 /* We can only represent floating point constants which will fit in
25060 "quarter-precision" values. These values are characterised by
25061 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
25064 (-1)^s * (n/16) * 2^r
25067 's' is the sign bit.
25068 'n' is an integer in the range 16 <= n <= 31.
25069 'r' is an integer in the range -3 <= r <= 4. */
25071 /* Return true iff X can be represented by a quarter-precision
25072 floating point immediate operand X. Note, we cannot represent 0.0. */
/* NOTE(review): several declarations (e.g. of `exponent' and `fail') and
   the early `return false' statements were dropped from this extraction;
   the surviving text below is the decision logic only.  */
25074 aarch64_float_const_representable_p (rtx x
)
25076 /* This represents our current view of how many bits
25077 make up the mantissa. */
25078 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
25080 unsigned HOST_WIDE_INT mantissa
, mask
;
25081 REAL_VALUE_TYPE r
, m
;
/* A duplicated vector constant is judged by its scalar element.  */
25084 x
= unwrap_const_vec_duplicate (x
);
25085 if (!CONST_DOUBLE_P (x
))
25088 if (GET_MODE (x
) == VOIDmode
25089 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
25092 r
= *CONST_DOUBLE_REAL_VALUE (x
);
25094 /* We cannot represent infinities, NaNs or +/-zero. We won't
25095 know if we have +zero until we analyse the mantissa, but we
25096 can reject the other invalid values. */
25097 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
25098 || REAL_VALUE_MINUS_ZERO (r
))
25101 /* For BFmode, only handle 0.0. */
25102 if (GET_MODE (x
) == BFmode
)
25103 return real_iszero (&r
, false);
25105 /* Extract exponent. */
25106 r
= real_value_abs (&r
);
25107 exponent
= REAL_EXP (&r
);
25109 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
25110 highest (sign) bit, with a fixed binary point at bit point_pos.
25111 m1 holds the low part of the mantissa, m2 the high part.
25112 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
25113 bits for the mantissa, this can fail (low bits will be lost). */
25114 real_ldexp (&m
, &r
, point_pos
- exponent
);
25115 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
25117 /* If the low part of the mantissa has bits set we cannot represent
25119 if (w
.ulow () != 0)
25121 /* We have rejected the lower HOST_WIDE_INT, so update our
25122 understanding of how many bits lie in the mantissa and
25123 look only at the high HOST_WIDE_INT. */
25124 mantissa
= w
.elt (1);
25125 point_pos
-= HOST_BITS_PER_WIDE_INT
;
25127 /* We can only represent values with a mantissa of the form 1.xxxx. */
25128 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
25129 if ((mantissa
& mask
) != 0)
25132 /* Having filtered unrepresentable values, we may now remove all
25133 but the highest 5 bits. */
25134 mantissa
>>= point_pos
- 5;
25136 /* We cannot represent the value 0.0, so reject it. This is handled
25141 /* Then, as bit 4 is always set, we can mask it off, leaving
25142 the mantissa in the range [0, 15]. */
25143 mantissa
&= ~(1 << 4);
25144 gcc_assert (mantissa
<= 15);
25146 /* GCC internally does not use IEEE754-like encoding (where normalized
25147 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
25148 Our mantissa values are shifted 4 places to the left relative to
25149 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
25150 by 5 places to correct for GCC's representation. */
25151 exponent
= 5 - exponent
;
/* Representable iff the 3-bit exponent field is in range.  */
25153 return (exponent
>= 0 && exponent
<= 7);
25156 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
25157 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
25158 output MOVI/MVNI, ORR or BIC immediate. */
/* Returns a pointer to a static buffer holding the formatted asm template;
   callers must consume it before the next call.  NOTE(review): the return
   type line and the final `return templ' were dropped in extraction.  */
25160 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
25161 enum simd_immediate_check which
)
25164 static char templ
[40];
25165 const char *mnemonic
;
25166 const char *shift_op
;
25167 unsigned int lane_count
= 0;
25170 struct simd_immediate_info info
;
25172 /* This will return true to show const_vector is legal for use as either
25173 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
25174 It will also update INFO to show how the immediate should be generated.
25175 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
25176 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
25177 gcc_assert (is_valid
);
25179 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
25180 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
/* Floating-point immediates are emitted with FMOV.  */
25182 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
25184 gcc_assert (info
.insn
== simd_immediate_info::MOV
25185 && info
.u
.mov
.shift
== 0);
25186 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25187 move immediate path. */
25188 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
25189 info
.u
.mov
.value
= GEN_INT (0);
25192 const unsigned int buf_size
= 20;
25193 char float_buf
[buf_size
] = {'\0'};
25194 real_to_decimal_for_mode (float_buf
,
25195 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
25196 buf_size
, buf_size
, 1, info
.elt_mode
);
/* lane_count == 1 means a scalar D-register form.  */
25198 if (lane_count
== 1)
25199 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
25201 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
25202 lane_count
, element_char
, float_buf
);
25207 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
25209 if (which
== AARCH64_CHECK_MOV
)
25211 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
25212 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
25214 if (lane_count
== 1)
25215 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
25216 mnemonic
, UINTVAL (info
.u
.mov
.value
));
25217 else if (info
.u
.mov
.shift
)
25218 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
25219 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
25220 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
25223 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
25224 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
25225 element_char
, UINTVAL (info
.u
.mov
.value
));
25229 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25230 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
25231 if (info
.u
.mov
.shift
)
25232 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
25233 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
25234 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
25237 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
25238 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
25239 element_char
, UINTVAL (info
.u
.mov
.value
));
/* Output the asm template for moving IMMEDIATE into a scalar register of
   integer mode MODE, by building a duplicated vector constant and reusing
   the vector immediate-move path.  */
25245 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
25248 /* If a floating point number was passed and we desire to use it in an
25249 integer mode do the conversion to integer. */
25250 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
25252 unsigned HOST_WIDE_INT ival
;
25253 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
25254 gcc_unreachable ();
25255 immediate
= gen_int_mode (ival
, mode
);
25258 machine_mode vmode
;
25259 /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
25260 a 128 bit vector mode. */
25261 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
25263 vmode
= aarch64_simd_container_mode (mode
, width
);
/* Broadcast the scalar into a vector and defer to the vector printer.  */
25264 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
25265 return aarch64_output_simd_mov_immediate (v_op
, width
);
25268 /* Return the output string to use for moving immediate CONST_VECTOR
25269 into an SVE register. */
/* Handles predicate constants (PFALSE/PTRUE), INDEX immediates, FP
   immediates (FMOV) and plain integer immediates (MOV).  Returns a static
   buffer.  NOTE(review): return type, `return' statements and some braces
   were dropped from this extraction.  */
25272 aarch64_output_sve_mov_immediate (rtx const_vector
)
25274 static char templ
[40];
25275 struct simd_immediate_info info
;
25278 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
25279 gcc_assert (is_valid
);
25281 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
/* SVE predicate constants: either all-false or a PTRUE pattern.  */
25283 machine_mode vec_mode
= GET_MODE (const_vector
);
25284 if (aarch64_sve_pred_mode_p (vec_mode
))
25286 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
25287 if (info
.insn
== simd_immediate_info::MOV
)
25289 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
25290 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
25294 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
25295 unsigned int total_bytes
;
25296 if (info
.u
.pattern
== AARCH64_SV_ALL
25297 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
25298 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
25299 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
))
;
25301 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
25302 svpattern_token (info
.u
.pattern
));
/* Linear series constants map onto the INDEX instruction.  */
25307 if (info
.insn
== simd_immediate_info::INDEX
)
25309 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
25310 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
25311 element_char
, INTVAL (info
.u
.index
.base
),
25312 INTVAL (info
.u
.index
.step
));
25316 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
25318 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
25319 info
.u
.mov
.value
= GEN_INT (0);
25322 const int buf_size
= 20;
25323 char float_buf
[buf_size
] = {};
25324 real_to_decimal_for_mode (float_buf
,
25325 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
25326 buf_size
, buf_size
, 1, info
.elt_mode
);
25328 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
25329 element_char
, float_buf
);
/* Fallback: integer immediate MOV.  */
25334 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
25335 element_char
, INTVAL (info
.u
.mov
.value
));
25339 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25340 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
/* Formats "ptrues\t%0.<size>, <pattern>" into a static buffer.  */
25344 aarch64_output_sve_ptrues (rtx const_unspec
)
25346 static char templ
[40];
25348 struct simd_immediate_info info
;
25349 bool is_valid
= aarch64_simd_valid_immediate (const_unspec
, &info
);
25350 gcc_assert (is_valid
&& info
.insn
== simd_immediate_info::PTRUE
);
25352 char element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
25353 snprintf (templ
, sizeof (templ
), "ptrues\t%%0.%c, %s", element_char
,
25354 svpattern_token (info
.u
.pattern
));
25358 /* Split operands into moves from op[1] + op[2] into op[0]. */
/* op[0] is a double-width (2xV16QI) destination; op[1]/op[2] are its low
   and high V16QI halves.  Emits the moves in an order that copes with
   overlap between sources and destination halves.  */
25361 aarch64_split_combinev16qi (rtx operands
[3])
25363 machine_mode halfmode
= GET_MODE (operands
[1]);
25365 gcc_assert (halfmode
== V16QImode
);
25367 rtx destlo
= simplify_gen_subreg (halfmode
, operands
[0],
25368 GET_MODE (operands
[0]), 0);
25369 rtx desthi
= simplify_gen_subreg (halfmode
, operands
[0],
25370 GET_MODE (operands
[0]),
25371 GET_MODE_SIZE (halfmode
));
25373 bool skiplo
= rtx_equal_p (destlo
, operands
[1]);
25374 bool skiphi
= rtx_equal_p (desthi
, operands
[2]);
25376 if (skiplo
&& skiphi
)
25378 /* No-op move. Can't split to nothing; emit something. */
25379 emit_note (NOTE_INSN_DELETED
);
25383 /* Special case of reversed high/low parts. */
25384 if (reg_overlap_mentioned_p (operands
[2], destlo
)
25385 && reg_overlap_mentioned_p (operands
[1], desthi
))
/* Classic three-XOR register swap, avoiding a scratch register.  */
25387 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
25388 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
25389 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
25391 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
25393 /* Try to avoid unnecessary moves if part of the result
25394 is in the right place already. */
25396 emit_move_insn (destlo
, operands
[1]);
25398 emit_move_insn (desthi
, operands
[2]);
/* Otherwise move the high half first so the low source survives.  */
25403 emit_move_insn (desthi
, operands
[2]);
25405 emit_move_insn (destlo
, operands
[1]);
25409 /* vec_perm support. */
/* Bundle of state passed between the vec_perm_const expanders below:
   the target/operand rtxes, the permutation indices, the result and
   operand modes with their vector-classification flags, and whether
   either operand is a known zero vector.  */
25411 struct expand_vec_perm_d
25413 rtx target
, op0
, op1
;
25414 vec_perm_indices perm
;
25415 machine_mode vmode
;
25416 machine_mode op_mode
;
25417 unsigned int vec_flags
;
25418 unsigned int op_vec_flags
;
25420 bool zero_op0_p
, zero_op1_p
;
/* Forward declaration: the driver that tries each aarch64_evpc_* pattern.  */
25424 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
);
25426 /* Generate a variable permutation. */
/* Emit a TBL-based permute of OP0/OP1 into TARGET using byte selector SEL.
   All operands must share the same V8QI or V16QI mode.  One-operand
   permutes use TBL with a single table register; two-operand permutes
   build a register pair.  */
25429 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
25431 machine_mode vmode
= GET_MODE (target
);
25432 bool one_vector_p
= rtx_equal_p (op0
, op1
);
25434 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
25435 gcc_checking_assert (GET_MODE (op0
) == vmode
);
25436 gcc_checking_assert (GET_MODE (op1
) == vmode
);
25437 gcc_checking_assert (GET_MODE (sel
) == vmode
);
25438 gcc_checking_assert (TARGET_SIMD
);
/* Single-source case.  */
25442 if (vmode
== V8QImode
)
25444 /* Expand the argument to a V16QI mode by duplicating it. */
25445 rtx pair
= gen_reg_rtx (V16QImode
);
25446 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
25447 emit_insn (gen_aarch64_qtbl1v8qi (target
, pair
, sel
));
25451 emit_insn (gen_aarch64_qtbl1v16qi (target
, op0
, sel
));
/* Two-source case: combine OP0 and OP1 into one table.  */
25458 if (vmode
== V8QImode
)
25460 pair
= gen_reg_rtx (V16QImode
);
25461 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
25462 emit_insn (gen_aarch64_qtbl1v8qi (target
, pair
, sel
));
25466 pair
= gen_reg_rtx (V2x16QImode
);
25467 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
25468 emit_insn (gen_aarch64_qtbl2v16qi (target
, pair
, sel
));
25473 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25474 NELT is the number of elements in the vector. */
/* Masks SEL so indices wrap (TBL does not take indices modulo) and fixes
   up per-word lane order on big-endian, then defers to
   aarch64_expand_vec_perm_1.  */
25477 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
25480 machine_mode vmode
= GET_MODE (target
);
25481 bool one_vector_p
= rtx_equal_p (op0
, op1
);
25484 /* The TBL instruction does not use a modulo index, so we must take care
25485 of that ourselves. */
25486 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
25487 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
25488 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
25490 /* For big-endian, we also need to reverse the index within the vector
25491 (but not which vector). */
25492 if (BYTES_BIG_ENDIAN
)
25494 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25496 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
/* XOR with (nelt - 1) reverses the index within each vector.  */
25497 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
25498 NULL
, 0, OPTAB_LIB_WIDEN
);
25500 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
25503 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
/* Small helper used by the SVE permute expanders below; the unspec takes
   its mode from TARGET.  */
25506 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
25508 emit_insn (gen_rtx_SET (target
,
25509 gen_rtx_UNSPEC (GET_MODE (target
),
25510 gen_rtvec (2, op0
, op1
), code
)));
25513 /* Expand an SVE vec_perm with the given operands. */
/* Lowers a two-input permute to one or two SVE TBL instructions, masking
   the selector so out-of-range indices wrap as vec_perm requires (SVE TBL
   itself yields 0 for out-of-range indices).  */
25516 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
25518 machine_mode data_mode
= GET_MODE (target
);
25519 machine_mode sel_mode
= GET_MODE (sel
);
25520 /* Enforced by the pattern condition. */
25521 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
25523 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25524 size of the two value vectors, i.e. the upper bits of the indices
25525 are effectively ignored. SVE TBL instead produces 0 for any
25526 out-of-range indices, so we need to modulo all the vec_perm indices
25527 to ensure they are all in range. */
25528 rtx sel_reg
= force_reg (sel_mode
, sel
);
25530 /* Check if the sel only references the first values vector. */
25531 if (CONST_VECTOR_P (sel
)
25532 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
25534 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
25538 /* Check if the two values vectors are the same. */
25539 if (rtx_equal_p (op0
, op1
))
25541 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
25542 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
25543 NULL
, 0, OPTAB_DIRECT
);
25544 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
25548 /* Run TBL on for each value vector and combine the results. */
25550 rtx res0
= gen_reg_rtx (data_mode
);
25551 rtx res1
= gen_reg_rtx (data_mode
);
25552 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
25553 if (!CONST_VECTOR_P (sel
)
25554 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
25556 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
25558 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
25559 NULL
, 0, OPTAB_DIRECT
);
/* First TBL picks from OP0; second picks from OP1 using SEL - nunits.  */
25561 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
25562 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
25563 NULL
, 0, OPTAB_DIRECT
);
25564 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
/* Combine: integer vectors can use IOR directly; float vectors go
   through an unspec so the FP mode stays valid.  */
25565 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
25566 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
25568 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
25571 /* Recognize patterns suitable for the TRN instructions. */
/* Returns true and emits a TRN1/TRN2 when D's indices interleave the
   matching odd/even lanes of the two inputs.  NOTE(review): declarations
   of `odd', `out', `in0', `in1' and the `return' statements were dropped
   from this extraction.  */
25573 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
25576 poly_uint64 nelt
= d
->perm
.length ();
25578 machine_mode vmode
= d
->vmode
;
25580 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25583 /* Note that these are little-endian tests.
25584 We correct for big-endian later. */
25585 if (!d
->perm
[0].is_constant (&odd
)
25586 || (odd
!= 0 && odd
!= 1)
25587 || !d
->perm
.series_p (0, 2, odd
, 2)
25588 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
25597 /* We don't need a big-endian lane correction for SVE; see the comment
25598 at the head of aarch64-sve.md for details. */
25599 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25601 std::swap (in0
, in1
);
25606 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25607 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
25611 /* Try to re-encode the PERM constant so it combines odd and even elements.
25612 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25613 We retry with this new constant with the full suite of patterns. */
/* Only applies to Advanced SIMD permutes; builds a new expand_vec_perm_d
   in the widened mode and recurses into the full pattern driver.  */
25615 aarch64_evpc_reencode (struct expand_vec_perm_d
*d
)
25617 expand_vec_perm_d newd
;
25619 if (d
->vec_flags
!= VEC_ADVSIMD
)
25622 /* Get the new mode. Always twice the size of the inner
25623 and half the elements. */
25624 poly_uint64 vec_bits
= GET_MODE_BITSIZE (d
->vmode
);
25625 unsigned int new_elt_bits
= GET_MODE_UNIT_BITSIZE (d
->vmode
) * 2;
25626 auto new_elt_mode
= int_mode_for_size (new_elt_bits
, false).require ();
25627 machine_mode new_mode
= aarch64_simd_container_mode (new_elt_mode
, vec_bits
);
/* Bail out if widening left us without a genuine vector mode.  */
25629 if (new_mode
== word_mode
)
25632 vec_perm_indices newpermindices
;
/* Fails if the indices cannot be expressed pairwise in the wide mode.  */
25634 if (!newpermindices
.new_shrunk_vector (d
->perm
, 2))
25637 newd
.vmode
= new_mode
;
25638 newd
.vec_flags
= VEC_ADVSIMD
;
25639 newd
.op_mode
= newd
.vmode
;
25640 newd
.op_vec_flags
= newd
.vec_flags
;
/* Operands may be null when only testing feasibility.  */
25641 newd
.target
= d
->target
? gen_lowpart (new_mode
, d
->target
) : NULL
;
25642 newd
.op0
= d
->op0
? gen_lowpart (new_mode
, d
->op0
) : NULL
;
25643 newd
.op1
= d
->op1
? gen_lowpart (new_mode
, d
->op1
) : NULL
;
25644 newd
.testing_p
= d
->testing_p
;
25645 newd
.one_vector_p
= d
->one_vector_p
;
25647 newd
.perm
.new_vector (newpermindices
.encoding (), newd
.one_vector_p
? 1 : 2,
25648 newpermindices
.nelts_per_input ());
25649 return aarch64_expand_vec_perm_const_1 (&newd
);
25652 /* Recognize patterns suitable for the UZP instructions. */
/* Returns true and emits a UZP1/UZP2 when D selects every other element
   (all even or all odd lanes) across the concatenated inputs.
   NOTE(review): declarations of `odd', `out', `in0', `in1' and the
   `return' statements were dropped from this extraction.  */
25654 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
25658 machine_mode vmode
= d
->vmode
;
25660 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25663 /* Note that these are little-endian tests.
25664 We correct for big-endian later. */
25665 if (!d
->perm
[0].is_constant (&odd
)
25666 || (odd
!= 0 && odd
!= 1)
25667 || !d
->perm
.series_p (0, 1, odd
, 2))
25676 /* We don't need a big-endian lane correction for SVE; see the comment
25677 at the head of aarch64-sve.md for details. */
25678 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25680 std::swap (in0
, in1
);
25685 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25686 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
25690 /* Recognize patterns suitable for the ZIP instructions. */
/* Returns true and emits a ZIP1/ZIP2 when D interleaves the low (or high)
   halves of the two inputs.  NOTE(review): declarations of `high', `out',
   `in0', `in1' and the `return' statements were dropped from this
   extraction.  */
25692 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
25695 poly_uint64 nelt
= d
->perm
.length ();
25697 machine_mode vmode
= d
->vmode
;
25699 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25702 /* Note that these are little-endian tests.
25703 We correct for big-endian later. */
25704 poly_uint64 first
= d
->perm
[0];
/* first == 0 selects the low halves (ZIP1); first == nelt/2 the high
   halves (ZIP2).  */
25705 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
25706 || !d
->perm
.series_p (0, 2, first
, 1)
25707 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
25709 high
= maybe_ne (first
, 0U);
25717 /* We don't need a big-endian lane correction for SVE; see the comment
25718 at the head of aarch64-sve.md for details. */
25719 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25721 std::swap (in0
, in1
);
25726 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25727 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
25731 /* Recognize patterns for the EXT insn. */
/* Returns true and emits an EXT when D extracts a contiguous run of
   elements starting at a constant offset into the concatenation of the
   two inputs.  NOTE(review): the `offset' declaration, the UNSPEC_EXT
   code argument and the `return' statements were dropped from this
   extraction.  */
25734 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
25736 HOST_WIDE_INT location
;
25739 /* The first element always refers to the first vector.
25740 Check if the extracted indices are increasing by one. */
25741 if ((d
->vec_flags
& VEC_SVE_PRED
)
25742 || !d
->perm
[0].is_constant (&location
)
25743 || !d
->perm
.series_p (0, 1, location
, 1))
25750 /* The case where (location == 0) is a no-op for both big- and little-endian,
25751 and is removed by the mid-end at optimization levels -O1 and higher.
25753 We don't need a big-endian lane correction for SVE; see the comment
25754 at the head of aarch64-sve.md for details. */
25755 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
25757 /* After setup, we want the high elements of the first vector (stored
25758 at the LSB end of the register), and the low elements of the second
25759 vector (stored at the MSB end of the register). So swap. */
25760 std::swap (d
->op0
, d
->op1
);
25761 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25762 to_constant () is safe since this is restricted to Advanced SIMD
25764 location
= d
->perm
.length ().to_constant () - location
;
25767 offset
= GEN_INT (location
);
25768 emit_set_insn (d
->target
,
25769 gen_rtx_UNSPEC (d
->vmode
,
25770 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
25775 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25776 within each 64-bit, 32-bit or 16-bit granule. */
/* Single-input only.  The granule size is inferred from the first index;
   SVE data vectors use the predicated REVB/REVH/REVW path, Advanced SIMD
   uses a plain REV unspec.  NOTE(review): the size==64 test line and the
   `return' statements were dropped from this extraction.  */
25779 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
25781 HOST_WIDE_INT diff
;
25782 unsigned int i
, size
, unspec
;
25783 machine_mode pred_mode
;
25785 if ((d
->vec_flags
& VEC_SVE_PRED
)
25786 || !d
->one_vector_p
25787 || !d
->perm
[0].is_constant (&diff
)
25791 if (d
->vec_flags
& VEC_SVE_DATA
)
25792 size
= (diff
+ 1) * aarch64_sve_container_bits (d
->vmode
)
;
25794 size
= (diff
+ 1) * GET_MODE_UNIT_BITSIZE (d
->vmode
);
/* Choose the unspec and (for SVE) the governing predicate mode from the
   granule size.  */
25797 unspec
= UNSPEC_REV64
;
25798 pred_mode
= VNx2BImode
;
25800 else if (size
== 32)
25802 unspec
= UNSPEC_REV32
;
25803 pred_mode
= VNx4BImode
;
25805 else if (size
== 16)
25807 unspec
= UNSPEC_REV16
;
25808 pred_mode
= VNx8BImode
;
/* Verify the whole permutation really reverses each granule.  */
25813 unsigned int step
= diff
+ 1;
25814 for (i
= 0; i
< step
; ++i
)
25815 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
25822 if (d
->vec_flags
& VEC_SVE_DATA
)
25824 rtx pred
= aarch64_ptrue_reg (pred_mode
);
25825 emit_insn (gen_aarch64_sve_revbhw (d
->vmode
, pred_mode
,
25826 d
->target
, pred
, d
->op0
));
25829 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
25830 emit_set_insn (d
->target
, src
);
25834 /* Recognize patterns for the REV insn, which reverses elements within
/* Whole-vector reversal of a single input.  Excluded for Advanced SIMD
   here (UNSPEC_REV is the SVE form).  NOTE(review): the `return'
   statements were dropped from this extraction.  */
25838 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
25840 poly_uint64 nelt
= d
->perm
.length ();
25842 if (!d
->one_vector_p
|| d
->vec_flags
== VEC_ADVSIMD
)
/* Indices must count down from nelt-1 to 0.  */
25845 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
25852 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
25853 emit_set_insn (d
->target
, src
);
/* Recognize a broadcast of one lane: every index equal to a single
   constant ELT.  Emits a vec_duplicate of the selected lane.
   NOTE(review): declarations of `elt', `lane', `in0' and the `return'
   statements were dropped from this extraction.  */
25858 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
25860 rtx out
= d
->target
;
25863 machine_mode vmode
= d
->vmode
;
25866 if ((d
->vec_flags
& VEC_SVE_PRED
)
25867 || d
->perm
.encoding ().encoded_nelts () != 1
25868 || !d
->perm
[0].is_constant (&elt
))
/* SVE DUP by lane can only address lanes within the low 64 bytes.  */
25871 if ((d
->vec_flags
& VEC_SVE_DATA
)
25872 && elt
* (aarch64_sve_container_bits (vmode
) / 8) >= 64)
25879 /* The generic preparation in aarch64_expand_vec_perm_const_1
25880 swaps the operand order and the permute indices if it finds
25881 d->perm[0] to be in the second operand. Thus, we can always
25882 use d->op0 and need not do any extra arithmetic to get the
25883 correct lane number. */
25885 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
25887 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
25888 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
25889 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
/* Fallback for Advanced SIMD: materialize the permutation as a constant
   byte selector and emit a TBL via aarch64_expand_vec_perm_1.  Zero
   operands are handled by remapping their indices to -1 (TBL yields 0 for
   out-of-range indices).  NOTE(review): the zero_op0_p branch, testing_p
   early-out and `return' statements were dropped from this extraction.  */
25894 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
25896 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
25897 machine_mode vmode
= d
->vmode
;
25899 /* Make sure that the indices are constant. */
25900 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
25901 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
25902 if (!d
->perm
[i
].is_constant ())
25908 /* Generic code will try constant permutation twice. Once with the
25909 original mode and again with the elements lowered to QImode.
25910 So wait and don't do the selector expansion ourselves. */
25911 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
25914 /* to_constant is safe since this routine is specific to Advanced SIMD
25916 unsigned int nelt
= d
->perm
.length ().to_constant ();
25918 /* If one register is the constant vector of 0 then we only need
25919 a one reg TBL and we map any accesses to the vector of 0 to -1. We can't
25920 do this earlier since vec_perm_indices clamps elements to within range so
25921 we can only do it during codegen. */
25924 else if (d
->zero_op1_p
)
25927 for (unsigned int i
= 0; i
< nelt
; ++i
)
25929 auto val
= d
->perm
[i
].to_constant ();
25931 /* If we're selecting from a 0 vector, we can just use an out of range
25933 if ((d
->zero_op0_p
&& val
< nelt
) || (d
->zero_op1_p
&& val
>= nelt
))
25934 rperm
[i
] = constm1_rtx
;
25937 /* If we are remapping a zero register as the first parameter we need
25938 to adjust the indices of the non-zero register. */
25942 /* If big-endian and two vectors we end up with a weird mixed-endian
25943 mode on NEON. Reverse the index within each word but not the word
25944 itself. to_constant is safe because we checked is_constant
25946 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? val
^ (nelt
- 1) : val
);
25950 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
25951 sel
= force_reg (vmode
, sel
);
25953 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
25957 /* Try to implement D using an SVE TBL instruction. */
/* Builds the selector vector from D's indices; a single-input permute uses
   one TBL directly, otherwise defers to aarch64_expand_sve_vec_perm.
   NOTE(review): the testing_p early-out and `return' statements were
   dropped from this extraction.  */
25960 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
25962 unsigned HOST_WIDE_INT nelt
;
25964 /* Permuting two variable-length vectors could overflow the
25966 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
25972 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
25973 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
25974 if (d
->one_vector_p
)
25975 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
25977 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
25981 /* Try to implement D using SVE dup instruction. */
/* Matches a broadcast of a whole 128-bit Advanced SIMD operand across an
   SVE vector (DUPQ): identity permutation repeated per quadword.
   NOTE(review): the `return' statements were dropped from this
   extraction.  */
25984 aarch64_evpc_sve_dup (struct expand_vec_perm_d
*d
)
25986 if (BYTES_BIG_ENDIAN
25987 || !d
->one_vector_p
25988 || d
->vec_flags
!= VEC_SVE_DATA
25989 || d
->op_vec_flags
!= VEC_ADVSIMD
25990 || d
->perm
.encoding ().nelts_per_pattern () != 1
25991 || !known_eq (d
->perm
.encoding ().npatterns (),
25992 GET_MODE_NUNITS (d
->op_mode
))
25993 || !known_eq (GET_MODE_BITSIZE (d
->op_mode
), 128))
/* Every pattern must be the identity (index i selects element i).  */
25996 int npatterns
= d
->perm
.encoding ().npatterns ();
25997 for (int i
= 0; i
< npatterns
; i
++)
25998 if (!known_eq (d
->perm
[i
], i
))
26004 aarch64_expand_sve_dupq (d
->target
, GET_MODE (d
->target
), d
->op0
);
26008 /* Try to implement D using SVE SEL instruction. */
/* Matches permutes where each lane takes either the same-numbered lane of
   OP0 or of OP1; such a blend becomes a predicated SEL.  NOTE(review):
   additional guard conditions, the testing_p early-out and `return'
   statements were dropped from this extraction.  */
26011 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
26013 machine_mode vmode
= d
->vmode
;
26014 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
26016 if (d
->vec_flags
!= VEC_SVE_DATA
26020 int n_patterns
= d
->perm
.encoding ().npatterns ();
26021 poly_int64 vec_len
= d
->perm
.length ();
/* Each leading pattern element must pick lane i from OP0 or OP1.  */
26023 for (int i
= 0; i
< n_patterns
; ++i
)
26024 if (!known_eq (d
->perm
[i
], i
)
26025 && !known_eq (d
->perm
[i
], vec_len
+ i
))
/* And the choice must repeat consistently across the whole vector.  */
26028 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
26029 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
26030 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
26036 machine_mode pred_mode
= aarch64_sve_pred_mode (vmode
);
26038 /* Build a predicate that is true when op0 elements should be used. */
26039 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
26040 for (int i
= 0; i
< n_patterns
* 2; i
++)
26042 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
26043 : CONST0_RTX (BImode
);
26044 builder
.quick_push (elem
);
26047 rtx const_vec
= builder
.build ();
26048 rtx pred
= force_reg (pred_mode
, const_vec
);
26049 /* TARGET = PRED ? OP0 : OP1. */
26050 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op0
, d
->op1
, pred
));
26054 /* Recognize patterns suitable for the INS instructions. */
/* Matches a permutation that is the identity except for exactly one lane,
   which is copied from some lane of the other (or same) operand; emitted
   as a single INS (vec_copy_lane).  NOTE(review): the loops' bodies that
   set `idx'/`insv', several early `return false' paths and the final
   `return true' were dropped from this extraction.  */
26056 aarch64_evpc_ins (struct expand_vec_perm_d
*d
)
26058 machine_mode mode
= d
->vmode
;
26059 unsigned HOST_WIDE_INT nelt
;
26061 if (d
->vec_flags
!= VEC_ADVSIMD
)
26064 /* to_constant is safe since this routine is specific to Advanced SIMD
26066 nelt
= d
->perm
.length ().to_constant ();
/* idx is the lane (if any) that deviates from the identity mapping.  */
26069 HOST_WIDE_INT idx
= -1;
26071 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
26074 if (!d
->perm
[i
].is_constant (&elt
))
26076 if (elt
== (HOST_WIDE_INT
) i
)
/* Second scan: try treating OP1 as the identity source instead.  */
26089 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
26091 if (d
->perm
[i
].to_constant () == (HOST_WIDE_INT
) (i
+ nelt
))
26105 gcc_assert (idx
!= -1);
26107 unsigned extractindex
= d
->perm
[idx
].to_constant ();
26108 rtx extractv
= d
->op0
;
26109 if (extractindex
>= nelt
)
26112 extractindex
-= nelt
;
26114 gcc_assert (extractindex
< nelt
);
26116 insn_code icode
= code_for_aarch64_simd_vec_copy_lane (mode
);
26117 expand_operand ops
[5];
26118 create_output_operand (&ops
[0], d
->target
, mode
);
26119 create_input_operand (&ops
[1], insv
, mode
);
26120 create_integer_operand (&ops
[2], 1 << idx
);
26121 create_input_operand (&ops
[3], extractv
, mode
);
26122 create_integer_operand (&ops
[4], extractindex
);
26123 expand_insn (icode
, 5, ops
);
/* Driver for constant vec_perm expansion: canonicalize D so the first
   index refers to op0, then try each aarch64_evpc_* recognizer in order
   from cheapest to most general, finally falling back to TBL.
   NOTE(review): the `return true' arms of the if/else chain and the final
   `return false' were dropped from this extraction.  */
26129 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
26131 gcc_assert (d
->op_mode
!= E_VOIDmode
);
26133 /* The pattern matching functions above are written to look for a small
26134 number to begin the sequence (0, 1, N/2). If we begin with an index
26135 from the second operand, we can swap the operands. */
26136 poly_int64 nelt
= d
->perm
.length ();
26137 if (known_ge (d
->perm
[0], nelt
))
26139 d
->perm
.rotate_inputs (1);
26140 std::swap (d
->op0
, d
->op1
);
26143 if (((d
->vec_flags
== VEC_ADVSIMD
&& TARGET_SIMD
)
26144 || d
->vec_flags
== VEC_SVE_DATA
26145 || d
->vec_flags
== (VEC_SVE_DATA
| VEC_PARTIAL
)
26146 || d
->vec_flags
== VEC_SVE_PRED
)
26147 && known_gt (nelt
, 1))
/* Same-mode permutes: try each specialized pattern in turn.  */
26149 if (d
->vmode
== d
->op_mode
)
26151 if (aarch64_evpc_rev_local (d
))
26153 else if (aarch64_evpc_rev_global (d
))
26155 else if (aarch64_evpc_ext (d
))
26157 else if (aarch64_evpc_dup (d
))
26159 else if (aarch64_evpc_zip (d
))
26161 else if (aarch64_evpc_uzp (d
))
26163 else if (aarch64_evpc_trn (d
))
26165 else if (aarch64_evpc_sel (d
))
26167 else if (aarch64_evpc_ins (d
))
26169 else if (aarch64_evpc_reencode (d
))
/* Nothing matched: fall back to a table lookup.  */
26172 if (d
->vec_flags
== VEC_SVE_DATA
)
26173 return aarch64_evpc_sve_tbl (d
);
26174 else if (d
->vec_flags
== VEC_ADVSIMD
)
26175 return aarch64_evpc_tbl (d
);
/* Cross-mode case (vmode != op_mode): only the SVE DUPQ pattern applies.  */
26179 if (aarch64_evpc_sve_dup (d
))
26186 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26189 aarch64_vectorize_vec_perm_const (machine_mode vmode
, machine_mode op_mode
,
26190 rtx target
, rtx op0
, rtx op1
,
26191 const vec_perm_indices
&sel
)
26193 struct expand_vec_perm_d d
;
26195 /* Check whether the mask can be applied to a single vector. */
26196 if (sel
.ninputs () == 1
26197 || (op0
&& rtx_equal_p (op0
, op1
)))
26198 d
.one_vector_p
= true;
26199 else if (sel
.all_from_input_p (0))
26201 d
.one_vector_p
= true;
26204 else if (sel
.all_from_input_p (1))
26206 d
.one_vector_p
= true;
26210 d
.one_vector_p
= false;
26212 d
.zero_op0_p
= op0
== CONST0_RTX (op_mode
);
26213 d
.zero_op1_p
= op1
== CONST0_RTX (op_mode
);
26214 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
26215 sel
.nelts_per_input ());
26217 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
26218 d
.op_mode
= op_mode
;
26219 d
.op_vec_flags
= aarch64_classify_vector_mode (d
.op_mode
);
26221 d
.op0
= op0
? force_reg (op_mode
, op0
) : NULL_RTX
;
26225 d
.op1
= op1
? force_reg (op_mode
, op1
) : NULL_RTX
;
26226 d
.testing_p
= !target
;
26229 return aarch64_expand_vec_perm_const_1 (&d
);
26231 rtx_insn
*last
= get_last_insn ();
26232 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
26233 gcc_assert (last
== get_last_insn ());
26237 /* Generate a byte permute mask for a register of mode MODE,
26238 which has NUNITS units. */
26241 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
26243 /* We have to reverse each vector because we dont have
26244 a permuted load that can reverse-load according to ABI rules. */
26246 rtvec v
= rtvec_alloc (16);
26248 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
26250 gcc_assert (BYTES_BIG_ENDIAN
);
26251 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
26253 for (i
= 0; i
< nunits
; i
++)
26254 for (j
= 0; j
< usize
; j
++)
26255 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
26256 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
26257 return force_reg (V16QImode
, mask
);
26260 /* Expand an SVE integer comparison using the SVE equivalent of:
26262 (set TARGET (CODE OP0 OP1)). */
26265 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
26267 machine_mode pred_mode
= GET_MODE (target
);
26268 machine_mode data_mode
= GET_MODE (op0
);
26269 rtx res
= aarch64_sve_emit_int_cmp (target
, pred_mode
, code
, data_mode
,
26271 if (!rtx_equal_p (target
, res
))
26272 emit_move_insn (target
, res
);
26275 /* Return the UNSPEC_COND_* code for comparison CODE. */
26277 static unsigned int
26278 aarch64_unspec_cond_code (rtx_code code
)
26283 return UNSPEC_COND_FCMNE
;
26285 return UNSPEC_COND_FCMEQ
;
26287 return UNSPEC_COND_FCMLT
;
26289 return UNSPEC_COND_FCMGT
;
26291 return UNSPEC_COND_FCMLE
;
26293 return UNSPEC_COND_FCMGE
;
26295 return UNSPEC_COND_FCMUO
;
26297 gcc_unreachable ();
26303 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26305 where <X> is the operation associated with comparison CODE.
26306 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26309 aarch64_emit_sve_fp_cond (rtx target
, rtx_code code
, rtx pred
,
26310 bool known_ptrue_p
, rtx op0
, rtx op1
)
26312 rtx flag
= gen_int_mode (known_ptrue_p
, SImode
);
26313 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
26314 gen_rtvec (4, pred
, flag
, op0
, op1
),
26315 aarch64_unspec_cond_code (code
));
26316 emit_set_insn (target
, unspec
);
26319 /* Emit the SVE equivalent of:
26321 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26322 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26323 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26325 where <Xi> is the operation associated with comparison CODEi.
26326 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26329 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
26330 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
26332 machine_mode pred_mode
= GET_MODE (pred
);
26333 rtx tmp1
= gen_reg_rtx (pred_mode
);
26334 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
26335 rtx tmp2
= gen_reg_rtx (pred_mode
);
26336 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
26337 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
26340 /* Emit the SVE equivalent of:
26342 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26343 (set TARGET (not TMP))
26345 where <X> is the operation associated with comparison CODE.
26346 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26349 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
26350 bool known_ptrue_p
, rtx op0
, rtx op1
)
26352 machine_mode pred_mode
= GET_MODE (pred
);
26353 rtx tmp
= gen_reg_rtx (pred_mode
);
26354 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
26355 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
26358 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26360 (set TARGET (CODE OP0 OP1))
26362 If CAN_INVERT_P is true, the caller can also handle inverted results;
26363 return true if the result is in fact inverted. */
26366 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
26367 rtx op0
, rtx op1
, bool can_invert_p
)
26369 machine_mode pred_mode
= GET_MODE (target
);
26370 machine_mode data_mode
= GET_MODE (op0
);
26372 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
26376 /* UNORDERED has no immediate form. */
26377 op1
= force_reg (data_mode
, op1
);
26386 /* There is native support for the comparison. */
26387 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26392 /* This is a trapping operation (LT or GT). */
26393 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
26397 if (!flag_trapping_math
)
26399 /* This would trap for signaling NaNs. */
26400 op1
= force_reg (data_mode
, op1
);
26401 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
26402 ptrue
, true, op0
, op1
);
26410 if (flag_trapping_math
)
26412 /* Work out which elements are ordered. */
26413 rtx ordered
= gen_reg_rtx (pred_mode
);
26414 op1
= force_reg (data_mode
, op1
);
26415 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
26416 ptrue
, true, op0
, op1
);
26418 /* Test the opposite condition for the ordered elements,
26419 then invert the result. */
26423 code
= reverse_condition_maybe_unordered (code
);
26426 aarch64_emit_sve_fp_cond (target
, code
,
26427 ordered
, false, op0
, op1
);
26430 aarch64_emit_sve_invert_fp_cond (target
, code
,
26431 ordered
, false, op0
, op1
);
26437 /* ORDERED has no immediate form. */
26438 op1
= force_reg (data_mode
, op1
);
26442 gcc_unreachable ();
26445 /* There is native support for the inverse comparison. */
26446 code
= reverse_condition_maybe_unordered (code
);
26449 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26452 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26456 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26457 of the data being selected and CMP_MODE is the mode of the values being
26461 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
26464 machine_mode pred_mode
= aarch64_get_mask_mode (cmp_mode
).require ();
26465 rtx pred
= gen_reg_rtx (pred_mode
);
26466 if (FLOAT_MODE_P (cmp_mode
))
26468 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
26469 ops
[4], ops
[5], true))
26470 std::swap (ops
[1], ops
[2]);
26473 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
26475 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
26476 ops
[1] = force_reg (data_mode
, ops
[1]);
26477 /* The "false" value can only be zero if the "true" value is a constant. */
26478 if (register_operand (ops
[1], data_mode
)
26479 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
26480 ops
[2] = force_reg (data_mode
, ops
[2]);
26482 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
26483 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
26488 (a) MODE1 and MODE2 use the same layout for bytes that are common
26491 (b) subregs involving the two modes behave as the target-independent
26492 subreg rules require; and
26494 (c) there is at least one register that can hold both modes.
26496 Return false otherwise. */
26499 aarch64_modes_compatible_p (machine_mode mode1
, machine_mode mode2
)
26501 unsigned int flags1
= aarch64_classify_vector_mode (mode1
);
26502 unsigned int flags2
= aarch64_classify_vector_mode (mode2
);
26504 bool sve1_p
= (flags1
& VEC_ANY_SVE
);
26505 bool sve2_p
= (flags2
& VEC_ANY_SVE
);
26507 bool partial_sve1_p
= sve1_p
&& (flags1
& VEC_PARTIAL
);
26508 bool partial_sve2_p
= sve2_p
&& (flags2
& VEC_PARTIAL
);
26510 bool pred1_p
= (flags1
& VEC_SVE_PRED
);
26511 bool pred2_p
= (flags2
& VEC_SVE_PRED
);
26513 bool partial_advsimd_struct1_p
= (flags1
== (VEC_ADVSIMD
| VEC_STRUCT
26515 bool partial_advsimd_struct2_p
= (flags2
== (VEC_ADVSIMD
| VEC_STRUCT
26518 /* Don't allow changes between predicate modes and other modes.
26519 Only predicate registers can hold predicate modes and only
26520 non-predicate registers can hold non-predicate modes, so any
26521 attempt to mix them would require a round trip through memory. */
26522 if (pred1_p
!= pred2_p
)
26525 /* The contents of partial SVE modes are distributed evenly across
26526 the register, whereas GCC expects them to be clustered together.
26527 We therefore need to be careful about mode changes involving them. */
26528 if (partial_sve1_p
&& partial_sve2_p
)
26530 /* Reject changes between partial SVE modes that have different
26531 patterns of significant and insignificant bits. */
26532 if ((aarch64_sve_container_bits (mode1
)
26533 != aarch64_sve_container_bits (mode2
))
26534 || GET_MODE_UNIT_SIZE (mode1
) != GET_MODE_UNIT_SIZE (mode2
))
26537 else if (partial_sve1_p
)
26539 /* The first lane of MODE1 is where GCC expects it, but anything
26540 bigger than that is not. */
26541 if (maybe_gt (GET_MODE_SIZE (mode2
), GET_MODE_UNIT_SIZE (mode1
)))
26544 else if (partial_sve2_p
)
26546 /* Similarly in reverse. */
26547 if (maybe_gt (GET_MODE_SIZE (mode1
), GET_MODE_UNIT_SIZE (mode2
)))
26551 /* Don't allow changes between partial Advanced SIMD structure modes
26552 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26553 are the same size, but the former occupies one Q register while the
26554 latter occupies two D registers. */
26555 if (partial_advsimd_struct1_p
!= partial_advsimd_struct2_p
26556 && maybe_gt (GET_MODE_SIZE (mode1
), 8)
26557 && maybe_gt (GET_MODE_SIZE (mode2
), 8))
26560 if (maybe_ne (BITS_PER_SVE_VECTOR
, 128u))
26562 /* Don't allow changes between SVE modes and other modes that might
26563 be bigger than 128 bits. In particular, OImode, CImode and XImode
26564 divide into 128-bit quantities while SVE modes divide into
26565 BITS_PER_SVE_VECTOR quantities. */
26566 if (sve1_p
&& !sve2_p
&& maybe_gt (GET_MODE_BITSIZE (mode2
), 128))
26568 if (sve2_p
&& !sve1_p
&& maybe_gt (GET_MODE_BITSIZE (mode1
), 128))
26572 if (BYTES_BIG_ENDIAN
)
26574 /* Don't allow changes between SVE data modes and non-SVE modes.
26575 See the comment at the head of aarch64-sve.md for details. */
26576 if (sve1_p
!= sve2_p
)
26579 /* Don't allow changes in element size: lane 0 of the new vector
26580 would not then be lane 0 of the old vector. See the comment
26581 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26584 In the worst case, this forces a register to be spilled in
26585 one mode and reloaded in the other, which handles the
26586 endianness correctly. */
26587 if (sve1_p
&& GET_MODE_UNIT_SIZE (mode1
) != GET_MODE_UNIT_SIZE (mode2
))
26593 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
26594 to aarch64_modes_compatible_p. However due to issues with register
26595 allocation it is preferable to avoid tieing integer scalar and FP
26596 scalar modes. Executing integer operations in general registers is
26597 better than treating them as scalar vector operations. This reduces
26598 latency and avoids redundant int<->FP moves. So tie modes if they
26599 are either the same class, or one of them is a vector mode. */
26602 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
26604 if (aarch64_modes_compatible_p (mode1
, mode2
))
26606 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
26608 if (VECTOR_MODE_P (mode1
) || VECTOR_MODE_P (mode2
))
26614 /* Return a new RTX holding the result of moving POINTER forward by
26618 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
26620 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
26622 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
26626 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
26627 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
26628 rather than memcpy. Return true iff we succeeded. */
26630 aarch64_expand_cpymem_mops (rtx
*operands
, bool is_memmove
)
26635 /* All three registers are changed by the instruction, so each one
26636 must be a fresh pseudo. */
26637 rtx dst_addr
= copy_to_mode_reg (Pmode
, XEXP (operands
[0], 0));
26638 rtx src_addr
= copy_to_mode_reg (Pmode
, XEXP (operands
[1], 0));
26639 rtx dst_mem
= replace_equiv_address (operands
[0], dst_addr
);
26640 rtx src_mem
= replace_equiv_address (operands
[1], src_addr
);
26641 rtx sz_reg
= copy_to_mode_reg (DImode
, operands
[2]);
26643 emit_insn (gen_aarch64_movmemdi (dst_mem
, src_mem
, sz_reg
));
26645 emit_insn (gen_aarch64_cpymemdi (dst_mem
, src_mem
, sz_reg
));
26649 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26650 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26651 if this is a memmove rather than memcpy. Return true if we succeed,
26652 otherwise return false, indicating that a libcall should be emitted. */
26654 aarch64_expand_cpymem (rtx
*operands
, bool is_memmove
)
26657 rtx dst
= operands
[0];
26658 rtx src
= operands
[1];
26659 unsigned align
= UINTVAL (operands
[3]);
26661 machine_mode mode
= BLKmode
, next_mode
;
26663 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26664 if (!CONST_INT_P (operands
[2]) || (STRICT_ALIGNMENT
&& align
< 16))
26665 return aarch64_expand_cpymem_mops (operands
, is_memmove
);
26667 unsigned HOST_WIDE_INT size
= UINTVAL (operands
[2]);
26669 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26670 unsigned max_copy_size
= TARGET_SIMD
? 256 : 128;
26671 unsigned mops_threshold
= is_memmove
? aarch64_mops_memmove_size_threshold
26672 : aarch64_mops_memcpy_size_threshold
;
26674 /* Reduce the maximum size with -Os. */
26675 if (optimize_function_for_size_p (cfun
))
26676 max_copy_size
/= 4;
26678 /* Large copies use MOPS when available or a library call. */
26679 if (size
> max_copy_size
|| (TARGET_MOPS
&& size
> mops_threshold
))
26680 return aarch64_expand_cpymem_mops (operands
, is_memmove
);
26682 /* Default to 32-byte LDP/STP on large copies, however small copies or
26683 no SIMD support fall back to 16-byte chunks.
26684 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26685 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26686 whether that would improve performance. */
26687 bool use_qregs
= size
> 24 && TARGET_SIMD
;
26689 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
26690 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
26692 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
26693 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
26695 auto_vec
<std::pair
<rtx
, rtx
>, 16> ops
;
26700 /* Find the largest mode in which to do the copy in without over reading
26702 opt_scalar_int_mode mode_iter
;
26703 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
26704 if (GET_MODE_SIZE (mode_iter
.require ()) <= MIN (size
, 16))
26705 mode
= mode_iter
.require ();
26707 gcc_assert (mode
!= BLKmode
);
26709 mode_bytes
= GET_MODE_SIZE (mode
).to_constant ();
26711 /* Prefer Q-register accesses. */
26712 if (mode_bytes
== 16 && use_qregs
)
26715 rtx reg
= gen_reg_rtx (mode
);
26716 rtx load
= gen_move_insn (reg
, adjust_address (src
, mode
, offset
));
26717 rtx store
= gen_move_insn (adjust_address (dst
, mode
, offset
), reg
);
26718 ops
.safe_push ({ load
, store
});
26719 size
-= mode_bytes
;
26720 offset
+= mode_bytes
;
26722 /* Emit trailing copies using overlapping unaligned accesses
26723 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26724 if (size
> 0 && size
< 16 && !STRICT_ALIGNMENT
)
26726 next_mode
= smallest_mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
);
26727 int n_bytes
= GET_MODE_SIZE (next_mode
).to_constant ();
26728 gcc_assert (n_bytes
<= mode_bytes
);
26729 offset
-= n_bytes
- size
;
26734 /* Memcpy interleaves loads with stores, memmove emits all loads first. */
26735 int nops
= ops
.length();
26736 int inc
= is_memmove
|| nops
<= 8 ? nops
: 6;
26738 for (int i
= 0; i
< nops
; i
+= inc
)
26740 int m
= MIN (nops
, i
+ inc
);
26742 for (int j
= i
; j
< m
; j
++)
26743 emit_insn (ops
[j
].first
);
26745 for (int j
= i
; j
< m
; j
++)
26746 emit_insn (ops
[j
].second
);
26751 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
26752 as for the setmem pattern. Return true iff we succeed. */
26754 aarch64_expand_setmem_mops (rtx
*operands
)
26759 /* The first two registers are changed by the instruction, so both
26760 of them must be a fresh pseudo. */
26761 rtx dst_addr
= copy_to_mode_reg (Pmode
, XEXP (operands
[0], 0));
26762 rtx dst_mem
= replace_equiv_address (operands
[0], dst_addr
);
26763 rtx sz_reg
= copy_to_mode_reg (DImode
, operands
[1]);
26764 rtx val
= operands
[2];
26765 if (val
!= CONST0_RTX (QImode
))
26766 val
= force_reg (QImode
, val
);
26767 emit_insn (gen_aarch64_setmemdi (dst_mem
, val
, sz_reg
));
26771 /* Expand setmem, as if from a __builtin_memset. Return true if
26772 we succeed, otherwise return false. */
26775 aarch64_expand_setmem (rtx
*operands
)
26778 unsigned HOST_WIDE_INT len
;
26779 rtx dst
= operands
[0];
26780 rtx val
= operands
[2], src
;
26781 unsigned align
= UINTVAL (operands
[3]);
26783 machine_mode mode
= BLKmode
, next_mode
;
26785 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26786 if (!CONST_INT_P (operands
[1]) || !TARGET_SIMD
26787 || (STRICT_ALIGNMENT
&& align
< 16))
26788 return aarch64_expand_setmem_mops (operands
);
26790 /* Set inline limits for memset. MOPS has a separate threshold. */
26791 unsigned max_set_size
= MAX_SET_SIZE (optimize_function_for_speed_p (cfun
));
26792 unsigned mops_threshold
= aarch64_mops_memset_size_threshold
;
26794 len
= UINTVAL (operands
[1]);
26796 /* Large memset uses MOPS when available or a library call. */
26797 if (len
> max_set_size
|| (TARGET_MOPS
&& len
> mops_threshold
))
26798 return aarch64_expand_setmem_mops (operands
);
26800 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
26801 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
26803 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26804 val
= expand_vector_broadcast (V16QImode
, val
);
26805 val
= force_reg (V16QImode
, val
);
26810 /* Find the largest mode in which to do the copy without
26812 opt_scalar_int_mode mode_iter
;
26813 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
26814 if (GET_MODE_SIZE (mode_iter
.require ()) <= MIN (len
, 16))
26815 mode
= mode_iter
.require ();
26817 gcc_assert (mode
!= BLKmode
);
26819 mode_bytes
= GET_MODE_SIZE (mode
).to_constant ();
26823 /* Prefer Q-register accesses. */
26824 if (mode_bytes
== 16)
26827 src
= lowpart_subreg (mode
, src
, GET_MODE (val
));
26829 emit_move_insn (adjust_address (dst
, mode
, offset
), src
);
26831 offset
+= mode_bytes
;
26833 /* Emit trailing writes using overlapping unaligned accesses
26834 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26835 if (len
> 0 && len
< 16 && !STRICT_ALIGNMENT
)
26837 next_mode
= smallest_mode_for_size (len
* BITS_PER_UNIT
, MODE_INT
);
26838 int n_bytes
= GET_MODE_SIZE (next_mode
).to_constant ();
26839 gcc_assert (n_bytes
<= mode_bytes
);
26840 offset
-= n_bytes
- len
;
26849 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
26850 SImode stores. Handle the case when the constant has identical
26851 bottom and top halves. This is beneficial when the two stores can be
26852 merged into an STP and we avoid synthesising potentially expensive
26853 immediates twice. Return true if such a split is possible. */
26856 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
26858 rtx lo
= gen_lowpart (SImode
, src
);
26859 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
26861 if (!rtx_equal_p (lo
, hi
))
26864 unsigned int orig_cost
26865 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
26866 unsigned int lo_cost
26867 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
26869 /* We want to transform:
26871 MOVK x1, 0x140, lsl 16
26872 MOVK x1, 0xc0da, lsl 32
26873 MOVK x1, 0x140, lsl 48
26877 MOVK w1, 0x140, lsl 16
26879 So we want to perform this when we save at least one instruction. */
26880 if (orig_cost
<= lo_cost
)
26883 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
26884 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
26887 rtx tmp_reg
= gen_reg_rtx (SImode
);
26888 aarch64_expand_mov_immediate (tmp_reg
, lo
);
26889 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
26890 /* Don't emit an explicit store pair as this may not be always profitable.
26891 Let the sched-fusion logic decide whether to merge them. */
26892 emit_move_insn (mem_lo
, tmp_reg
);
26893 emit_move_insn (mem_hi
, tmp_reg
);
26898 /* Generate RTL for a conditional branch with rtx comparison CODE in
26899 mode CC_MODE. The destination of the unlikely conditional branch
26903 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
26907 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
26908 gen_rtx_REG (cc_mode
, CC_REGNUM
),
26911 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
26912 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
26914 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
26917 /* Generate DImode scratch registers for 128-bit (TImode) addition.
26919 OP1 represents the TImode destination operand 1
26920 OP2 represents the TImode destination operand 2
26921 LOW_DEST represents the low half (DImode) of TImode operand 0
26922 LOW_IN1 represents the low half (DImode) of TImode operand 1
26923 LOW_IN2 represents the low half (DImode) of TImode operand 2
26924 HIGH_DEST represents the high half (DImode) of TImode operand 0
26925 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26926 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26929 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
26930 rtx
*low_in1
, rtx
*low_in2
,
26931 rtx
*high_dest
, rtx
*high_in1
,
26934 *low_dest
= gen_reg_rtx (DImode
);
26935 *low_in1
= force_lowpart_subreg (DImode
, op1
, TImode
);
26936 *low_in2
= force_lowpart_subreg (DImode
, op2
, TImode
);
26937 *high_dest
= gen_reg_rtx (DImode
);
26938 *high_in1
= force_highpart_subreg (DImode
, op1
, TImode
);
26939 *high_in2
= force_highpart_subreg (DImode
, op2
, TImode
);
26942 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
26944 OP1 represents the TImode destination operand 1
26945 OP2 represents the TImode destination operand 2
26946 LOW_DEST represents the low half (DImode) of TImode operand 0
26947 LOW_IN1 represents the low half (DImode) of TImode operand 1
26948 LOW_IN2 represents the low half (DImode) of TImode operand 2
26949 HIGH_DEST represents the high half (DImode) of TImode operand 0
26950 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26951 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26955 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
26956 rtx
*low_in1
, rtx
*low_in2
,
26957 rtx
*high_dest
, rtx
*high_in1
,
26960 *low_dest
= gen_reg_rtx (DImode
);
26961 *low_in1
= force_lowpart_subreg (DImode
, op1
, TImode
);
26962 *low_in2
= force_lowpart_subreg (DImode
, op2
, TImode
);
26963 *high_dest
= gen_reg_rtx (DImode
);
26965 *high_in1
= force_highpart_subreg (DImode
, op1
, TImode
);
26966 *high_in2
= force_highpart_subreg (DImode
, op2
, TImode
);
26969 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
26971 OP0 represents the TImode destination operand 0
26972 LOW_DEST represents the low half (DImode) of TImode operand 0
26973 LOW_IN1 represents the low half (DImode) of TImode operand 1
26974 LOW_IN2 represents the low half (DImode) of TImode operand 2
26975 HIGH_DEST represents the high half (DImode) of TImode operand 0
26976 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26977 HIGH_IN2 represents the high half (DImode) of TImode operand 2
26978 UNSIGNED_P is true if the operation is being performed on unsigned
26981 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
26982 rtx low_in2
, rtx high_dest
, rtx high_in1
,
26983 rtx high_in2
, bool unsigned_p
)
26985 if (low_in2
== const0_rtx
)
26987 low_dest
= low_in1
;
26988 high_in2
= force_reg (DImode
, high_in2
);
26990 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
26992 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
26996 if (aarch64_plus_immediate (low_in2
, DImode
))
26997 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
26998 GEN_INT (-UINTVAL (low_in2
))));
27001 low_in2
= force_reg (DImode
, low_in2
);
27002 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
27004 high_in2
= force_reg (DImode
, high_in2
);
27007 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
27009 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
27012 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
27013 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
27017 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
27019 static unsigned HOST_WIDE_INT
27020 aarch64_asan_shadow_offset (void)
27023 return (HOST_WIDE_INT_1
<< 29);
27025 return (HOST_WIDE_INT_1
<< 36);
27029 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
27030 rtx_code code
, tree treeop0
, tree treeop1
)
27032 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
27034 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
27036 struct expand_operand ops
[4];
27039 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
27041 op_mode
= GET_MODE (op0
);
27042 if (op_mode
== VOIDmode
)
27043 op_mode
= GET_MODE (op1
);
27051 icode
= CODE_FOR_cmpsi
;
27056 icode
= CODE_FOR_cmpdi
;
27061 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
27062 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
27067 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
27068 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
27076 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
27077 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
27083 *prep_seq
= get_insns ();
27086 create_fixed_operand (&ops
[0], op0
);
27087 create_fixed_operand (&ops
[1], op1
);
27090 if (!maybe_expand_insn (icode
, 2, ops
))
27095 *gen_seq
= get_insns ();
27098 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
27099 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
27103 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
27104 rtx_code cmp_code
, tree treeop0
, tree treeop1
,
27107 rtx op0
, op1
, target
;
27108 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
27109 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
27111 struct expand_operand ops
[6];
27114 push_to_sequence (*prep_seq
);
27115 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
27117 op_mode
= GET_MODE (op0
);
27118 if (op_mode
== VOIDmode
)
27119 op_mode
= GET_MODE (op1
);
27135 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
27140 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
27148 icode
= code_for_ccmp (cc_mode
, cmp_mode
);
27150 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
27151 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
27157 *prep_seq
= get_insns ();
27160 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
27161 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
27163 if (bit_code
!= AND
)
27165 /* Treat the ccmp patterns as canonical and use them where possible,
27166 but fall back to ccmp_rev patterns if there's no other option. */
27167 rtx_code prev_code
= GET_CODE (prev
);
27168 machine_mode prev_mode
= GET_MODE (XEXP (prev
, 0));
27169 if ((prev_mode
== CCFPmode
|| prev_mode
== CCFPEmode
)
27170 && !(prev_code
== EQ
27172 || prev_code
== ORDERED
27173 || prev_code
== UNORDERED
))
27174 icode
= code_for_ccmp_rev (cc_mode
, cmp_mode
);
27177 rtx_code code
= reverse_condition (prev_code
);
27178 prev
= gen_rtx_fmt_ee (code
, VOIDmode
, XEXP (prev
, 0), const0_rtx
);
27180 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
27183 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
27184 create_fixed_operand (&ops
[1], target
);
27185 create_fixed_operand (&ops
[2], op0
);
27186 create_fixed_operand (&ops
[3], op1
);
27187 create_fixed_operand (&ops
[4], prev
);
27188 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
27190 push_to_sequence (*gen_seq
);
27191 if (!maybe_expand_insn (icode
, 6, ops
))
27197 *gen_seq
= get_insns ();
27200 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
27203 #undef TARGET_GEN_CCMP_FIRST
27204 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27206 #undef TARGET_GEN_CCMP_NEXT
27207 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27209 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27210 instruction fusion of some sort. */
27213 aarch64_macro_fusion_p (void)
27215 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
27219 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27220 should be kept together during scheduling. */
27223 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
27226 rtx prev_set
= single_set (prev
);
27227 rtx curr_set
= single_set (curr
);
27228 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27229 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
27231 if (!aarch64_macro_fusion_p ())
27234 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
27236 /* We are trying to match:
27237 prev (mov) == (set (reg r0) (const_int imm16))
27238 curr (movk) == (set (zero_extract (reg r0)
27241 (const_int imm16_1)) */
27243 set_dest
= SET_DEST (curr_set
);
27245 if (GET_CODE (set_dest
) == ZERO_EXTRACT
27246 && CONST_INT_P (SET_SRC (curr_set
))
27247 && CONST_INT_P (SET_SRC (prev_set
))
27248 && CONST_INT_P (XEXP (set_dest
, 2))
27249 && INTVAL (XEXP (set_dest
, 2)) == 16
27250 && REG_P (XEXP (set_dest
, 0))
27251 && REG_P (SET_DEST (prev_set
))
27252 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
27258 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
27261 /* We're trying to match:
27262 prev (adrp) == (set (reg r1)
27263 (high (symbol_ref ("SYM"))))
27264 curr (add) == (set (reg r0)
27266 (symbol_ref ("SYM"))))
27267 Note that r0 need not necessarily be the same as r1, especially
27268 during pre-regalloc scheduling. */
27270 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
27271 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
27273 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
27274 && REG_P (XEXP (SET_SRC (curr_set
), 0))
27275 && REGNO (XEXP (SET_SRC (curr_set
), 0))
27276 == REGNO (SET_DEST (prev_set
))
27277 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
27278 XEXP (SET_SRC (curr_set
), 1)))
27283 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
27286 /* We're trying to match:
27287 prev (movk) == (set (zero_extract (reg r0)
27290 (const_int imm16_1))
27291 curr (movk) == (set (zero_extract (reg r0)
27294 (const_int imm16_2)) */
27296 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
27297 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
27298 && REG_P (XEXP (SET_DEST (prev_set
), 0))
27299 && REG_P (XEXP (SET_DEST (curr_set
), 0))
27300 && REGNO (XEXP (SET_DEST (prev_set
), 0))
27301 == REGNO (XEXP (SET_DEST (curr_set
), 0))
27302 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
27303 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
27304 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
27305 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
27306 && CONST_INT_P (SET_SRC (prev_set
))
27307 && CONST_INT_P (SET_SRC (curr_set
)))
27311 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
27313 /* We're trying to match:
27314 prev (adrp) == (set (reg r0)
27315 (high (symbol_ref ("SYM"))))
27316 curr (ldr) == (set (reg r1)
27317 (mem (lo_sum (reg r0)
27318 (symbol_ref ("SYM")))))
27320 curr (ldr) == (set (reg r1)
27323 (symbol_ref ("SYM")))))) */
27324 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
27325 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
27327 rtx curr_src
= SET_SRC (curr_set
);
27329 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
27330 curr_src
= XEXP (curr_src
, 0);
27332 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
27333 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
27334 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
27335 == REGNO (SET_DEST (prev_set
))
27336 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
27337 XEXP (SET_SRC (prev_set
), 0)))
27342 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27343 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
27344 && prev_set
&& curr_set
&& any_condjump_p (curr
)
27345 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
27346 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
27347 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
27350 /* Fuse flag-setting ALU instructions and conditional branch. */
27351 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
27352 && any_condjump_p (curr
))
27354 unsigned int condreg1
, condreg2
;
27356 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
27357 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
27359 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
27361 && modified_in_p (cc_reg_1
, prev
))
27363 enum attr_type prev_type
= get_attr_type (prev
);
27365 /* FIXME: this misses some which is considered simple arthematic
27366 instructions for ThunderX. Simple shifts are missed here. */
27367 if (prev_type
== TYPE_ALUS_SREG
27368 || prev_type
== TYPE_ALUS_IMM
27369 || prev_type
== TYPE_LOGICS_REG
27370 || prev_type
== TYPE_LOGICS_IMM
)
27375 /* Fuse ALU instructions and CBZ/CBNZ. */
27378 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ
)
27379 && any_condjump_p (curr
))
27381 /* We're trying to match:
27382 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27383 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27385 (label_ref ("SYM"))
27387 if (SET_DEST (curr_set
) == (pc_rtx
)
27388 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
27389 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
27390 && REG_P (SET_DEST (prev_set
))
27391 && REGNO (SET_DEST (prev_set
))
27392 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
27394 /* Fuse ALU operations followed by conditional branch instruction. */
27395 switch (get_attr_type (prev
))
27398 case TYPE_ALU_SREG
:
27401 case TYPE_ADCS_REG
:
27402 case TYPE_ADCS_IMM
:
27403 case TYPE_LOGIC_REG
:
27404 case TYPE_LOGIC_IMM
:
27408 case TYPE_SHIFT_REG
:
27409 case TYPE_SHIFT_IMM
:
27421 /* Fuse A+B+1 and A-B-1 */
27423 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1
))
27425 /* We're trying to match:
27426 prev == (set (r0) (plus (r0) (r1)))
27427 curr == (set (r0) (plus (r0) (const_int 1)))
27429 prev == (set (r0) (minus (r0) (r1)))
27430 curr == (set (r0) (plus (r0) (const_int -1))) */
27432 rtx prev_src
= SET_SRC (prev_set
);
27433 rtx curr_src
= SET_SRC (curr_set
);
27436 if (GET_CODE (prev_src
) == MINUS
)
27439 if (GET_CODE (curr_src
) == PLUS
27440 && (GET_CODE (prev_src
) == PLUS
|| GET_CODE (prev_src
) == MINUS
)
27441 && CONST_INT_P (XEXP (curr_src
, 1))
27442 && INTVAL (XEXP (curr_src
, 1)) == polarity
27443 && REG_P (XEXP (curr_src
, 0))
27444 && REG_P (SET_DEST (prev_set
))
27445 && REGNO (SET_DEST (prev_set
)) == REGNO (XEXP (curr_src
, 0)))
27452 /* Return true iff the instruction fusion described by OP is enabled. */
27455 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
27457 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
27460 /* If MEM is in the form of [base+offset], extract the two parts
27461 of address and set to BASE and OFFSET, otherwise return false
27462 after clearing BASE and OFFSET. */
27465 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
27469 gcc_assert (MEM_P (mem
));
27471 addr
= XEXP (mem
, 0);
27476 *offset
= const0_rtx
;
27480 if (GET_CODE (addr
) == PLUS
27481 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
27483 *base
= XEXP (addr
, 0);
27484 *offset
= XEXP (addr
, 1);
27489 *offset
= NULL_RTX
;
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  /* NOTE(review): the following members were missing from an incomplete
     extraction; SCHED_FUSION_LD and SCHED_FUSION_ST are grounded by their
     uses in fusion_load_store below — confirm SCHED_FUSION_NUM against
     upstream.  */
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
27505 /* If INSN is a load or store of address in the form of [base+offset],
27506 extract the two parts and set to BASE and OFFSET. Return scheduling
27507 fusion type this INSN is. */
27509 static enum sched_fusion_type
27510 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
27513 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
27515 gcc_assert (INSN_P (insn
));
27516 x
= PATTERN (insn
);
27517 if (GET_CODE (x
) != SET
)
27518 return SCHED_FUSION_NONE
;
27521 dest
= SET_DEST (x
);
27523 machine_mode dest_mode
= GET_MODE (dest
);
27525 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
27526 return SCHED_FUSION_NONE
;
27528 if (GET_CODE (src
) == SIGN_EXTEND
)
27530 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
27531 src
= XEXP (src
, 0);
27532 if (!MEM_P (src
) || GET_MODE (src
) != SImode
)
27533 return SCHED_FUSION_NONE
;
27535 else if (GET_CODE (src
) == ZERO_EXTEND
)
27537 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
27538 src
= XEXP (src
, 0);
27539 if (!MEM_P (src
) || GET_MODE (src
) != SImode
)
27540 return SCHED_FUSION_NONE
;
27543 if (MEM_P (src
) && REG_P (dest
))
27544 extract_base_offset_in_addr (src
, base
, offset
);
27545 else if (MEM_P (dest
) && (REG_P (src
) || src
== const0_rtx
))
27547 fusion
= SCHED_FUSION_ST
;
27548 extract_base_offset_in_addr (dest
, base
, offset
);
27551 return SCHED_FUSION_NONE
;
27553 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
27554 fusion
= SCHED_FUSION_NONE
;
27559 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
27561 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
27562 and PRI are only calculated for these instructions. For other instruction,
27563 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
27564 type instruction fusion can be added by returning different priorities.
27566 It's important that irrelevant instructions get the largest FUSION_PRI. */
27569 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
27570 int *fusion_pri
, int *pri
)
27574 enum sched_fusion_type fusion
;
27576 gcc_assert (INSN_P (insn
));
27579 fusion
= fusion_load_store (insn
, &base
, &offset
);
27580 if (fusion
== SCHED_FUSION_NONE
)
27587 /* Set FUSION_PRI according to fusion type and base register. */
27588 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
27590 /* Calculate PRI. */
27593 /* INSN with smaller offset goes first. */
27594 off_val
= (int)(INTVAL (offset
));
27596 tmp
-= (off_val
& 0xfffff);
27598 tmp
+= ((- off_val
) & 0xfffff);
27604 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27605 Adjust priority of sha1h instructions so they are scheduled before
27606 other SHA1 instructions. */
27609 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
27611 rtx x
= PATTERN (insn
);
27613 if (GET_CODE (x
) == SET
)
27617 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
27618 return priority
+ 10;
27624 /* If REVERSED is null, return true if memory reference *MEM2 comes
27625 immediately after memory reference *MEM1. Do not change the references
27628 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27629 if they are, try to make them use constant offsets from the same base
27630 register. Return true on success. When returning true, set *REVERSED
27631 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27633 aarch64_check_consecutive_mems (rtx
*mem1
, rtx
*mem2
, bool *reversed
)
27638 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1
, 0))) == RTX_AUTOINC
27639 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2
, 0))) == RTX_AUTOINC
)
27642 if (!MEM_SIZE_KNOWN_P (*mem1
) || !MEM_SIZE_KNOWN_P (*mem2
))
27645 auto size1
= MEM_SIZE (*mem1
);
27646 auto size2
= MEM_SIZE (*mem2
);
27648 rtx base1
, base2
, offset1
, offset2
;
27649 extract_base_offset_in_addr (*mem1
, &base1
, &offset1
);
27650 extract_base_offset_in_addr (*mem2
, &base2
, &offset2
);
27652 /* Make sure at least one memory is in base+offset form. */
27653 if (!(base1
&& offset1
) && !(base2
&& offset2
))
27656 /* If both mems already use the same base register, just check the
27658 if (base1
&& base2
&& rtx_equal_p (base1
, base2
))
27660 if (!offset1
|| !offset2
)
27663 if (known_eq (UINTVAL (offset1
) + size1
, UINTVAL (offset2
)))
27666 if (known_eq (UINTVAL (offset2
) + size2
, UINTVAL (offset1
)) && reversed
)
27675 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27676 guarantee that the values are consecutive. */
27677 if (MEM_EXPR (*mem1
)
27678 && MEM_EXPR (*mem2
)
27679 && MEM_OFFSET_KNOWN_P (*mem1
)
27680 && MEM_OFFSET_KNOWN_P (*mem2
))
27682 poly_int64 expr_offset1
;
27683 poly_int64 expr_offset2
;
27684 tree expr_base1
= get_addr_base_and_unit_offset (MEM_EXPR (*mem1
),
27686 tree expr_base2
= get_addr_base_and_unit_offset (MEM_EXPR (*mem2
),
27690 || !DECL_P (expr_base1
)
27691 || !operand_equal_p (expr_base1
, expr_base2
, OEP_ADDRESS_OF
))
27694 expr_offset1
+= MEM_OFFSET (*mem1
);
27695 expr_offset2
+= MEM_OFFSET (*mem2
);
27697 if (known_eq (expr_offset1
+ size1
, expr_offset2
))
27699 else if (known_eq (expr_offset2
+ size2
, expr_offset1
) && reversed
)
27708 rtx addr1
= plus_constant (Pmode
, XEXP (*mem2
, 0),
27709 expr_offset1
- expr_offset2
);
27710 *mem1
= replace_equiv_address_nv (*mem1
, addr1
);
27714 rtx addr2
= plus_constant (Pmode
, XEXP (*mem1
, 0),
27715 expr_offset2
- expr_offset1
);
27716 *mem2
= replace_equiv_address_nv (*mem2
, addr2
);
27725 /* Test if MODE is suitable for a single transfer register in an ldp or stp
27729 aarch64_ldpstp_operand_mode_p (machine_mode mode
)
27731 if (!targetm
.hard_regno_mode_ok (V0_REGNUM
, mode
)
27732 || hard_regno_nregs (V0_REGNUM
, mode
) > 1)
27735 const auto size
= GET_MODE_SIZE (mode
);
27736 return known_eq (size
, 4) || known_eq (size
, 8) || known_eq (size
, 16);
27739 /* Return true if MEM1 and MEM2 can be combined into a single access
27740 of mode MODE, with the combined access having the same address as MEM1. */
27743 aarch64_mergeable_load_pair_p (machine_mode mode
, rtx mem1
, rtx mem2
)
27745 if (STRICT_ALIGNMENT
&& MEM_ALIGN (mem1
) < GET_MODE_ALIGNMENT (mode
))
27747 return aarch64_check_consecutive_mems (&mem1
, &mem2
, nullptr);
27750 /* Return true if MEM agrees with the ldp-stp policy model.
27751 Otherwise, false. */
27754 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem
, bool load
, machine_mode mode
)
27756 auto policy
= (load
27757 ? aarch64_tune_params
.ldp_policy_model
27758 : aarch64_tune_params
.stp_policy_model
);
27760 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
27761 if (policy
== AARCH64_LDP_STP_POLICY_NEVER
)
27764 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
27765 do not emit the load pair unless the alignment is checked to be
27766 at least double the alignment of the type. */
27767 if (policy
== AARCH64_LDP_STP_POLICY_ALIGNED
27768 && !optimize_function_for_size_p (cfun
)
27769 && MEM_ALIGN (mem
) < 2 * GET_MODE_ALIGNMENT (mode
))
27775 /* Given OPERANDS of consecutive load/store, check if we can merge
27776 them into ldp/stp. LOAD is true if they are load instructions. */
27779 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
)
27781 enum reg_class rclass_1
, rclass_2
;
27782 rtx mem_1
, mem_2
, reg_1
, reg_2
;
27786 mem_1
= operands
[1];
27787 mem_2
= operands
[3];
27788 reg_1
= operands
[0];
27789 reg_2
= operands
[2];
27790 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
27791 if (REGNO (reg_1
) == REGNO (reg_2
))
27793 if (reg_overlap_mentioned_p (reg_1
, mem_2
))
27798 mem_1
= operands
[0];
27799 mem_2
= operands
[2];
27800 reg_1
= operands
[1];
27801 reg_2
= operands
[3];
27804 /* The mems cannot be volatile. */
27805 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
27808 /* Check if the addresses are in the form of [base+offset]. */
27809 bool reversed
= false;
27810 if (!aarch64_check_consecutive_mems (&mem_1
, &mem_2
, &reversed
))
27813 /* The operands must be of the same size. */
27814 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
27815 GET_MODE_SIZE (GET_MODE (mem_2
))));
27817 /* The lower memory access must be a mem-pair operand. */
27818 rtx lower_mem
= reversed
? mem_2
: mem_1
;
27819 machine_mode lower_mem_mode
= GET_MODE (lower_mem
);
27820 if (!aarch64_mem_pair_operand (lower_mem
, lower_mem_mode
))
27823 /* Check if lower_mem is ok with the ldp-stp policy model. */
27824 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem
, load
,
27828 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
27829 rclass_1
= FP_REGS
;
27831 rclass_1
= GENERAL_REGS
;
27833 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
27834 rclass_2
= FP_REGS
;
27836 rclass_2
= GENERAL_REGS
;
27838 /* Check if the registers are of same class. */
27839 if (rclass_1
!= rclass_2
)
27845 /* Given OPERANDS of consecutive load/store that can be merged,
27846 swap them if they are not in ascending order. */
27848 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
27850 int mem_op
= load
? 1 : 0;
27851 bool reversed
= false;
27852 if (!aarch64_check_consecutive_mems (operands
+ mem_op
,
27853 operands
+ mem_op
+ 2, &reversed
))
27854 gcc_unreachable ();
27858 /* Irrespective of whether this is a load or a store,
27859 we do the same swap. */
27860 std::swap (operands
[0], operands
[2]);
27861 std::swap (operands
[1], operands
[3]);
27865 /* Helper function used for generation of load/store pair instructions, called
27866 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
27867 operands as matched by the peepholes in that file. LOAD_P is true if we're
27868 generating a load pair, otherwise we're generating a store pair. CODE is
27869 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
27870 standard load/store pair. */
27873 aarch64_finish_ldpstp_peephole (rtx
*operands
, bool load_p
, enum rtx_code code
)
27875 aarch64_swap_ldrstr_operands (operands
, load_p
);
27878 emit_insn (aarch64_gen_load_pair (operands
[0], operands
[2],
27879 operands
[1], code
));
27882 gcc_assert (code
== UNKNOWN
);
27883 emit_insn (aarch64_gen_store_pair (operands
[0], operands
[1],
27888 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
27889 comparison between the two. */
27891 aarch64_host_wide_int_compare (const void *x
, const void *y
)
27893 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
27894 * ((const HOST_WIDE_INT
*) y
));
27897 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
27898 other pointing to a REG rtx containing an offset, compare the offsets
27903 1 iff offset (X) > offset (Y)
27904 0 iff offset (X) == offset (Y)
27905 -1 iff offset (X) < offset (Y) */
27907 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
27909 const rtx
* operands_1
= (const rtx
*) x
;
27910 const rtx
* operands_2
= (const rtx
*) y
;
27911 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
27913 if (MEM_P (operands_1
[0]))
27914 mem_1
= operands_1
[0];
27916 mem_1
= operands_1
[1];
27918 if (MEM_P (operands_2
[0]))
27919 mem_2
= operands_2
[0];
27921 mem_2
= operands_2
[1];
27923 /* Extract the offsets. */
27924 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
27925 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
27927 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
27929 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
27932 /* Given OPERANDS of consecutive load/store, check if we can merge
27933 them into ldp/stp by adjusting the offset. LOAD is true if they
27934 are load instructions. MODE is the mode of memory operands.
27936 Given below consecutive stores:
27938 str w1, [xb, 0x100]
27939 str w1, [xb, 0x104]
27940 str w1, [xb, 0x108]
27941 str w1, [xb, 0x10c]
27943 Though the offsets are out of the range supported by stp, we can
27944 still pair them after adjusting the offset, like:
27946 add scratch, xb, 0x100
27947 stp w1, w1, [scratch]
27948 stp w1, w1, [scratch, 0x8]
27950 The peephole patterns detecting this opportunity should guarantee
27951 the scratch register is avaliable. */
27954 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
27957 const int num_insns
= 4;
27958 enum reg_class rclass
;
27959 HOST_WIDE_INT offvals
[num_insns
], msize
;
27960 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
27964 for (int i
= 0; i
< num_insns
; i
++)
27966 reg
[i
] = operands
[2 * i
];
27967 mem
[i
] = operands
[2 * i
+ 1];
27969 gcc_assert (REG_P (reg
[i
]));
27972 /* Do not attempt to merge the loads if the loads clobber each other. */
27973 for (int i
= 0; i
< 8; i
+= 2)
27974 for (int j
= i
+ 2; j
< 8; j
+= 2)
27975 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
27979 for (int i
= 0; i
< num_insns
; i
++)
27981 mem
[i
] = operands
[2 * i
];
27982 reg
[i
] = operands
[2 * i
+ 1];
27985 /* Skip if memory operand is by itself valid for ldp/stp. */
27986 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
27989 for (int i
= 0; i
< num_insns
; i
++)
27991 /* The mems cannot be volatile. */
27992 if (MEM_VOLATILE_P (mem
[i
]))
27995 /* Check if the addresses are in the form of [base+offset]. */
27996 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
27997 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
28001 /* Check if the registers are of same class. */
28002 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
28003 ? FP_REGS
: GENERAL_REGS
;
28005 for (int i
= 1; i
< num_insns
; i
++)
28006 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
28008 if (rclass
!= FP_REGS
)
28013 if (rclass
!= GENERAL_REGS
)
28017 /* Only the last register in the order in which they occur
28018 may be clobbered by the load. */
28019 if (rclass
== GENERAL_REGS
&& load
)
28020 for (int i
= 0; i
< num_insns
- 1; i
++)
28021 if (reg_mentioned_p (reg
[i
], mem
[i
]))
28024 /* Check if the bases are same. */
28025 for (int i
= 0; i
< num_insns
- 1; i
++)
28026 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
28029 for (int i
= 0; i
< num_insns
; i
++)
28030 offvals
[i
] = INTVAL (offset
[i
]);
28032 msize
= GET_MODE_SIZE (mode
).to_constant ();
28034 /* Check if the offsets can be put in the right order to do a ldp/stp. */
28035 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
28036 aarch64_host_wide_int_compare
);
28038 if (!(offvals
[1] == offvals
[0] + msize
28039 && offvals
[3] == offvals
[2] + msize
))
28042 /* Check that offsets are within range of each other. The ldp/stp
28043 instructions have 7 bit immediate offsets, so use 0x80. */
28044 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
28047 /* The offsets must be aligned with respect to each other. */
28048 if (offvals
[0] % msize
!= offvals
[2] % msize
)
28051 /* Check if mem[0] is ok with the ldp-stp policy model. */
28052 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem
[0], load
, mode
))
28058 /* Given OPERANDS of consecutive load/store, this function pairs them
28059 into LDP/STP after adjusting the offset. It depends on the fact
28060 that the operands can be sorted so the offsets are correct for STP.
28061 MODE is the mode of memory operands. CODE is the rtl operator
28062 which should be applied to all memory operands, it's SIGN_EXTEND,
28063 ZERO_EXTEND or UNKNOWN. */
28066 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
28067 machine_mode mode
, RTX_CODE code
)
28069 rtx base
, offset_1
, offset_2
;
28071 rtx temp_operands
[8];
28072 HOST_WIDE_INT off_val_1
, off_val_2
, base_off
, new_off_1
, new_off_2
,
28073 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
28075 /* We make changes on a copy as we may still bail out. */
28076 for (int i
= 0; i
< 8; i
++)
28077 temp_operands
[i
] = operands
[i
];
28079 /* Sort the operands. Note for cases as below:
28084 We need stable sorting otherwise wrong data may be store to offset 0x320.
28085 Also note the dead store in above case should be optimized away, but no
28086 guarantees here. */
28087 gcc_stablesort(temp_operands
, 4, 2 * sizeof (rtx
*),
28088 aarch64_ldrstr_offset_compare
);
28090 /* Copy the memory operands so that if we have to bail for some
28091 reason the original addresses are unchanged. */
28094 mem_1
= copy_rtx (temp_operands
[1]);
28095 mem_2
= copy_rtx (temp_operands
[5]);
28099 mem_1
= copy_rtx (temp_operands
[0]);
28100 mem_2
= copy_rtx (temp_operands
[4]);
28101 gcc_assert (code
== UNKNOWN
);
28104 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
28105 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
28106 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
28107 && offset_2
!= NULL_RTX
);
28109 /* Adjust offset so it can fit in LDP/STP instruction. */
28110 msize
= GET_MODE_SIZE (mode
).to_constant();
28111 stp_off_upper_limit
= msize
* (0x40 - 1);
28112 stp_off_lower_limit
= - msize
* 0x40;
28114 off_val_1
= INTVAL (offset_1
);
28115 off_val_2
= INTVAL (offset_2
);
28117 /* The base offset is optimally half way between the two STP/LDP offsets. */
28119 base_off
= (off_val_1
+ off_val_2
) / 2;
28121 /* However, due to issues with negative LDP/STP offset generation for
28122 larger modes, for DF, DD, DI and vector modes. we must not use negative
28123 addresses smaller than 9 signed unadjusted bits can store. This
28124 provides the most range in this case. */
28125 base_off
= off_val_1
;
28127 /* Adjust the base so that it is aligned with the addresses but still
28129 if (base_off
% msize
!= off_val_1
% msize
)
28130 /* Fix the offset, bearing in mind we want to make it bigger not
28132 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28133 else if (msize
<= 4)
28134 /* The negative range of LDP/STP is one larger than the positive range. */
28137 /* Check if base offset is too big or too small. We can attempt to resolve
28138 this issue by setting it to the maximum value and seeing if the offsets
28140 if (base_off
>= 0x1000)
28142 base_off
= 0x1000 - 1;
28143 /* We must still make sure that the base offset is aligned with respect
28144 to the address. But it may not be made any bigger. */
28145 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28148 /* Likewise for the case where the base is too small. */
28149 if (base_off
<= -0x1000)
28151 base_off
= -0x1000 + 1;
28152 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28155 /* Offset of the first STP/LDP. */
28156 new_off_1
= off_val_1
- base_off
;
28158 /* Offset of the second STP/LDP. */
28159 new_off_2
= off_val_2
- base_off
;
28161 /* The offsets must be within the range of the LDP/STP instructions. */
28162 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
28163 || new_off_2
> stp_off_upper_limit
|| new_off_2
< stp_off_lower_limit
)
28166 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
28168 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
28171 if (!aarch64_mem_pair_operand (mem_1
, mode
)
28172 || !aarch64_mem_pair_operand (mem_2
, mode
))
28177 operands
[0] = temp_operands
[0];
28178 operands
[1] = mem_1
;
28179 operands
[2] = temp_operands
[2];
28180 operands
[4] = temp_operands
[4];
28181 operands
[5] = mem_2
;
28182 operands
[6] = temp_operands
[6];
28186 operands
[0] = mem_1
;
28187 operands
[1] = temp_operands
[1];
28188 operands
[3] = temp_operands
[3];
28189 operands
[4] = mem_2
;
28190 operands
[5] = temp_operands
[5];
28191 operands
[7] = temp_operands
[7];
28194 /* Emit adjusting instruction. */
28195 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
28196 /* Emit ldp/stp instructions. */
28199 emit_insn (aarch64_gen_load_pair (operands
[0], operands
[2],
28200 operands
[1], code
));
28201 emit_insn (aarch64_gen_load_pair (operands
[4], operands
[6],
28202 operands
[5], code
));
28206 emit_insn (aarch64_gen_store_pair (operands
[0], operands
[1],
28208 emit_insn (aarch64_gen_store_pair (operands
[4], operands
[5],
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
28224 /* Return 1 if pseudo register should be created and used to hold
28225 GOT address for PIC code. */
28228 aarch64_use_pseudo_pic_reg (void)
28230 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
28233 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28236 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
28238 switch (XINT (x
, 1))
28240 case UNSPEC_GOTSMALLPIC
:
28241 case UNSPEC_GOTSMALLPIC28K
:
28242 case UNSPEC_GOTTINYPIC
:
28248 return default_unspec_may_trap_p (x
, flags
);
28252 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28253 return the log2 of that value. Otherwise return -1. */
28256 aarch64_fpconst_pow_of_2 (rtx x
)
28258 const REAL_VALUE_TYPE
*r
;
28260 if (!CONST_DOUBLE_P (x
))
28263 r
= CONST_DOUBLE_REAL_VALUE (x
);
28265 if (REAL_VALUE_NEGATIVE (*r
)
28266 || REAL_VALUE_ISNAN (*r
)
28267 || REAL_VALUE_ISINF (*r
)
28268 || !real_isinteger (r
, DFmode
))
28271 return exact_log2 (real_to_integer (r
));
28274 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28275 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
28276 return n. Otherwise return -1. */
28279 aarch64_fpconst_pow2_recip (rtx x
)
28281 REAL_VALUE_TYPE r0
;
28283 if (!CONST_DOUBLE_P (x
))
28286 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
28287 if (exact_real_inverse (DFmode
, &r0
)
28288 && !REAL_VALUE_NEGATIVE (r0
))
28290 int ret
= exact_log2 (real_to_integer (&r0
));
28291 if (ret
>= 1 && ret
<= 32)
28297 /* If X is a vector of equal CONST_DOUBLE values and that value is
28298 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28301 aarch64_vec_fpconst_pow_of_2 (rtx x
)
28304 if (!CONST_VECTOR_P (x
)
28305 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
28308 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
28311 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
28315 for (int i
= 1; i
< nelts
; i
++)
28316 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
28322 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28325 __fp16 always promotes through this hook.
28326 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28327 through the generic excess precision logic rather than here. */
28330 aarch64_promoted_type (const_tree t
)
28332 if (SCALAR_FLOAT_TYPE_P (t
)
28333 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
28334 return float_type_node
;
28339 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28342 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
28343 optimization_type opt_type
)
28348 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
28355 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28357 static unsigned int
28358 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
28361 /* Polynomial invariant 1 == (VG / 2) - 1. */
28362 gcc_assert (i
== 1);
28365 return AARCH64_DWARF_VG
;
28368 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28369 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28372 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
28374 return ((mode
== HFmode
|| mode
== BFmode
)
28376 : default_libgcc_floating_mode_supported_p (mode
));
28379 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28380 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28383 aarch64_scalar_mode_supported_p (scalar_mode mode
)
28385 if (DECIMAL_FLOAT_MODE_P (mode
))
28386 return default_decimal_float_supported_p ();
28388 return ((mode
== HFmode
|| mode
== BFmode
)
28390 : default_scalar_mode_supported_p (mode
));
28393 /* Set the value of FLT_EVAL_METHOD.
28394 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28396 0: evaluate all operations and constants, whose semantic type has at
28397 most the range and precision of type float, to the range and
28398 precision of float; evaluate all other operations and constants to
28399 the range and precision of the semantic type;
28401 N, where _FloatN is a supported interchange floating type
28402 evaluate all operations and constants, whose semantic type has at
28403 most the range and precision of _FloatN type, to the range and
28404 precision of the _FloatN type; evaluate all other operations and
28405 constants to the range and precision of the semantic type;
28407 If we have the ARMv8.2-A extensions then we support _Float16 in native
28408 precision, so we should set this to 16. Otherwise, we support the type,
28409 but want to evaluate expressions in float precision, so set this to
28412 static enum flt_eval_method
28413 aarch64_excess_precision (enum excess_precision_type type
)
28417 case EXCESS_PRECISION_TYPE_FAST
:
28418 case EXCESS_PRECISION_TYPE_STANDARD
:
28419 /* We can calculate either in 16-bit range and precision or
28420 32-bit range and precision. Make that decision based on whether
28421 we have native support for the ARMv8.2-A 16-bit floating-point
28422 instructions or not. */
28423 return (TARGET_FP_F16INST
28424 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28425 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
28426 case EXCESS_PRECISION_TYPE_IMPLICIT
:
28427 case EXCESS_PRECISION_TYPE_FLOAT16
:
28428 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
28430 gcc_unreachable ();
28432 return FLT_EVAL_METHOD_UNPREDICTABLE
;
28435 /* Implement TARGET_C_BITINT_TYPE_INFO.
28436 Return true if _BitInt(N) is supported and fill its details into *INFO. */
28438 aarch64_bitint_type_info (int n
, struct bitint_info
*info
)
28440 if (TARGET_BIG_END
)
28444 info
->limb_mode
= QImode
;
28446 info
->limb_mode
= HImode
;
28448 info
->limb_mode
= SImode
;
28450 info
->limb_mode
= DImode
;
28452 info
->limb_mode
= TImode
;
28454 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28455 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28456 able to use libgcc's implementation to support large _BitInt's we need
28457 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28458 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28459 be TImode to ensure we are ABI compliant. */
28460 info
->limb_mode
= DImode
;
28463 info
->abi_limb_mode
= TImode
;
28465 info
->abi_limb_mode
= info
->limb_mode
;
28466 info
->big_endian
= TARGET_BIG_END
;
28467 info
->extended
= false;
28471 /* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. Return TFmode for
28472 TI_LONG_DOUBLE_TYPE which is for long double type, go with the default
28473 one for the others. */
28475 static machine_mode
28476 aarch64_c_mode_for_floating_type (enum tree_index ti
)
28478 if (ti
== TI_LONG_DOUBLE_TYPE
)
28480 return default_mode_for_floating_type (ti
);
28483 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28484 scheduled for speculative execution. Reject the long-running division
28485 and square-root instructions. */
28488 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
28490 switch (get_attr_type (insn
))
28498 case TYPE_NEON_FP_SQRT_S
:
28499 case TYPE_NEON_FP_SQRT_D
:
28500 case TYPE_NEON_FP_SQRT_S_Q
:
28501 case TYPE_NEON_FP_SQRT_D_Q
:
28502 case TYPE_NEON_FP_DIV_S
:
28503 case TYPE_NEON_FP_DIV_D
:
28504 case TYPE_NEON_FP_DIV_S_Q
:
28505 case TYPE_NEON_FP_DIV_D_Q
:
28512 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28515 aarch64_compute_pressure_classes (reg_class
*classes
)
28518 classes
[i
++] = GENERAL_REGS
;
28519 classes
[i
++] = FP_REGS
;
28520 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28521 registers need to go in PR_LO_REGS at some point during their
28522 lifetime. Splitting it into two halves has the effect of making
28523 all predicates count against PR_LO_REGS, so that we try whenever
28524 possible to restrict the number of live predicates to 8. This
28525 greatly reduces the amount of spilling in certain loops. */
28526 classes
[i
++] = PR_LO_REGS
;
28527 classes
[i
++] = PR_HI_REGS
;
28531 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28534 aarch64_can_change_mode_class (machine_mode from
,
28535 machine_mode to
, reg_class_t
)
28537 return aarch64_modes_compatible_p (from
, to
);
28540 /* Implement TARGET_EARLY_REMAT_MODES. */
28543 aarch64_select_early_remat_modes (sbitmap modes
)
28545 /* SVE values are not normally live across a call, so it should be
28546 worth doing early rematerialization even in VL-specific mode. */
28547 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
28548 if (aarch64_sve_mode_p ((machine_mode
) i
))
28549 bitmap_set_bit (modes
, i
);
28552 /* Override the default target speculation_safe_value. */
28554 aarch64_speculation_safe_value (machine_mode mode
,
28555 rtx result
, rtx val
, rtx failval
)
28557 /* Maybe we should warn if falling back to hard barriers. They are
28558 likely to be noticably more expensive than the alternative below. */
28559 if (!aarch64_track_speculation
)
28560 return default_speculation_safe_value (mode
, result
, val
, failval
);
28563 val
= copy_to_mode_reg (mode
, val
);
28565 if (!aarch64_reg_or_zero (failval
, mode
))
28566 failval
= copy_to_mode_reg (mode
, failval
);
28568 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
28572 /* Implement TARGET_ESTIMATED_POLY_VALUE.
28573 Look into the tuning structure for an estimate.
28574 KIND specifies the type of requested estimate: min, max or likely.
28575 For cores with a known SVE width all three estimates are the same.
28576 For generic SVE tuning we want to distinguish the maximum estimate from
28577 the minimum and likely ones.
28578 The likely estimate is the same as the minimum in that case to give a
28579 conservative behavior of auto-vectorizing with SVE when it is a win
28580 even for 128-bit SVE.
28581 When SVE width information is available VAL.coeffs[1] is multiplied by
28582 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
28584 static HOST_WIDE_INT
28585 aarch64_estimated_poly_value (poly_int64 val
,
28586 poly_value_estimate_kind kind
28587 = POLY_VALUE_LIKELY
)
28589 unsigned int width_source
= aarch64_tune_params
.sve_width
;
28591 /* If there is no core-specific information then the minimum and likely
28592 values are based on 128-bit vectors and the maximum is based on
28593 the architectural maximum of 2048 bits. */
28594 if (width_source
== SVE_SCALABLE
)
28597 case POLY_VALUE_MIN
:
28598 case POLY_VALUE_LIKELY
:
28599 return val
.coeffs
[0];
28600 case POLY_VALUE_MAX
:
28601 return val
.coeffs
[0] + val
.coeffs
[1] * 15;
28604 /* Allow sve_width to be a bitmask of different VL, treating the lowest
28605 as likely. This could be made more general if future -mtune options
28607 if (kind
== POLY_VALUE_MAX
)
28608 width_source
= 1 << floor_log2 (width_source
);
28610 width_source
= least_bit_hwi (width_source
);
28612 /* If the core provides width information, use that. */
28613 HOST_WIDE_INT over_128
= width_source
- 128;
28614 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
28618 /* Return true for types that could be supported as SIMD return or
28622 supported_simd_type (tree t
)
28624 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
28626 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
28627 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
28632 /* Determine the lane size for the clone argument/return type. This follows
28633 the LS(P) rule in the VFABIA64. */
28636 lane_size (cgraph_simd_clone_arg_type clone_arg_type
, tree type
)
28638 gcc_assert (clone_arg_type
!= SIMD_CLONE_ARG_TYPE_MASK
);
28640 /* For non map-to-vector types that are pointers we use the element type it
28642 if (POINTER_TYPE_P (type
))
28643 switch (clone_arg_type
)
28647 case SIMD_CLONE_ARG_TYPE_UNIFORM
:
28648 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP
:
28649 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP
:
28650 type
= TREE_TYPE (type
);
28654 /* For types (or pointers of non map-to-vector types point to) that are
28655 integers or floating point, we use their size if they are 1, 2, 4 or 8.
28657 if (INTEGRAL_TYPE_P (type
)
28658 || SCALAR_FLOAT_TYPE_P (type
))
28659 switch (TYPE_PRECISION (type
) / BITS_PER_UNIT
)
28667 return TYPE_PRECISION (type
);
28669 /* For any other we use the size of uintptr_t. For map-to-vector types that
28670 are pointers, using the size of uintptr_t is the same as using the size of
28671 their type, seeing all pointers are the same size as uintptr_t. */
28672 return POINTER_SIZE
;
28676 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28679 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
28680 struct cgraph_simd_clone
*clonei
,
28681 tree base_type ATTRIBUTE_UNUSED
,
28682 int num
, bool explicit_p
)
28685 unsigned int nds_elt_bits
;
28686 unsigned HOST_WIDE_INT const_simdlen
;
28691 /* For now, SVE simdclones won't produce illegal simdlen, So only check
28692 const simdlens here. */
28693 if (maybe_ne (clonei
->simdlen
, 0U)
28694 && clonei
->simdlen
.is_constant (&const_simdlen
)
28695 && (const_simdlen
< 2
28696 || const_simdlen
> 1024
28697 || (const_simdlen
& (const_simdlen
- 1)) != 0))
28700 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28701 "unsupported simdlen %wd", const_simdlen
);
28705 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
28706 /* According to AArch64's Vector ABI the type that determines the simdlen is
28707 the narrowest of types, so we ignore base_type for AArch64. */
28708 if (TREE_CODE (ret_type
) != VOID_TYPE
28709 && !supported_simd_type (ret_type
))
28713 else if (COMPLEX_FLOAT_TYPE_P (ret_type
))
28714 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28715 "GCC does not currently support return type %qT "
28716 "for simd", ret_type
);
28718 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28719 "unsupported return type %qT for simd",
28724 auto_vec
<std::pair
<tree
, unsigned int>> vec_elts (clonei
->nargs
+ 1);
28726 /* We are looking for the NDS type here according to the VFABIA64. */
28727 if (TREE_CODE (ret_type
) != VOID_TYPE
)
28729 nds_elt_bits
= lane_size (SIMD_CLONE_ARG_TYPE_VECTOR
, ret_type
);
28730 vec_elts
.safe_push (std::make_pair (ret_type
, nds_elt_bits
));
28733 nds_elt_bits
= POINTER_SIZE
;
28736 tree type_arg_types
= TYPE_ARG_TYPES (TREE_TYPE (node
->decl
));
28737 bool decl_arg_p
= (node
->definition
|| type_arg_types
== NULL_TREE
);
28738 for (t
= (decl_arg_p
? DECL_ARGUMENTS (node
->decl
) : type_arg_types
), i
= 0;
28739 t
&& t
!= void_list_node
; t
= TREE_CHAIN (t
), i
++)
28741 tree arg_type
= decl_arg_p
? TREE_TYPE (t
) : TREE_VALUE (t
);
28742 if (clonei
->args
[i
].arg_type
!= SIMD_CLONE_ARG_TYPE_UNIFORM
28743 && !supported_simd_type (arg_type
))
28747 else if (COMPLEX_FLOAT_TYPE_P (ret_type
))
28748 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28749 "GCC does not currently support argument type %qT "
28750 "for simd", arg_type
);
28752 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28753 "unsupported argument type %qT for simd",
28757 unsigned lane_bits
= lane_size (clonei
->args
[i
].arg_type
, arg_type
);
28758 if (clonei
->args
[i
].arg_type
== SIMD_CLONE_ARG_TYPE_VECTOR
)
28759 vec_elts
.safe_push (std::make_pair (arg_type
, lane_bits
));
28760 if (nds_elt_bits
> lane_bits
)
28761 nds_elt_bits
= lane_bits
;
28764 clonei
->vecsize_mangle
= 'n';
28765 clonei
->mask_mode
= VOIDmode
;
28766 poly_uint64 simdlen
;
28767 auto_vec
<poly_uint64
> simdlens (2);
28768 /* Keep track of the possible simdlens the clones of this function can have,
28769 and check them later to see if we support them. */
28770 if (known_eq (clonei
->simdlen
, 0U))
28772 simdlen
= exact_div (poly_uint64 (64), nds_elt_bits
);
28773 if (maybe_ne (simdlen
, 1U))
28774 simdlens
.safe_push (simdlen
);
28775 simdlens
.safe_push (simdlen
* 2);
28778 simdlens
.safe_push (clonei
->simdlen
);
28780 clonei
->vecsize_int
= 0;
28781 clonei
->vecsize_float
= 0;
28783 /* We currently do not support generating simdclones where vector arguments
28784 do not fit into a single vector register, i.e. vector types that are more
28785 than 128-bits large. This is because of how we currently represent such
28786 types in ACLE, where we use a struct to allow us to pass them as arguments
28788 Hence why we have to check whether the simdlens available for this
28789 simdclone would cause a vector type to be larger than 128-bits, and reject
28792 while (j
< simdlens
.length ())
28794 bool remove_simdlen
= false;
28795 for (auto elt
: vec_elts
)
28796 if (known_gt (simdlens
[j
] * elt
.second
, 128U))
28798 /* Don't issue a warning for every simdclone when there is no
28799 specific simdlen clause. */
28800 if (explicit_p
&& maybe_ne (clonei
->simdlen
, 0U))
28801 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28802 "GCC does not currently support simdlen %wd for "
28804 constant_lower_bound (simdlens
[j
]), elt
.first
);
28805 remove_simdlen
= true;
28808 if (remove_simdlen
)
28809 simdlens
.ordered_remove (j
);
28815 int count
= simdlens
.length ();
28818 if (explicit_p
&& known_eq (clonei
->simdlen
, 0U))
28820 /* Warn the user if we can't generate any simdclone. */
28821 simdlen
= exact_div (poly_uint64 (64), nds_elt_bits
);
28822 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28823 "GCC does not currently support a simdclone with simdlens"
28824 " %wd and %wd for these types.",
28825 constant_lower_bound (simdlen
),
28826 constant_lower_bound (simdlen
*2));
28831 gcc_assert (num
< count
);
28832 clonei
->simdlen
= simdlens
[num
];
28836 /* Implement TARGET_SIMD_CLONE_ADJUST. */
28839 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
28841 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
28842 use the correct ABI. */
28844 tree t
= TREE_TYPE (node
->decl
);
28845 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
28846 TYPE_ATTRIBUTES (t
));
28849 /* Implement TARGET_SIMD_CLONE_USABLE. */
28852 aarch64_simd_clone_usable (struct cgraph_node
*node
)
28854 switch (node
->simdclone
->vecsize_mangle
)
28861 gcc_unreachable ();
28865 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
28868 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
28870 auto check_attr
= [&](const char *ns
, const char *name
) {
28871 tree attr1
= lookup_attribute (ns
, name
, TYPE_ATTRIBUTES (type1
));
28872 tree attr2
= lookup_attribute (ns
, name
, TYPE_ATTRIBUTES (type2
));
28873 if (!attr1
&& !attr2
)
28876 return attr1
&& attr2
&& attribute_value_equal (attr1
, attr2
);
28879 if (!check_attr ("gnu", "aarch64_vector_pcs"))
28881 if (!check_attr ("gnu", "Advanced SIMD type"))
28883 if (!check_attr ("gnu", "SVE type"))
28885 if (!check_attr ("gnu", "SVE sizeless type"))
28887 if (!check_attr ("arm", "streaming"))
28889 if (!check_attr ("arm", "streaming_compatible"))
28891 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1
), "za")
28892 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2
), "za"))
28894 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1
), "zt0")
28895 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2
), "zt0"))
28900 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
28903 aarch64_merge_decl_attributes (tree olddecl
, tree newdecl
)
28905 tree old_attrs
= DECL_ATTRIBUTES (olddecl
);
28906 tree old_new
= lookup_attribute ("arm", "new", old_attrs
);
28908 tree new_attrs
= DECL_ATTRIBUTES (newdecl
);
28909 tree new_new
= lookup_attribute ("arm", "new", new_attrs
);
28911 if (DECL_INITIAL (olddecl
) && new_new
)
28913 error ("cannot apply attribute %qs to %q+D after the function"
28914 " has been defined", "new", newdecl
);
28915 inform (DECL_SOURCE_LOCATION (olddecl
), "%q+D defined here",
28920 if (old_new
&& new_new
)
28922 old_attrs
= remove_attribute ("arm", "new", old_attrs
);
28923 TREE_VALUE (new_new
) = chainon (TREE_VALUE (new_new
),
28924 TREE_VALUE (old_new
));
28927 aarch64_check_arm_new_against_type (TREE_VALUE (new_new
), newdecl
);
28930 return merge_attributes (old_attrs
, new_attrs
);
28933 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
28935 static const char *
28936 aarch64_get_multilib_abi_name (void)
28938 if (TARGET_BIG_END
)
28939 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
28940 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
28943 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
28944 global variable based guard use the default else
28945 return a null tree. */
28947 aarch64_stack_protect_guard (void)
28949 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
28950 return default_stack_protect_guard ();
28955 /* Return the diagnostic message string if the binary operation OP is
28956 not permitted on TYPE1 and TYPE2, NULL otherwise. */
28958 static const char *
28959 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED
, const_tree type1
,
28962 if (VECTOR_TYPE_P (type1
)
28963 && VECTOR_TYPE_P (type2
)
28964 && !TYPE_INDIVISIBLE_P (type1
)
28965 && !TYPE_INDIVISIBLE_P (type2
)
28966 && (aarch64_sve::builtin_type_p (type1
)
28967 != aarch64_sve::builtin_type_p (type2
)))
28968 return N_("cannot combine GNU and SVE vectors in a binary operation");
28970 /* Operation allowed. */
28974 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
28975 compiler that we automatically ignore the top byte of our pointers, which
28976 allows using -fsanitize=hwaddress. */
28978 aarch64_can_tag_addresses ()
28980 return !TARGET_ILP32
;
28983 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
28984 section at the end if needed. */
28985 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
28986 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
28987 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
28989 aarch64_file_end_indicate_exec_stack ()
28991 file_end_indicate_exec_stack ();
28993 unsigned feature_1_and
= 0;
28994 if (aarch_bti_enabled ())
28995 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
28997 if (aarch_ra_sign_scope
!= AARCH_FUNCTION_NONE
)
28998 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
29002 /* Generate .note.gnu.property section. */
29003 switch_to_section (get_section (".note.gnu.property",
29004 SECTION_NOTYPE
, NULL
));
29006 /* PT_NOTE header: namesz, descsz, type.
29007 namesz = 4 ("GNU\0")
29008 descsz = 16 (Size of the program property array)
29009 [(12 + padding) * Number of array elements]
29010 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
29011 assemble_align (POINTER_SIZE
);
29012 assemble_integer (GEN_INT (4), 4, 32, 1);
29013 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
29014 assemble_integer (GEN_INT (5), 4, 32, 1);
29016 /* PT_NOTE name. */
29017 assemble_string ("GNU", 4);
29019 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
29020 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
29022 data = feature_1_and. */
29023 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
29024 assemble_integer (GEN_INT (4), 4, 32, 1);
29025 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
29027 /* Pad the size of the note to the required alignment. */
29028 assemble_align (POINTER_SIZE
);
29031 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
29032 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
29033 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
29035 /* Helper function for straight line speculation.
29036 Return what barrier should be emitted for straight line speculation
29038 When not mitigating against straight line speculation this function returns
29040 When mitigating against straight line speculation, use:
29041 * SB when the v8.5-A SB extension is enabled.
29042 * DSB+ISB otherwise. */
29044 aarch64_sls_barrier (int mitigation_required
)
29046 return mitigation_required
29047 ? (TARGET_SB
? "sb" : "dsb\tsy\n\tisb")
29051 static GTY (()) tree aarch64_sls_shared_thunks
[30];
29052 static GTY (()) bool aarch64_sls_shared_thunks_needed
= false;
29053 const char *indirect_symbol_names
[30] = {
29054 "__call_indirect_x0",
29055 "__call_indirect_x1",
29056 "__call_indirect_x2",
29057 "__call_indirect_x3",
29058 "__call_indirect_x4",
29059 "__call_indirect_x5",
29060 "__call_indirect_x6",
29061 "__call_indirect_x7",
29062 "__call_indirect_x8",
29063 "__call_indirect_x9",
29064 "__call_indirect_x10",
29065 "__call_indirect_x11",
29066 "__call_indirect_x12",
29067 "__call_indirect_x13",
29068 "__call_indirect_x14",
29069 "__call_indirect_x15",
29070 "", /* "__call_indirect_x16", */
29071 "", /* "__call_indirect_x17", */
29072 "__call_indirect_x18",
29073 "__call_indirect_x19",
29074 "__call_indirect_x20",
29075 "__call_indirect_x21",
29076 "__call_indirect_x22",
29077 "__call_indirect_x23",
29078 "__call_indirect_x24",
29079 "__call_indirect_x25",
29080 "__call_indirect_x26",
29081 "__call_indirect_x27",
29082 "__call_indirect_x28",
29083 "__call_indirect_x29",
29086 /* Function to create a BLR thunk. This thunk is used to mitigate straight
29087 line speculation. Instead of a simple BLR that can be speculated past,
29088 we emit a BL to this thunk, and this thunk contains a BR to the relevant
29089 register. These thunks have the relevant speculation barries put after
29090 their indirect branch so that speculation is blocked.
29092 We use such a thunk so the speculation barriers are kept off the
29093 architecturally executed path in order to reduce the performance overhead.
29095 When optimizing for size we use stubs shared by the linked object.
29096 When optimizing for performance we emit stubs for each function in the hope
29097 that the branch predictor can better train on jumps specific for a given
29100 aarch64_sls_create_blr_label (int regnum
)
29102 gcc_assert (STUB_REGNUM_P (regnum
));
29103 if (optimize_function_for_size_p (cfun
))
29105 /* For the thunks shared between different functions in this compilation
29106 unit we use a named symbol -- this is just for users to more easily
29107 understand the generated assembly. */
29108 aarch64_sls_shared_thunks_needed
= true;
29109 const char *thunk_name
= indirect_symbol_names
[regnum
];
29110 if (aarch64_sls_shared_thunks
[regnum
] == NULL
)
29112 /* Build a decl representing this function stub and record it for
29113 later. We build a decl here so we can use the GCC machinery for
29114 handling sections automatically (through `get_named_section` and
29115 `make_decl_one_only`). That saves us a lot of trouble handling
29116 the specifics of different output file formats. */
29117 tree decl
= build_decl (BUILTINS_LOCATION
, FUNCTION_DECL
,
29118 get_identifier (thunk_name
),
29119 build_function_type_list (void_type_node
,
29121 DECL_RESULT (decl
) = build_decl (BUILTINS_LOCATION
, RESULT_DECL
,
29122 NULL_TREE
, void_type_node
);
29123 TREE_PUBLIC (decl
) = 1;
29124 TREE_STATIC (decl
) = 1;
29125 DECL_IGNORED_P (decl
) = 1;
29126 DECL_ARTIFICIAL (decl
) = 1;
29127 make_decl_one_only (decl
, DECL_ASSEMBLER_NAME (decl
));
29128 resolve_unique_section (decl
, 0, false);
29129 aarch64_sls_shared_thunks
[regnum
] = decl
;
29132 return gen_rtx_SYMBOL_REF (Pmode
, thunk_name
);
29135 if (cfun
->machine
->call_via
[regnum
] == NULL
)
29136 cfun
->machine
->call_via
[regnum
]
29137 = gen_rtx_LABEL_REF (Pmode
, gen_label_rtx ());
29138 return cfun
->machine
->call_via
[regnum
];
29141 /* Helper function for aarch64_sls_emit_blr_function_thunks and
29142 aarch64_sls_emit_shared_blr_thunks below. */
29144 aarch64_sls_emit_function_stub (FILE *out_file
, int regnum
)
29146 /* Save in x16 and branch to that function so this transformation does
29147 not prevent jumping to `BTI c` instructions. */
29148 asm_fprintf (out_file
, "\tmov\tx16, x%d\n", regnum
);
29149 asm_fprintf (out_file
, "\tbr\tx16\n");
29152 /* Emit all BLR stubs for this particular function.
29153 Here we emit all the BLR stubs needed for the current function. Since we
29154 emit these stubs in a consecutive block we know there will be no speculation
29155 gadgets between each stub, and hence we only emit a speculation barrier at
29156 the end of the stub sequences.
29158 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29160 aarch64_sls_emit_blr_function_thunks (FILE *out_file
)
29162 if (! aarch64_harden_sls_blr_p ())
29165 bool any_functions_emitted
= false;
29166 /* We must save and restore the current function section since this assembly
29167 is emitted at the end of the function. This means it can be emitted *just
29168 after* the cold section of a function. That cold part would be emitted in
29169 a different section. That switch would trigger a `.cfi_endproc` directive
29170 to be emitted in the original section and a `.cfi_startproc` directive to
29171 be emitted in the new section. Switching to the original section without
29172 restoring would mean that the `.cfi_endproc` emitted as a function ends
29173 would happen in a different section -- leaving an unmatched
29174 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29175 in the standard text section. */
29176 section
*save_text_section
= in_section
;
29177 switch_to_section (function_section (current_function_decl
));
29178 for (int regnum
= 0; regnum
< 30; ++regnum
)
29180 rtx specu_label
= cfun
->machine
->call_via
[regnum
];
29181 if (specu_label
== NULL
)
29184 targetm
.asm_out
.print_operand (out_file
, specu_label
, 0);
29185 asm_fprintf (out_file
, ":\n");
29186 aarch64_sls_emit_function_stub (out_file
, regnum
);
29187 any_functions_emitted
= true;
29189 if (any_functions_emitted
)
29190 /* Can use the SB if needs be here, since this stub will only be used
29191 by the current function, and hence for the current target. */
29192 asm_fprintf (out_file
, "\t%s\n", aarch64_sls_barrier (true));
29193 switch_to_section (save_text_section
);
29196 /* Emit shared BLR stubs for the current compilation unit.
29197 Over the course of compiling this unit we may have converted some BLR
29198 instructions to a BL to a shared stub function. This is where we emit those
29200 This function is for the stubs shared between different functions in this
29201 compilation unit. We share when optimizing for size instead of speed.
29203 This function is called through the TARGET_ASM_FILE_END hook. */
29205 aarch64_sls_emit_shared_blr_thunks (FILE *out_file
)
29207 if (! aarch64_sls_shared_thunks_needed
)
29210 for (int regnum
= 0; regnum
< 30; ++regnum
)
29212 tree decl
= aarch64_sls_shared_thunks
[regnum
];
29216 const char *name
= indirect_symbol_names
[regnum
];
29217 switch_to_section (get_named_section (decl
, NULL
, 0));
29218 ASM_OUTPUT_ALIGN (out_file
, 2);
29219 targetm
.asm_out
.globalize_label (out_file
, name
);
29220 /* Only emits if the compiler is configured for an assembler that can
29221 handle visibility directives. */
29222 targetm
.asm_out
.assemble_visibility (decl
, VISIBILITY_HIDDEN
);
29223 ASM_OUTPUT_TYPE_DIRECTIVE (out_file
, name
, "function");
29224 ASM_OUTPUT_LABEL (out_file
, name
);
29225 aarch64_sls_emit_function_stub (out_file
, regnum
);
29226 /* Use the most conservative target to ensure it can always be used by any
29227 function in the translation unit. */
29228 asm_fprintf (out_file
, "\tdsb\tsy\n\tisb\n");
29229 ASM_DECLARE_FUNCTION_SIZE (out_file
, name
, decl
);
29233 /* Implement TARGET_ASM_FILE_END. */
29235 aarch64_asm_file_end ()
29237 aarch64_sls_emit_shared_blr_thunks (asm_out_file
);
29238 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29239 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29240 for FreeBSD) still gets called. */
29241 #ifdef TARGET_ASM_FILE_END
29242 TARGET_ASM_FILE_END ();
29247 aarch64_indirect_call_asm (rtx addr
)
29249 gcc_assert (REG_P (addr
));
29250 if (aarch64_harden_sls_blr_p ())
29252 rtx stub_label
= aarch64_sls_create_blr_label (REGNO (addr
));
29253 output_asm_insn ("bl\t%0", &stub_label
);
29256 output_asm_insn ("blr\t%0", &addr
);
29260 /* Emit the assembly instruction to load the thread pointer into DEST.
29261 Select between different tpidr_elN registers depending on -mtp= setting. */
29264 aarch64_output_load_tp (rtx dest
)
29266 const char *tpidrs
[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29267 "tpidr_el3", "tpidrro_el0"};
29269 snprintf (buffer
, sizeof (buffer
), "mrs\t%%0, %s",
29270 tpidrs
[aarch64_tpidr_register
]);
29271 output_asm_insn (buffer
, &dest
);
29275 /* Set up the value of REG_ALLOC_ORDER from scratch.
29277 It was previously good practice to put call-clobbered registers ahead
29278 of call-preserved registers, but that isn't necessary these days.
29279 IRA's model of register save/restore costs is much more sophisticated
29280 than the model that a simple ordering could provide. We leave
29281 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29284 However, it is still useful to list registers that are members of
29285 multiple classes after registers that are members of fewer classes.
29286 For example, we have:
29288 - FP_LO8_REGS: v0-v7
29289 - FP_LO_REGS: v0-v15
29292 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29293 we run the risk of starving other (lower-priority) pseudos that
29294 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29295 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29296 Allocating downwards rather than upwards avoids this problem, at least
29297 in code that has reasonable register pressure.
29299 The situation for predicate registers is similar. */
29302 aarch64_adjust_reg_alloc_order ()
29304 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; ++i
)
29305 if (IN_RANGE (i
, V0_REGNUM
, V31_REGNUM
))
29306 reg_alloc_order
[i
] = V31_REGNUM
- (i
- V0_REGNUM
);
29307 else if (IN_RANGE (i
, P0_REGNUM
, P15_REGNUM
))
29308 reg_alloc_order
[i
] = P15_REGNUM
- (i
- P0_REGNUM
);
29310 reg_alloc_order
[i
] = i
;
29313 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29314 of vector mode MODE to select half the elements of that vector.
29315 Allow any combination of indices except duplicates (or out of range of
29316 the mode units). */
29319 aarch64_parallel_select_half_p (machine_mode mode
, rtx par
)
29321 int nunits
= XVECLEN (par
, 0);
29322 if (!known_eq (GET_MODE_NUNITS (mode
), nunits
* 2))
29324 int mode_nunits
= nunits
* 2;
29325 /* Put all the elements of PAR into a hash_set and use its
29326 uniqueness guarantees to check that we don't try to insert the same
29328 hash_set
<rtx
> parset
;
29329 for (int i
= 0; i
< nunits
; ++i
)
29331 rtx elt
= XVECEXP (par
, 0, i
);
29332 if (!CONST_INT_P (elt
)
29333 || !IN_RANGE (INTVAL (elt
), 0, mode_nunits
- 1)
29334 || parset
.add (elt
))
29340 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29341 contain any common elements. */
29344 aarch64_pars_overlap_p (rtx par1
, rtx par2
)
29346 int len1
= XVECLEN (par1
, 0);
29347 int len2
= XVECLEN (par2
, 0);
29348 hash_set
<rtx
> parset
;
29349 for (int i
= 0; i
< len1
; ++i
)
29350 parset
.add (XVECEXP (par1
, 0, i
));
29351 for (int i
= 0; i
< len2
; ++i
)
29352 if (parset
.contains (XVECEXP (par2
, 0, i
)))
29357 /* Implement OPTIMIZE_MODE_SWITCHING. */
29360 aarch64_optimize_mode_switching (aarch64_mode_entity entity
)
29362 bool have_sme_state
= (aarch64_cfun_incoming_pstate_za () != 0
29363 || (aarch64_cfun_has_new_state ("za")
29364 && df_regs_ever_live_p (ZA_REGNUM
))
29365 || (aarch64_cfun_has_new_state ("zt0")
29366 && df_regs_ever_live_p (ZT0_REGNUM
)));
29368 if (have_sme_state
&& nonlocal_goto_handler_labels
)
29370 static bool reported
;
29373 sorry ("non-local gotos in functions with SME state");
29380 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29381 case aarch64_mode_entity::LOCAL_SME_STATE
:
29382 return have_sme_state
&& !nonlocal_goto_handler_labels
;
29384 gcc_unreachable ();
29387 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29390 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode
,
29391 aarch64_tristate_mode prev_mode
)
29393 if (mode
== aarch64_tristate_mode::YES
)
29395 gcc_assert (prev_mode
== aarch64_tristate_mode::NO
);
29396 aarch64_init_tpidr2_block ();
29399 gcc_unreachable ();
29402 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29405 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode
,
29406 aarch64_local_sme_state prev_mode
)
29408 /* Back-propagation should ensure that we're always starting from
29410 gcc_assert (prev_mode
!= aarch64_local_sme_state::ANY
);
29412 if (prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
)
29414 /* Commit any uncommitted lazy save. This leaves ZA either active
29415 and zero (lazy save case) or off (normal case).
29419 mrs <temp>, tpidr2_el0
29420 cbz <temp>, no_save
29421 bl __arm_tpidr2_save
29422 msr tpidr2_el0, xzr
29423 zero { za } // Only if ZA is live
29424 zero { zt0 } // Only if ZT0 is live
29426 auto tmp_reg
= gen_reg_rtx (DImode
);
29427 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg
));
29428 auto label
= gen_label_rtx ();
29429 rtx branch
= aarch64_gen_compare_zero_and_branch (EQ
, tmp_reg
, label
);
29430 auto jump
= emit_jump_insn (branch
);
29431 JUMP_LABEL (jump
) = label
;
29432 emit_insn (gen_aarch64_tpidr2_save ());
29433 emit_insn (gen_aarch64_clear_tpidr2 ());
29434 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29435 || mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29437 if (aarch64_cfun_has_state ("za"))
29438 emit_insn (gen_aarch64_initial_zero_za ());
29439 if (aarch64_cfun_has_state ("zt0"))
29440 emit_insn (gen_aarch64_sme_zero_zt0 ());
29442 emit_label (label
);
29445 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29446 || mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29448 if (prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29450 /* Make ZA active after being inactive.
29452 First handle the case in which the lazy save we set up was
29453 committed by a callee. If the function's source-level ZA state
29454 is live then we must conditionally restore it from the lazy
29455 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29456 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
)
29457 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29459 emit_insn (gen_aarch64_smstart_za ());
29461 /* Now handle the case in which the lazy save was not committed.
29462 In that case, ZA still contains the current function's ZA state,
29463 and we just need to cancel the lazy save. */
29464 emit_insn (gen_aarch64_clear_tpidr2 ());
29466 /* Restore the ZT0 state, if we have some. */
29467 if (aarch64_cfun_has_state ("zt0"))
29468 aarch64_restore_zt0 (true);
29473 if (prev_mode
== aarch64_local_sme_state::SAVED_LOCAL
)
29475 /* Retrieve the current function's ZA state from the lazy save
29477 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29479 /* Restore the ZT0 state, if we have some. */
29480 if (aarch64_cfun_has_state ("zt0"))
29481 aarch64_restore_zt0 (true);
29485 if (prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
29486 || prev_mode
== aarch64_local_sme_state::OFF
)
29488 /* INACTIVE_CALLER means that we are enabling ZA for the first
29489 time in this function. The code above means that ZA is either
29490 active and zero (if we committed a lazy save) or off. Handle
29491 the latter case by forcing ZA on.
29493 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29496 Both cases leave ZA zeroed. */
29497 emit_insn (gen_aarch64_smstart_za ());
29499 /* Restore the ZT0 state, if we have some. */
29500 if (prev_mode
== aarch64_local_sme_state::OFF
29501 && aarch64_cfun_has_state ("zt0"))
29502 aarch64_restore_zt0 (true);
29506 if (prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
29507 || prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
)
29508 /* A simple change in liveness, such as in a CFG structure where
29509 ZA is only conditionally defined. No code is needed. */
29512 gcc_unreachable ();
29515 if (mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29517 if (prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
29518 || prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
29519 || prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
)
29521 /* Save the ZT0 state, if we have some. */
29522 if (aarch64_cfun_has_state ("zt0"))
29523 aarch64_save_zt0 ();
29525 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29526 case of setting up a lazy save buffer before a call.
29527 A transition from INACTIVE_CALLER is similar, except that
29528 the contents of ZA are known to be zero.
29530 A transition from ACTIVE_DEAD means that ZA is live at the
29531 point of the transition, but is dead on at least one incoming
29532 edge. (That is, ZA is only conditionally initialized.)
29533 For efficiency, we want to set up a lazy save even for
29534 dead contents, since forcing ZA off would make later code
29535 restore ZA from the lazy save buffer. */
29536 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29540 if (prev_mode
== aarch64_local_sme_state::SAVED_LOCAL
29541 || prev_mode
== aarch64_local_sme_state::OFF
)
29542 /* We're simply discarding the information about which inactive
29546 gcc_unreachable ();
29549 if (mode
== aarch64_local_sme_state::INACTIVE_CALLER
29550 || mode
== aarch64_local_sme_state::OFF
)
29552 /* Save the ZT0 state, if we have some. */
29553 if ((prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
29554 || prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29555 && mode
== aarch64_local_sme_state::OFF
29556 && aarch64_cfun_has_state ("zt0"))
29557 aarch64_save_zt0 ();
29559 /* The transition to INACTIVE_CALLER is used before returning from
29560 new("za") functions. Any state in ZA belongs to the current
29561 function rather than a caller, but that state is no longer
29562 needed. Clear any pending lazy save and turn ZA off.
29564 The transition to OFF is used before calling a private-ZA function.
29565 We committed any incoming lazy save above, so at this point any
29566 contents in ZA belong to the current function. */
29567 if (prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29568 emit_insn (gen_aarch64_clear_tpidr2 ());
29570 if (prev_mode
!= aarch64_local_sme_state::OFF
29571 && prev_mode
!= aarch64_local_sme_state::SAVED_LOCAL
)
29572 emit_insn (gen_aarch64_smstop_za ());
29577 if (mode
== aarch64_local_sme_state::SAVED_LOCAL
)
29579 /* This is a transition to an exception handler. */
29580 gcc_assert (prev_mode
== aarch64_local_sme_state::OFF
29581 || prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
);
29585 gcc_unreachable ();
29588 /* Implement TARGET_MODE_EMIT. */
29591 aarch64_mode_emit (int entity
, int mode
, int prev_mode
, HARD_REG_SET live
)
29593 if (mode
== prev_mode
)
29597 switch (aarch64_mode_entity (entity
))
29599 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29600 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode
),
29601 aarch64_tristate_mode (prev_mode
));
29604 case aarch64_mode_entity::LOCAL_SME_STATE
:
29605 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode
),
29606 aarch64_local_sme_state (prev_mode
));
29609 rtx_insn
*seq
= get_insns ();
29612 /* Get the set of clobbered registers that are currently live. */
29613 HARD_REG_SET clobbers
= {};
29614 for (rtx_insn
*insn
= seq
; insn
; insn
= NEXT_INSN (insn
))
29616 if (!NONDEBUG_INSN_P (insn
))
29618 vec_rtx_properties properties
;
29619 properties
.add_insn (insn
, false);
29620 for (rtx_obj_reference ref
: properties
.refs ())
29621 if (ref
.is_write () && HARD_REGISTER_NUM_P (ref
.regno
))
29622 SET_HARD_REG_BIT (clobbers
, ref
.regno
);
29626 /* Emit instructions to save clobbered registers to pseudos. Queue
29627 instructions to restore the registers afterwards.
29629 This should only needed in rare situations. */
29630 auto_vec
<rtx
, 33> after
;
29631 for (unsigned int regno
= R0_REGNUM
; regno
< R30_REGNUM
; ++regno
)
29632 if (TEST_HARD_REG_BIT (clobbers
, regno
))
29634 rtx hard_reg
= gen_rtx_REG (DImode
, regno
);
29635 rtx pseudo_reg
= gen_reg_rtx (DImode
);
29636 emit_move_insn (pseudo_reg
, hard_reg
);
29637 after
.quick_push (gen_move_insn (hard_reg
, pseudo_reg
));
29639 if (TEST_HARD_REG_BIT (clobbers
, CC_REGNUM
))
29641 rtx pseudo_reg
= gen_reg_rtx (DImode
);
29642 emit_insn (gen_aarch64_save_nzcv (pseudo_reg
));
29643 after
.quick_push (gen_aarch64_restore_nzcv (pseudo_reg
));
29646 /* Emit the transition instructions themselves. */
29649 /* Restore the clobbered registers. */
29650 for (auto *insn
: after
)
29654 /* Return true if INSN references the SME state represented by hard register
29658 aarch64_insn_references_sme_state_p (rtx_insn
*insn
, unsigned int regno
)
29661 FOR_EACH_INSN_DEF (ref
, insn
)
29662 if (!DF_REF_FLAGS_IS_SET (ref
, DF_REF_MUST_CLOBBER
)
29663 && DF_REF_REGNO (ref
) == regno
)
29665 FOR_EACH_INSN_USE (ref
, insn
)
29666 if (DF_REF_REGNO (ref
) == regno
)
29671 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29673 static aarch64_local_sme_state
29674 aarch64_mode_needed_local_sme_state (rtx_insn
*insn
, HARD_REG_SET live
)
29677 && find_reg_note (insn
, REG_EH_REGION
, NULL_RTX
))
29679 static bool reported
;
29682 sorry ("catching non-call exceptions in functions with SME state");
29685 /* Aim for graceful error recovery by picking the value that is
29686 least likely to generate an ICE. */
29687 return aarch64_local_sme_state::INACTIVE_LOCAL
;
29690 /* A non-local goto is equivalent to a return. We disallow non-local
29691 receivers in functions with SME state, so we know that the target
29692 expects ZA to be dormant or off. */
29694 && find_reg_note (insn
, REG_NON_LOCAL_GOTO
, NULL_RTX
))
29695 return aarch64_local_sme_state::INACTIVE_CALLER
;
29697 /* start_private_za_call and end_private_za_call bracket a sequence
29698 that calls a private-ZA function. Force ZA to be turned off if the
29699 function doesn't have any live ZA state, otherwise require ZA to be
29701 auto icode
= recog_memoized (insn
);
29702 if (icode
== CODE_FOR_aarch64_start_private_za_call
29703 || icode
== CODE_FOR_aarch64_end_private_za_call
)
29704 return (TEST_HARD_REG_BIT (live
, ZA_REGNUM
)
29705 ? aarch64_local_sme_state::INACTIVE_LOCAL
29706 : aarch64_local_sme_state::OFF
);
29708 /* Force ZA to contain the current function's ZA state if INSN wants
29709 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29710 are both controlled by PSTATE.ZA. */
29711 if (aarch64_insn_references_sme_state_p (insn
, ZA_REGNUM
)
29712 || aarch64_insn_references_sme_state_p (insn
, ZT0_REGNUM
))
29713 return (TEST_HARD_REG_BIT (live
, ZA_REGNUM
)
29714 ? aarch64_local_sme_state::ACTIVE_LIVE
29715 : aarch64_local_sme_state::ACTIVE_DEAD
);
29717 return aarch64_local_sme_state::ANY
;
29720 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29722 static aarch64_tristate_mode
29723 aarch64_mode_needed_za_save_buffer (rtx_insn
*insn
, HARD_REG_SET live
)
29725 /* We need to set up a lazy save buffer no later than the first
29726 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29727 if (aarch64_mode_needed_local_sme_state (insn
, live
)
29728 == aarch64_local_sme_state::INACTIVE_LOCAL
)
29729 return aarch64_tristate_mode::YES
;
29731 /* Also make sure that the lazy save buffer is set up before the first
29732 insn that throws internally. The exception handler will sometimes
29734 if (find_reg_note (insn
, REG_EH_REGION
, NULL_RTX
))
29735 return aarch64_tristate_mode::YES
;
29737 return aarch64_tristate_mode::MAYBE
;
29740 /* Implement TARGET_MODE_NEEDED. */
29743 aarch64_mode_needed (int entity
, rtx_insn
*insn
, HARD_REG_SET live
)
29745 switch (aarch64_mode_entity (entity
))
29747 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29748 return int (aarch64_mode_needed_za_save_buffer (insn
, live
));
29750 case aarch64_mode_entity::LOCAL_SME_STATE
:
29751 return int (aarch64_mode_needed_local_sme_state (insn
, live
));
29753 gcc_unreachable ();
29756 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29758 static aarch64_local_sme_state
29759 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode
,
29762 /* Note places where ZA dies, so that we can try to avoid saving and
29763 restoring state that isn't needed. */
29764 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29765 && !TEST_HARD_REG_BIT (live
, ZA_REGNUM
))
29766 return aarch64_local_sme_state::ACTIVE_DEAD
;
29768 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29770 if (mode
== aarch64_local_sme_state::ACTIVE_DEAD
29771 && TEST_HARD_REG_BIT (live
, ZA_REGNUM
))
29772 return aarch64_local_sme_state::ACTIVE_LIVE
;
29777 /* Implement TARGET_MODE_AFTER. */
29780 aarch64_mode_after (int entity
, int mode
, rtx_insn
*, HARD_REG_SET live
)
29782 switch (aarch64_mode_entity (entity
))
29784 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29787 case aarch64_mode_entity::LOCAL_SME_STATE
:
29788 return int (aarch64_mode_after_local_sme_state
29789 (aarch64_local_sme_state (mode
), live
));
29791 gcc_unreachable ();
29794 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
29796 static aarch64_local_sme_state
29797 aarch64_local_sme_confluence (aarch64_local_sme_state mode1
,
29798 aarch64_local_sme_state mode2
)
29800 /* Perform a symmetrical check for two values. */
29801 auto is_pair
= [&](aarch64_local_sme_state val1
,
29802 aarch64_local_sme_state val2
)
29804 return ((mode1
== val1
&& mode2
== val2
)
29805 || (mode1
== val2
&& mode2
== val1
));
29808 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
29809 to a caller. OFF is one of the options. */
29810 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER
,
29811 aarch64_local_sme_state::OFF
))
29812 return aarch64_local_sme_state::INACTIVE_CALLER
;
29814 /* Similarly for dormant contents belonging to the current function. */
29815 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL
,
29816 aarch64_local_sme_state::OFF
))
29817 return aarch64_local_sme_state::INACTIVE_LOCAL
;
29819 /* Treat a conditionally-initialized value as a fully-initialized value. */
29820 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE
,
29821 aarch64_local_sme_state::ACTIVE_DEAD
))
29822 return aarch64_local_sme_state::ACTIVE_LIVE
;
29824 return aarch64_local_sme_state::ANY
;
29827 /* Implement TARGET_MODE_CONFLUENCE. */
29830 aarch64_mode_confluence (int entity
, int mode1
, int mode2
)
29832 gcc_assert (mode1
!= mode2
);
29833 switch (aarch64_mode_entity (entity
))
29835 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29836 return int (aarch64_tristate_mode::MAYBE
);
29838 case aarch64_mode_entity::LOCAL_SME_STATE
:
29839 return int (aarch64_local_sme_confluence
29840 (aarch64_local_sme_state (mode1
),
29841 aarch64_local_sme_state (mode2
)));
29843 gcc_unreachable ();
29846 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
29847 NO throughput, or makes one transition from NO to YES. */
29849 static aarch64_tristate_mode
29850 aarch64_one_shot_backprop (aarch64_tristate_mode mode1
,
29851 aarch64_tristate_mode mode2
)
29853 /* Keep bringing the transition forward until it starts from NO. */
29854 if (mode1
== aarch64_tristate_mode::MAYBE
29855 && mode2
== aarch64_tristate_mode::YES
)
29858 return aarch64_tristate_mode::MAYBE
;
29861 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
29863 static aarch64_local_sme_state
29864 aarch64_local_sme_backprop (aarch64_local_sme_state mode1
,
29865 aarch64_local_sme_state mode2
)
29867 /* We always need to know what the current state is when transitioning
29868 to a new state. Force any location with indeterminate starting state
29870 if (mode1
== aarch64_local_sme_state::ANY
)
29873 case aarch64_local_sme_state::INACTIVE_CALLER
:
29874 case aarch64_local_sme_state::OFF
:
29875 case aarch64_local_sme_state::ACTIVE_DEAD
:
29876 /* The current function's ZA state is not live. */
29877 return aarch64_local_sme_state::ACTIVE_DEAD
;
29879 case aarch64_local_sme_state::INACTIVE_LOCAL
:
29880 case aarch64_local_sme_state::ACTIVE_LIVE
:
29881 /* The current function's ZA state is live. */
29882 return aarch64_local_sme_state::ACTIVE_LIVE
;
29884 case aarch64_local_sme_state::SAVED_LOCAL
:
29885 /* This is a transition to an exception handler. Since we don't
29886 support non-call exceptions for SME functions, the source of
29887 the transition must be known. We'll assert later if that's
29889 return aarch64_local_sme_state::ANY
;
29891 case aarch64_local_sme_state::ANY
:
29892 return aarch64_local_sme_state::ANY
;
29895 return aarch64_local_sme_state::ANY
;
29898 /* Implement TARGET_MODE_BACKPROP. */
29901 aarch64_mode_backprop (int entity
, int mode1
, int mode2
)
29903 switch (aarch64_mode_entity (entity
))
29905 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29906 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1
),
29907 aarch64_tristate_mode (mode2
)));
29909 case aarch64_mode_entity::LOCAL_SME_STATE
:
29910 return int (aarch64_local_sme_backprop
29911 (aarch64_local_sme_state (mode1
),
29912 aarch64_local_sme_state (mode2
)));
29914 gcc_unreachable ();
29917 /* Implement TARGET_MODE_ENTRY. */
29920 aarch64_mode_entry (int entity
)
29922 switch (aarch64_mode_entity (entity
))
29924 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29925 return int (aarch64_tristate_mode::NO
);
29927 case aarch64_mode_entity::LOCAL_SME_STATE
:
29928 return int (aarch64_cfun_shared_flags ("za") != 0
29929 ? aarch64_local_sme_state::ACTIVE_LIVE
29930 : aarch64_cfun_incoming_pstate_za () != 0
29931 ? aarch64_local_sme_state::ACTIVE_DEAD
29932 : aarch64_local_sme_state::INACTIVE_CALLER
);
29934 gcc_unreachable ();
29937 /* Implement TARGET_MODE_EXIT. */
29940 aarch64_mode_exit (int entity
)
29942 switch (aarch64_mode_entity (entity
))
29944 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29945 return int (aarch64_tristate_mode::MAYBE
);
29947 case aarch64_mode_entity::LOCAL_SME_STATE
:
29948 return int (aarch64_cfun_shared_flags ("za") != 0
29949 ? aarch64_local_sme_state::ACTIVE_LIVE
29950 : aarch64_cfun_incoming_pstate_za () != 0
29951 ? aarch64_local_sme_state::ACTIVE_DEAD
29952 : aarch64_local_sme_state::INACTIVE_CALLER
);
29954 gcc_unreachable ();
29957 /* Implement TARGET_MODE_EH_HANDLER. */
29960 aarch64_mode_eh_handler (int entity
)
29962 switch (aarch64_mode_entity (entity
))
29964 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29965 /* Require a lazy save buffer to be allocated before the first
29966 insn that can throw. */
29967 return int (aarch64_tristate_mode::YES
);
29969 case aarch64_mode_entity::LOCAL_SME_STATE
:
29970 return int (aarch64_local_sme_state::SAVED_LOCAL
);
29972 gcc_unreachable ();
/* Implement TARGET_MODE_PRIORITY.  */

static int
aarch64_mode_priority (int, int n)
{
  return n;
}
29983 /* Implement TARGET_MD_ASM_ADJUST. */
29986 aarch64_md_asm_adjust (vec
<rtx
> &outputs
, vec
<rtx
> &inputs
,
29987 vec
<machine_mode
> &input_modes
,
29988 vec
<const char *> &constraints
,
29989 vec
<rtx
> &uses
, vec
<rtx
> &clobbers
,
29990 HARD_REG_SET
&clobbered_regs
, location_t loc
)
29992 rtx_insn
*seq
= arm_md_asm_adjust (outputs
, inputs
, input_modes
, constraints
,
29993 uses
, clobbers
, clobbered_regs
, loc
);
29995 /* "za" in the clobber list of a function with ZA state is defined to
29996 mean that the asm can read from and write to ZA. We can model the
29997 read using a USE, but unfortunately, it's not possible to model the
29998 write directly. Use a separate insn to model the effect.
30000 We must ensure that ZA is active on entry, which is enforced by using
30001 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
30003 The same thing applies to ZT0. */
30005 for (unsigned int i
= clobbers
.length (); i
-- > 0; )
30007 rtx x
= clobbers
[i
];
30009 && (REGNO (x
) == ZA_REGNUM
|| REGNO (x
) == ZT0_REGNUM
))
30011 auto id
= cfun
->machine
->next_asm_update_za_id
++;
30016 rtx id_rtx
= gen_int_mode (id
, SImode
);
30017 emit_insn (REGNO (x
) == ZA_REGNUM
30018 ? gen_aarch64_asm_update_za (id_rtx
)
30019 : gen_aarch64_asm_update_zt0 (id_rtx
));
30020 seq
= get_insns ();
30023 auto mode
= REGNO (x
) == ZA_REGNUM
? VNx16QImode
: V8DImode
;
30024 uses
.safe_push (gen_rtx_REG (mode
, REGNO (x
)));
30025 uses
.safe_push (gen_rtx_REG (DImode
, SME_STATE_REGNUM
));
30027 clobbers
.ordered_remove (i
);
30028 CLEAR_HARD_REG_BIT (clobbered_regs
, REGNO (x
));
30034 /* BB is the target of an exception or nonlocal goto edge, which means
30035 that PSTATE.SM is known to be 0 on entry. Put it into the state that
30036 the current function requires. */
30039 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb
)
30041 if (TARGET_NON_STREAMING
)
30045 rtx_insn
*guard_label
= nullptr;
30046 if (TARGET_STREAMING_COMPATIBLE
)
30047 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30048 AARCH64_FL_SM_OFF
);
30049 aarch64_sme_mode_switch_regs args_switch
;
30050 args_switch
.add_call_preserved_regs (df_get_live_in (bb
));
30051 args_switch
.emit_prologue ();
30052 aarch64_switch_pstate_sm (AARCH64_FL_SM_OFF
, AARCH64_FL_SM_ON
);
30053 args_switch
.emit_epilogue ();
30055 emit_label (guard_label
);
30056 auto seq
= get_insns ();
30059 emit_insn_after (seq
, bb_note (bb
));
30063 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
30064 so arrange to make it so. */
30067 aarch64_switch_pstate_sm_for_jump (rtx_insn
*jump
)
30069 if (TARGET_NON_STREAMING
)
30073 rtx_insn
*guard_label
= nullptr;
30074 if (TARGET_STREAMING_COMPATIBLE
)
30075 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30076 AARCH64_FL_SM_OFF
);
30077 aarch64_switch_pstate_sm (AARCH64_FL_SM_ON
, AARCH64_FL_SM_OFF
);
30079 emit_label (guard_label
);
30080 auto seq
= get_insns ();
30083 emit_insn_before (seq
, jump
);
30087 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30088 to switch to the new mode and the instructions needed to restore the
30089 original mode. Return true if something changed. */
30091 aarch64_switch_pstate_sm_for_call (rtx_call_insn
*call
)
30093 /* Mode switches for sibling calls are handled via the epilogue. */
30094 if (SIBLING_CALL_P (call
))
30097 auto callee_isa_mode
= aarch64_insn_callee_isa_mode (call
);
30098 if (!aarch64_call_switches_pstate_sm (callee_isa_mode
))
30101 /* Switch mode before the call, preserving any argument registers
30102 across the switch. */
30104 rtx_insn
*args_guard_label
= nullptr;
30105 if (TARGET_STREAMING_COMPATIBLE
)
30106 args_guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30108 aarch64_sme_mode_switch_regs args_switch
;
30109 args_switch
.add_call_args (call
);
30110 args_switch
.emit_prologue ();
30111 aarch64_switch_pstate_sm (AARCH64_ISA_MODE
, callee_isa_mode
);
30112 args_switch
.emit_epilogue ();
30113 if (args_guard_label
)
30114 emit_label (args_guard_label
);
30115 auto args_seq
= get_insns ();
30117 emit_insn_before (args_seq
, call
);
30119 if (find_reg_note (call
, REG_NORETURN
, NULL_RTX
))
30122 /* Switch mode after the call, preserving any return registers across
30125 rtx_insn
*return_guard_label
= nullptr;
30126 if (TARGET_STREAMING_COMPATIBLE
)
30127 return_guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30129 aarch64_sme_mode_switch_regs return_switch
;
30130 return_switch
.add_call_result (call
);
30131 return_switch
.emit_prologue ();
30132 aarch64_switch_pstate_sm (callee_isa_mode
, AARCH64_ISA_MODE
);
30133 return_switch
.emit_epilogue ();
30134 if (return_guard_label
)
30135 emit_label (return_guard_label
);
30136 auto result_seq
= get_insns ();
30138 emit_insn_after (result_seq
, call
);
30144 const pass_data pass_data_switch_pstate_sm
=
30147 "smstarts", // name
30148 OPTGROUP_NONE
, // optinfo_flags
30150 0, // properties_required
30151 0, // properties_provided
30152 0, // properties_destroyed
30153 0, // todo_flags_start
30154 TODO_df_finish
, // todo_flags_finish
30157 class pass_switch_pstate_sm
: public rtl_opt_pass
30160 pass_switch_pstate_sm (gcc::context
*ctxt
)
30161 : rtl_opt_pass (pass_data_switch_pstate_sm
, ctxt
)
30164 // opt_pass methods:
30165 bool gate (function
*) override final
;
30166 unsigned int execute (function
*) override final
;
30170 pass_switch_pstate_sm::gate (function
*fn
)
30172 return (aarch64_fndecl_pstate_sm (fn
->decl
) != AARCH64_FL_SM_OFF
30173 || cfun
->machine
->call_switches_pstate_sm
);
30176 /* Emit any instructions needed to switch PSTATE.SM. */
30178 pass_switch_pstate_sm::execute (function
*fn
)
30182 auto_sbitmap
blocks (last_basic_block_for_fn (cfun
));
30183 bitmap_clear (blocks
);
30184 FOR_EACH_BB_FN (bb
, fn
)
30186 if (has_abnormal_call_or_eh_pred_edge_p (bb
)
30187 && aarch64_switch_pstate_sm_for_landing_pad (bb
))
30188 bitmap_set_bit (blocks
, bb
->index
);
30190 if (cfun
->machine
->call_switches_pstate_sm
)
30193 FOR_BB_INSNS (bb
, insn
)
30194 if (auto *call
= dyn_cast
<rtx_call_insn
*> (insn
))
30195 if (aarch64_switch_pstate_sm_for_call (call
))
30196 bitmap_set_bit (blocks
, bb
->index
);
30199 auto end
= BB_END (bb
);
30201 && find_reg_note (end
, REG_NON_LOCAL_GOTO
, NULL_RTX
)
30202 && aarch64_switch_pstate_sm_for_jump (end
))
30203 bitmap_set_bit (blocks
, bb
->index
);
30205 find_many_sub_basic_blocks (blocks
);
30206 clear_aux_for_blocks ();
30213 make_pass_switch_pstate_sm (gcc::context
*ctxt
)
30215 return new pass_switch_pstate_sm (ctxt
);
30218 /* Parse an implementation-defined system register name of
30219 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30220 Return true if name matched against above pattern, false
30223 aarch64_is_implem_def_reg (const char *regname
)
30226 unsigned name_len
= strlen (regname
);
30227 if (name_len
< 12 || name_len
> 14)
30230 auto cterm_valid_p
= [&]()
30232 bool leading_zero_p
= false;
30236 if (regname
[pos
] != 'c')
30239 while (regname
[pos
] != '_')
30241 if (leading_zero_p
)
30243 if (i
== 0 && regname
[pos
] == '0')
30244 leading_zero_p
= true;
30247 if (!ISDIGIT (regname
[pos
]))
30249 n
[i
++] = regname
[pos
++];
30256 if (regname
[pos
] != 's')
30259 if (regname
[pos
] < '0' || regname
[pos
] > '3')
30262 if (regname
[pos
++] != '_')
30264 if (regname
[pos
] < '0' || regname
[pos
] > '7')
30267 if (regname
[pos
++] != '_')
30269 if (!cterm_valid_p ())
30271 if (regname
[pos
++] != '_')
30273 if (!cterm_valid_p ())
30275 if (regname
[pos
++] != '_')
30277 if (regname
[pos
] < '0' || regname
[pos
] > '7')
30282 /* Return true if REGNAME matches either a known permitted system
30283 register name, or a generic sysreg specification. For use in
30284 back-end predicate `aarch64_sysreg_string'. */
30286 aarch64_valid_sysreg_name_p (const char *regname
)
30288 const sysreg_t
*sysreg
= aarch64_lookup_sysreg_map (regname
);
30289 if (sysreg
== NULL
)
30290 return aarch64_is_implem_def_reg (regname
);
30291 if (sysreg
->arch_reqs
)
30292 return (aarch64_isa_flags
& sysreg
->arch_reqs
);
30296 /* Return the generic sysreg specification for a valid system register
30297 name, otherwise NULL. WRITE_P is true iff the register is being
30298 written to. IS128OP indicates the requested system register should
30299 be checked for a 128-bit implementation. */
30301 aarch64_retrieve_sysreg (const char *regname
, bool write_p
, bool is128op
)
30303 const sysreg_t
*sysreg
= aarch64_lookup_sysreg_map (regname
);
30304 if (sysreg
== NULL
)
30306 if (aarch64_is_implem_def_reg (regname
))
30311 if (is128op
&& !(sysreg
->properties
& F_REG_128
))
30313 if ((write_p
&& (sysreg
->properties
& F_REG_READ
))
30314 || (!write_p
&& (sysreg
->properties
& F_REG_WRITE
)))
30316 if ((~aarch64_isa_flags
& sysreg
->arch_reqs
) != 0)
30318 return sysreg
->encoding
;
30321 /* Target-specific selftests. */
30325 namespace selftest
{
30327 /* Selftest for the RTL loader.
30328 Verify that the RTL loader copes with a dump from
30329 print_rtx_function. This is essentially just a test that class
30330 function_reader can handle a real dump, but it also verifies
30331 that lookup_reg_by_dump_name correctly handles hard regs.
30332 The presence of hard reg names in the dump means that the test is
30333 target-specific, hence it is in this file. */
30336 aarch64_test_loading_full_dump ()
30338 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
30340 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
30342 rtx_insn
*insn_1
= get_insn_by_uid (1);
30343 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
30345 rtx_insn
*insn_15
= get_insn_by_uid (15);
30346 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
30347 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
30349 /* Verify crtl->return_rtx. */
30350 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
30351 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
30352 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
30355 /* Test the fractional_cost class. */
30358 aarch64_test_fractional_cost ()
30360 using cf
= fractional_cost
;
30362 ASSERT_EQ (cf (0, 20), 0);
30364 ASSERT_EQ (cf (4, 2), 2);
30365 ASSERT_EQ (3, cf (9, 3));
30367 ASSERT_NE (cf (5, 2), 2);
30368 ASSERT_NE (3, cf (8, 3));
30370 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30371 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30372 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30374 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30375 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30376 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30377 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30378 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30379 ASSERT_EQ (3 - cf (10, 3), 0);
30381 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30382 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30384 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30385 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30386 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30387 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30388 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30389 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30390 ASSERT_TRUE (cf (239, 240) <= 1);
30391 ASSERT_TRUE (cf (240, 240) <= 1);
30392 ASSERT_FALSE (cf (241, 240) <= 1);
30393 ASSERT_FALSE (2 <= cf (207, 104));
30394 ASSERT_TRUE (2 <= cf (208, 104));
30395 ASSERT_TRUE (2 <= cf (209, 104));
30397 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30398 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30399 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30400 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30401 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30402 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30403 ASSERT_TRUE (cf (239, 240) < 1);
30404 ASSERT_FALSE (cf (240, 240) < 1);
30405 ASSERT_FALSE (cf (241, 240) < 1);
30406 ASSERT_FALSE (2 < cf (207, 104));
30407 ASSERT_FALSE (2 < cf (208, 104));
30408 ASSERT_TRUE (2 < cf (209, 104));
30410 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30411 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30412 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30413 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30414 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30415 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30416 ASSERT_FALSE (cf (239, 240) >= 1);
30417 ASSERT_TRUE (cf (240, 240) >= 1);
30418 ASSERT_TRUE (cf (241, 240) >= 1);
30419 ASSERT_TRUE (2 >= cf (207, 104));
30420 ASSERT_TRUE (2 >= cf (208, 104));
30421 ASSERT_FALSE (2 >= cf (209, 104));
30423 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30424 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30425 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30426 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30427 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30428 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30429 ASSERT_FALSE (cf (239, 240) > 1);
30430 ASSERT_FALSE (cf (240, 240) > 1);
30431 ASSERT_TRUE (cf (241, 240) > 1);
30432 ASSERT_TRUE (2 > cf (207, 104));
30433 ASSERT_FALSE (2 > cf (208, 104));
30434 ASSERT_FALSE (2 > cf (209, 104));
30436 ASSERT_EQ (cf (1, 2).ceil (), 1);
30437 ASSERT_EQ (cf (11, 7).ceil (), 2);
30438 ASSERT_EQ (cf (20, 1).ceil (), 20);
30439 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30440 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30441 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30442 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30443 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30445 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
30448 /* Calculate whether our system register data, as imported from
30449 `aarch64-sys-reg.def' has any duplicate entries. */
30451 aarch64_test_sysreg_encoding_clashes (void)
30453 using dup_instances_t
= hash_map
<nofree_string_hash
,
30454 std::vector
<const sysreg_t
*>>;
30456 dup_instances_t duplicate_instances
;
30458 /* Every time an encoding is established to come up more than once
30459 we add it to a "clash-analysis queue", which is then used to extract
30460 necessary information from our hash map when establishing whether
30461 repeated encodings are valid. */
30463 /* 1) Collect recurrence information. */
30464 for (unsigned i
= 0; i
< ARRAY_SIZE (aarch64_sysregs
); i
++)
30466 const sysreg_t
*reg
= aarch64_sysregs
+ i
;
30468 std::vector
<const sysreg_t
*> *tmp
30469 = &duplicate_instances
.get_or_insert (reg
->encoding
);
30471 tmp
->push_back (reg
);
30474 /* 2) Carry out analysis on collected data. */
30475 for (auto instance
: duplicate_instances
)
30477 unsigned nrep
= instance
.second
.size ();
30479 for (unsigned i
= 0; i
< nrep
; i
++)
30480 for (unsigned j
= i
+ 1; j
< nrep
; j
++)
30482 const sysreg_t
*a
= instance
.second
[i
];
30483 const sysreg_t
*b
= instance
.second
[j
];
30484 ASSERT_TRUE ((a
->properties
!= b
->properties
)
30485 || (a
->arch_reqs
!= b
->arch_reqs
));
30490 /* Run all target-specific selftests. */
30493 aarch64_run_selftests (void)
30495 aarch64_test_loading_full_dump ();
30496 aarch64_test_fractional_cost ();
30497 aarch64_test_sysreg_encoding_clashes ();
30500 } // namespace selftest
30502 #endif /* #if CHECKING_P */
30504 #undef TARGET_STACK_PROTECT_GUARD
30505 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30507 #undef TARGET_ADDRESS_COST
30508 #define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
30514 #undef TARGET_ALIGN_ANON_BITFIELD
30515 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
30517 #undef TARGET_ASM_ALIGNED_DI_OP
30518 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30520 #undef TARGET_ASM_ALIGNED_HI_OP
30521 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30523 #undef TARGET_ASM_ALIGNED_SI_OP
30524 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30526 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30527 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30528 hook_bool_const_tree_hwi_hwi_const_tree_true
30530 #undef TARGET_ASM_FILE_START
30531 #define TARGET_ASM_FILE_START aarch64_start_file
30533 #undef TARGET_ASM_OUTPUT_MI_THUNK
30534 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30536 #undef TARGET_ASM_SELECT_RTX_SECTION
30537 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30539 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30540 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30542 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30543 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30545 #undef TARGET_BUILD_BUILTIN_VA_LIST
30546 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30548 #undef TARGET_CALLEE_COPIES
30549 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30551 #undef TARGET_FRAME_POINTER_REQUIRED
30552 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30554 #undef TARGET_CAN_ELIMINATE
30555 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30557 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30558 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30559 aarch64_function_attribute_inlinable_p
30561 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30562 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30564 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30565 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30567 #undef TARGET_CAN_INLINE_P
30568 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30570 #undef TARGET_CANNOT_FORCE_CONST_MEM
30571 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30573 #undef TARGET_CASE_VALUES_THRESHOLD
30574 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30576 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30577 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30579 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30580 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30582 /* Only the least significant bit is used for initialization guard
30584 #undef TARGET_CXX_GUARD_MASK_BIT
30585 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
30587 #undef TARGET_C_MODE_FOR_SUFFIX
30588 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30590 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30591 #undef TARGET_DEFAULT_TARGET_FLAGS
30592 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30595 #undef TARGET_CLASS_MAX_NREGS
30596 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30598 #undef TARGET_BUILTIN_DECL
30599 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30601 #undef TARGET_BUILTIN_RECIPROCAL
30602 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30604 #undef TARGET_C_EXCESS_PRECISION
30605 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30607 #undef TARGET_C_BITINT_TYPE_INFO
30608 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
30610 #undef TARGET_C_MODE_FOR_FLOATING_TYPE
30611 #define TARGET_C_MODE_FOR_FLOATING_TYPE aarch64_c_mode_for_floating_type
30613 #undef TARGET_EXPAND_BUILTIN
30614 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30616 #undef TARGET_EXPAND_BUILTIN_VA_START
30617 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30619 #undef TARGET_FOLD_BUILTIN
30620 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30622 #undef TARGET_FUNCTION_ARG
30623 #define TARGET_FUNCTION_ARG aarch64_function_arg
30625 #undef TARGET_FUNCTION_ARG_ADVANCE
30626 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30628 #undef TARGET_FUNCTION_ARG_BOUNDARY
30629 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30631 #undef TARGET_FUNCTION_ARG_PADDING
30632 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30634 #undef TARGET_GET_RAW_RESULT_MODE
30635 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30636 #undef TARGET_GET_RAW_ARG_MODE
30637 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30639 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30640 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30642 #undef TARGET_FUNCTION_VALUE
30643 #define TARGET_FUNCTION_VALUE aarch64_function_value
30645 #undef TARGET_FUNCTION_VALUE_REGNO_P
30646 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30648 #undef TARGET_START_CALL_ARGS
30649 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30651 #undef TARGET_END_CALL_ARGS
30652 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30654 #undef TARGET_GIMPLE_FOLD_BUILTIN
30655 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30657 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30658 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30660 #undef TARGET_INIT_BUILTINS
30661 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30663 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30664 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30665 aarch64_ira_change_pseudo_allocno_class
30667 #undef TARGET_LEGITIMATE_ADDRESS_P
30668 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30670 #undef TARGET_LEGITIMATE_CONSTANT_P
30671 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30673 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30674 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30675 aarch64_legitimize_address_displacement
30677 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30678 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30680 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30681 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30682 aarch64_libgcc_floating_mode_supported_p
30684 #undef TARGET_MANGLE_TYPE
30685 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30687 #undef TARGET_INVALID_BINARY_OP
30688 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30690 #undef TARGET_VERIFY_TYPE_CONTEXT
30691 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30693 #undef TARGET_MEMORY_MOVE_COST
30694 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30696 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30697 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30699 #undef TARGET_MUST_PASS_IN_STACK
30700 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30702 /* This target hook should return true if accesses to volatile bitfields
30703 should use the narrowest mode possible. It should return false if these
30704 accesses should use the bitfield container type. */
30705 #undef TARGET_NARROW_VOLATILE_BITFIELD
30706 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
30708 #undef TARGET_OPTION_OVERRIDE
30709 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30711 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30712 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30713 aarch64_override_options_after_change
30715 #undef TARGET_OFFLOAD_OPTIONS
30716 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30718 #undef TARGET_OPTION_RESTORE
30719 #define TARGET_OPTION_RESTORE aarch64_option_restore
30721 #undef TARGET_OPTION_PRINT
30722 #define TARGET_OPTION_PRINT aarch64_option_print
30724 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30725 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30727 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30728 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30729 aarch64_option_valid_version_attribute_p
30731 #undef TARGET_SET_CURRENT_FUNCTION
30732 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30734 #undef TARGET_PASS_BY_REFERENCE
30735 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30737 #undef TARGET_PREFERRED_RELOAD_CLASS
30738 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30740 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30741 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30743 #undef TARGET_DWARF_FRAME_REG_MODE
30744 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30746 #undef TARGET_PROMOTED_TYPE
30747 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30749 #undef TARGET_SECONDARY_RELOAD
30750 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30752 #undef TARGET_SECONDARY_MEMORY_NEEDED
30753 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30755 #undef TARGET_SHIFT_TRUNCATION_MASK
30756 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30758 #undef TARGET_SETUP_INCOMING_VARARGS
30759 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30761 #undef TARGET_STRUCT_VALUE_RTX
30762 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30764 #undef TARGET_REGISTER_MOVE_COST
30765 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30767 #undef TARGET_RETURN_IN_MEMORY
30768 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30770 #undef TARGET_RETURN_IN_MSB
30771 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30773 #undef TARGET_RTX_COSTS
30774 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30776 #undef TARGET_INSN_COST
30777 #define TARGET_INSN_COST aarch64_insn_cost
30779 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30780 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30782 #undef TARGET_SCHED_ISSUE_RATE
30783 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
30785 #undef TARGET_SCHED_VARIABLE_ISSUE
30786 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
30788 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
30789 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
30790 aarch64_sched_first_cycle_multipass_dfa_lookahead
30792 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
30793 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
30794 aarch64_first_cycle_multipass_dfa_lookahead_guard
30796 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
30797 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
30798 aarch64_get_separate_components
30800 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
30801 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
30802 aarch64_components_for_bb
30804 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
30805 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
30806 aarch64_disqualify_components
30808 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
30809 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
30810 aarch64_emit_prologue_components
30812 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
30813 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
30814 aarch64_emit_epilogue_components
30816 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
30817 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
30818 aarch64_set_handled_components
30820 #undef TARGET_TRAMPOLINE_INIT
30821 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
30823 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
30824 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
30826 #undef TARGET_VECTOR_MODE_SUPPORTED_P
30827 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
30829 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
30830 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
30832 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
30833 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
30835 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
30836 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
30837 aarch64_builtin_support_vector_misalignment
30839 #undef TARGET_ARRAY_MODE
30840 #define TARGET_ARRAY_MODE aarch64_array_mode
30842 #undef TARGET_ARRAY_MODE_SUPPORTED_P
30843 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
30845 #undef TARGET_VECTORIZE_CREATE_COSTS
30846 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
30848 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
30849 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
30850 aarch64_builtin_vectorization_cost
30852 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
30853 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
30855 #undef TARGET_VECTORIZE_BUILTINS
30856 #define TARGET_VECTORIZE_BUILTINS
30858 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
30859 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
30860 aarch64_autovectorize_vector_modes
30862 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
30863 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
30864 aarch64_atomic_assign_expand_fenv
30866 /* Section anchor support. */
30868 #undef TARGET_MIN_ANCHOR_OFFSET
30869 #define TARGET_MIN_ANCHOR_OFFSET -256
30871 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
30872 byte offset; we can do much more for larger data types, but have no way
30873 to determine the size of the access. We assume accesses are aligned. */
30874 #undef TARGET_MAX_ANCHOR_OFFSET
30875 #define TARGET_MAX_ANCHOR_OFFSET 4095
30877 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
30878 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
30879 aarch64_vectorize_preferred_div_as_shifts_over_mult
30881 #undef TARGET_VECTOR_ALIGNMENT
30882 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
30884 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
30885 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
30886 aarch64_vectorize_preferred_vector_alignment
30887 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
30888 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
30889 aarch64_simd_vector_alignment_reachable
30891 /* vec_perm support. */
30893 #undef TARGET_VECTORIZE_VEC_PERM_CONST
30894 #define TARGET_VECTORIZE_VEC_PERM_CONST \
30895 aarch64_vectorize_vec_perm_const
30897 #undef TARGET_VECTORIZE_RELATED_MODE
30898 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
30899 #undef TARGET_VECTORIZE_GET_MASK_MODE
30900 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
30901 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
30902 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
30903 aarch64_empty_mask_is_expensive
30904 #undef TARGET_PREFERRED_ELSE_VALUE
30905 #define TARGET_PREFERRED_ELSE_VALUE \
30906 aarch64_preferred_else_value
30908 #undef TARGET_INIT_LIBFUNCS
30909 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
30911 #undef TARGET_FIXED_CONDITION_CODE_REGS
30912 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
30914 #undef TARGET_FLAGS_REGNUM
30915 #define TARGET_FLAGS_REGNUM CC_REGNUM
30917 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
30918 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
30920 #undef TARGET_ASAN_SHADOW_OFFSET
30921 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
30923 #undef TARGET_LEGITIMIZE_ADDRESS
30924 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
30926 #undef TARGET_SCHED_CAN_SPECULATE_INSN
30927 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
30929 #undef TARGET_CAN_USE_DOLOOP_P
30930 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
30932 #undef TARGET_SCHED_ADJUST_PRIORITY
30933 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
30935 #undef TARGET_SCHED_MACRO_FUSION_P
30936 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
30938 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
30939 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
30941 #undef TARGET_SCHED_FUSION_PRIORITY
30942 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
30944 #undef TARGET_UNSPEC_MAY_TRAP_P
30945 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
30947 #undef TARGET_USE_PSEUDO_PIC_REG
30948 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
30950 #undef TARGET_PRINT_OPERAND
30951 #define TARGET_PRINT_OPERAND aarch64_print_operand
30953 #undef TARGET_PRINT_OPERAND_ADDRESS
30954 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
30956 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
30957 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
30959 #undef TARGET_OPTAB_SUPPORTED_P
30960 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
30962 #undef TARGET_OMIT_STRUCT_RETURN_REG
30963 #define TARGET_OMIT_STRUCT_RETURN_REG true
30965 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
30966 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
30967 aarch64_dwarf_poly_indeterminate_value
30969 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
30970 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
30971 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
30973 #undef TARGET_HARD_REGNO_NREGS
30974 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
30975 #undef TARGET_HARD_REGNO_MODE_OK
30976 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
30978 #undef TARGET_MODES_TIEABLE_P
30979 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
30981 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
30982 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
30983 aarch64_hard_regno_call_part_clobbered
30985 #undef TARGET_INSN_CALLEE_ABI
30986 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
30988 #undef TARGET_CONSTANT_ALIGNMENT
30989 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
30991 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
30992 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
30993 aarch64_stack_clash_protection_alloca_probe_range
30995 #undef TARGET_COMPUTE_PRESSURE_CLASSES
30996 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
30998 #undef TARGET_CAN_CHANGE_MODE_CLASS
30999 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
31001 #undef TARGET_SELECT_EARLY_REMAT_MODES
31002 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
31004 #undef TARGET_SPECULATION_SAFE_VALUE
31005 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
31007 #undef TARGET_ESTIMATED_POLY_VALUE
31008 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
31010 #undef TARGET_ATTRIBUTE_TABLE
31011 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
31013 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
31014 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
31015 aarch64_simd_clone_compute_vecsize_and_simdlen
31017 #undef TARGET_SIMD_CLONE_ADJUST
31018 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
31020 #undef TARGET_SIMD_CLONE_USABLE
31021 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
31023 #undef TARGET_COMP_TYPE_ATTRIBUTES
31024 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
31026 #undef TARGET_MERGE_DECL_ATTRIBUTES
31027 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
31029 #undef TARGET_GET_MULTILIB_ABI_NAME
31030 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
31032 #undef TARGET_FNTYPE_ABI
31033 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
31035 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
31036 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
31039 #undef TARGET_RUN_TARGET_SELFTESTS
31040 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
31041 #endif /* #if CHECKING_P */
31043 #undef TARGET_ASM_POST_CFI_STARTPROC
31044 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
31046 #undef TARGET_STRICT_ARGUMENT_NAMING
31047 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
31049 #undef TARGET_MODE_EMIT
31050 #define TARGET_MODE_EMIT aarch64_mode_emit
31052 #undef TARGET_MODE_NEEDED
31053 #define TARGET_MODE_NEEDED aarch64_mode_needed
31055 #undef TARGET_MODE_AFTER
31056 #define TARGET_MODE_AFTER aarch64_mode_after
31058 #undef TARGET_MODE_CONFLUENCE
31059 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
31061 #undef TARGET_MODE_BACKPROP
31062 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
31064 #undef TARGET_MODE_ENTRY
31065 #define TARGET_MODE_ENTRY aarch64_mode_entry
31067 #undef TARGET_MODE_EXIT
31068 #define TARGET_MODE_EXIT aarch64_mode_exit
31070 #undef TARGET_MODE_EH_HANDLER
31071 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
31073 #undef TARGET_MODE_PRIORITY
31074 #define TARGET_MODE_PRIORITY aarch64_mode_priority
31076 #undef TARGET_MD_ASM_ADJUST
31077 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31079 #undef TARGET_ASM_FILE_END
31080 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31082 #undef TARGET_ASM_FUNCTION_EPILOGUE
31083 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31085 #undef TARGET_HAVE_SHADOW_CALL_STACK
31086 #define TARGET_HAVE_SHADOW_CALL_STACK true
31088 #undef TARGET_CONST_ANCHOR
31089 #define TARGET_CONST_ANCHOR 0x1000000
31091 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31092 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31094 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31095 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31097 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31098 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31100 #undef TARGET_OPTION_FUNCTION_VERSIONS
31101 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31103 #undef TARGET_COMPARE_VERSION_PRIORITY
31104 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31106 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31107 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31108 aarch64_generate_version_dispatcher_body
31110 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31111 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31112 aarch64_get_function_versions_dispatcher
31114 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31115 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
31117 struct gcc_target targetm
= TARGET_INITIALIZER
;
31119 #include "gt-aarch64.h"