PR96463: Optimise svld1rq from vectors for little endian AArch64 targets.
[official-gcc.git] / gcc / config / aarch64 / aarch64.cc
blobd21e041eccbc755b73703e144cd71559f86dc241
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2022 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
85 /* This file should be included last. */
86 #include "target-def.h"
88 /* Defined for convenience. */
89 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
91 /* Information about a legitimate vector immediate operand. */
92 struct simd_immediate_info
94 enum insn_type { MOV, MVN, INDEX, PTRUE };
95 enum modifier_type { LSL, MSL };
97 simd_immediate_info () {}
98 simd_immediate_info (scalar_float_mode, rtx);
99 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
100 insn_type = MOV, modifier_type = LSL,
101 unsigned int = 0);
102 simd_immediate_info (scalar_mode, rtx, rtx);
103 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
105 /* The mode of the elements. */
106 scalar_mode elt_mode;
108 /* The instruction to use to move the immediate into a vector. */
109 insn_type insn;
111 union
113 /* For MOV and MVN. */
114 struct
116 /* The value of each element. */
117 rtx value;
119 /* The kind of shift modifier to use, and the number of bits to shift.
120 This is (LSL, 0) if no shift is needed. */
121 modifier_type modifier;
122 unsigned int shift;
123 } mov;
125 /* For INDEX. */
126 struct
128 /* The value of the first element and the step to be added for each
129 subsequent element. */
130 rtx base, step;
131 } index;
133 /* For PTRUE. */
134 aarch64_svpattern pattern;
135 } u;
138 /* Construct a floating-point immediate in which each element has mode
139 ELT_MODE_IN and value VALUE_IN. */
140 inline simd_immediate_info
141 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
142 : elt_mode (elt_mode_in), insn (MOV)
144 u.mov.value = value_in;
145 u.mov.modifier = LSL;
146 u.mov.shift = 0;
149 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
150 and value VALUE_IN. The other parameters are as for the structure
151 fields. */
152 inline simd_immediate_info
153 ::simd_immediate_info (scalar_int_mode elt_mode_in,
154 unsigned HOST_WIDE_INT value_in,
155 insn_type insn_in, modifier_type modifier_in,
156 unsigned int shift_in)
157 : elt_mode (elt_mode_in), insn (insn_in)
159 u.mov.value = gen_int_mode (value_in, elt_mode_in);
160 u.mov.modifier = modifier_in;
161 u.mov.shift = shift_in;
164 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
165 and where element I is equal to BASE_IN + I * STEP_IN. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
168 : elt_mode (elt_mode_in), insn (INDEX)
170 u.index.base = base_in;
171 u.index.step = step_in;
174 /* Construct a predicate that controls elements of mode ELT_MODE_IN
175 and has PTRUE pattern PATTERN_IN. */
176 inline simd_immediate_info
177 ::simd_immediate_info (scalar_int_mode elt_mode_in,
178 aarch64_svpattern pattern_in)
179 : elt_mode (elt_mode_in), insn (PTRUE)
181 u.pattern = pattern_in;
184 namespace {
186 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
187 class pure_scalable_type_info
189 public:
190 /* Represents the result of analyzing a type. All values are nonzero,
191 in the possibly forlorn hope that accidental conversions to bool
192 trigger a warning. */
193 enum analysis_result
195 /* The type does not have an ABI identity; i.e. it doesn't contain
196 at least one object whose type is a Fundamental Data Type. */
197 NO_ABI_IDENTITY = 1,
199 /* The type is definitely a Pure Scalable Type. */
200 IS_PST,
202 /* The type is definitely not a Pure Scalable Type. */
203 ISNT_PST,
205 /* It doesn't matter for PCS purposes whether the type is a Pure
206 Scalable Type or not, since the type will be handled the same
207 way regardless.
209 Specifically, this means that if the type is a Pure Scalable Type,
210 there aren't enough argument registers to hold it, and so it will
211 need to be passed or returned in memory. If the type isn't a
212 Pure Scalable Type, it's too big to be passed or returned in core
213 or SIMD&FP registers, and so again will need to go in memory. */
214 DOESNT_MATTER
217 /* Aggregates of 17 bytes or more are normally passed and returned
218 in memory, so aggregates of that size can safely be analyzed as
219 DOESNT_MATTER. We need to be able to collect enough pieces to
220 represent a PST that is smaller than that. Since predicates are
221 2 bytes in size for -msve-vector-bits=128, that means we need to be
222 able to store at least 8 pieces.
224 We also need to be able to store enough pieces to represent
225 a single vector in each vector argument register and a single
226 predicate in each predicate argument register. This means that
227 we need at least 12 pieces. */
228 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
229 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
231 /* Describes one piece of a PST. Each piece is one of:
233 - a single Scalable Vector Type (SVT)
234 - a single Scalable Predicate Type (SPT)
235 - a PST containing 2, 3 or 4 SVTs, with no padding
237 It either represents a single built-in type or a PST formed from
238 multiple homogeneous built-in types. */
239 struct piece
241 rtx get_rtx (unsigned int, unsigned int) const;
243 /* The number of vector and predicate registers that the piece
244 occupies. One of the two is always zero. */
245 unsigned int num_zr;
246 unsigned int num_pr;
248 /* The mode of the registers described above. */
249 machine_mode mode;
251 /* If this piece is formed from multiple homogeneous built-in types,
252 this is the mode of the built-in types, otherwise it is MODE. */
253 machine_mode orig_mode;
255 /* The offset in bytes of the piece from the start of the type. */
256 poly_uint64_pod offset;
259 /* Divides types analyzed as IS_PST into individual pieces. The pieces
260 are in memory order. */
261 auto_vec<piece, MAX_PIECES> pieces;
263 unsigned int num_zr () const;
264 unsigned int num_pr () const;
266 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
268 analysis_result analyze (const_tree);
269 bool analyze_registers (const_tree);
271 private:
272 analysis_result analyze_array (const_tree);
273 analysis_result analyze_record (const_tree);
274 void add_piece (const piece &);
278 /* The current code model. */
279 enum aarch64_code_model aarch64_cmodel;
281 /* The number of 64-bit elements in an SVE vector. */
282 poly_uint16 aarch64_sve_vg;
284 #ifdef HAVE_AS_TLS
285 #undef TARGET_HAVE_TLS
286 #define TARGET_HAVE_TLS 1
287 #endif
289 static bool aarch64_composite_type_p (const_tree, machine_mode);
290 static bool aarch64_return_in_memory_1 (const_tree);
291 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
292 const_tree,
293 machine_mode *, int *,
294 bool *, bool);
295 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
296 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
297 static void aarch64_override_options_after_change (void);
298 static bool aarch64_vector_mode_supported_p (machine_mode);
299 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
300 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
301 const_tree type,
302 int misalignment,
303 bool is_packed);
304 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
305 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
306 aarch64_addr_query_type);
307 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
309 /* The processor for which instructions should be scheduled. */
310 enum aarch64_processor aarch64_tune = cortexa53;
312 /* Mask to specify which instruction scheduling options should be used. */
313 uint64_t aarch64_tune_flags = 0;
315 /* Global flag for PC relative loads. */
316 bool aarch64_pcrelative_literal_loads;
318 /* Global flag for whether frame pointer is enabled. */
319 bool aarch64_use_frame_pointer;
321 #define BRANCH_PROTECT_STR_MAX 255
322 char *accepted_branch_protection_string = NULL;
324 static enum aarch64_parse_opt_result
325 aarch64_parse_branch_protection (const char*, char**);
327 /* Support for command line parsing of boolean flags in the tuning
328 structures. */
329 struct aarch64_flag_desc
331 const char* name;
332 unsigned int flag;
335 #define AARCH64_FUSION_PAIR(name, internal_name) \
336 { name, AARCH64_FUSE_##internal_name },
337 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
339 { "none", AARCH64_FUSE_NOTHING },
340 #include "aarch64-fusion-pairs.def"
341 { "all", AARCH64_FUSE_ALL },
342 { NULL, AARCH64_FUSE_NOTHING }
345 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
346 { name, AARCH64_EXTRA_TUNE_##internal_name },
347 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
349 { "none", AARCH64_EXTRA_TUNE_NONE },
350 #include "aarch64-tuning-flags.def"
351 { "all", AARCH64_EXTRA_TUNE_ALL },
352 { NULL, AARCH64_EXTRA_TUNE_NONE }
355 /* Tuning parameters. */
357 static const struct cpu_addrcost_table generic_addrcost_table =
360 1, /* hi */
361 0, /* si */
362 0, /* di */
363 1, /* ti */
365 0, /* pre_modify */
366 0, /* post_modify */
367 0, /* post_modify_ld3_st3 */
368 0, /* post_modify_ld4_st4 */
369 0, /* register_offset */
370 0, /* register_sextend */
371 0, /* register_zextend */
372 0 /* imm_offset */
375 static const struct cpu_addrcost_table exynosm1_addrcost_table =
378 0, /* hi */
379 0, /* si */
380 0, /* di */
381 2, /* ti */
383 0, /* pre_modify */
384 0, /* post_modify */
385 0, /* post_modify_ld3_st3 */
386 0, /* post_modify_ld4_st4 */
387 1, /* register_offset */
388 1, /* register_sextend */
389 2, /* register_zextend */
390 0, /* imm_offset */
393 static const struct cpu_addrcost_table xgene1_addrcost_table =
396 1, /* hi */
397 0, /* si */
398 0, /* di */
399 1, /* ti */
401 1, /* pre_modify */
402 1, /* post_modify */
403 1, /* post_modify_ld3_st3 */
404 1, /* post_modify_ld4_st4 */
405 0, /* register_offset */
406 1, /* register_sextend */
407 1, /* register_zextend */
408 0, /* imm_offset */
411 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
414 1, /* hi */
415 1, /* si */
416 1, /* di */
417 2, /* ti */
419 0, /* pre_modify */
420 0, /* post_modify */
421 0, /* post_modify_ld3_st3 */
422 0, /* post_modify_ld4_st4 */
423 2, /* register_offset */
424 3, /* register_sextend */
425 3, /* register_zextend */
426 0, /* imm_offset */
429 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
432 1, /* hi */
433 1, /* si */
434 1, /* di */
435 2, /* ti */
437 0, /* pre_modify */
438 0, /* post_modify */
439 0, /* post_modify_ld3_st3 */
440 0, /* post_modify_ld4_st4 */
441 2, /* register_offset */
442 3, /* register_sextend */
443 3, /* register_zextend */
444 0, /* imm_offset */
447 static const struct cpu_addrcost_table tsv110_addrcost_table =
450 1, /* hi */
451 0, /* si */
452 0, /* di */
453 1, /* ti */
455 0, /* pre_modify */
456 0, /* post_modify */
457 0, /* post_modify_ld3_st3 */
458 0, /* post_modify_ld4_st4 */
459 0, /* register_offset */
460 1, /* register_sextend */
461 1, /* register_zextend */
462 0, /* imm_offset */
465 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
468 1, /* hi */
469 1, /* si */
470 1, /* di */
471 2, /* ti */
473 1, /* pre_modify */
474 1, /* post_modify */
475 1, /* post_modify_ld3_st3 */
476 1, /* post_modify_ld4_st4 */
477 3, /* register_offset */
478 3, /* register_sextend */
479 3, /* register_zextend */
480 2, /* imm_offset */
483 static const struct cpu_addrcost_table a64fx_addrcost_table =
486 1, /* hi */
487 1, /* si */
488 1, /* di */
489 2, /* ti */
491 0, /* pre_modify */
492 0, /* post_modify */
493 0, /* post_modify_ld3_st3 */
494 0, /* post_modify_ld4_st4 */
495 2, /* register_offset */
496 3, /* register_sextend */
497 3, /* register_zextend */
498 0, /* imm_offset */
501 static const struct cpu_addrcost_table neoversev1_addrcost_table =
504 1, /* hi */
505 0, /* si */
506 0, /* di */
507 1, /* ti */
509 0, /* pre_modify */
510 0, /* post_modify */
511 3, /* post_modify_ld3_st3 */
512 3, /* post_modify_ld4_st4 */
513 0, /* register_offset */
514 0, /* register_sextend */
515 0, /* register_zextend */
516 0 /* imm_offset */
519 static const struct cpu_addrcost_table neoversen2_addrcost_table =
522 1, /* hi */
523 0, /* si */
524 0, /* di */
525 1, /* ti */
527 0, /* pre_modify */
528 0, /* post_modify */
529 2, /* post_modify_ld3_st3 */
530 2, /* post_modify_ld4_st4 */
531 0, /* register_offset */
532 0, /* register_sextend */
533 0, /* register_zextend */
534 0 /* imm_offset */
537 static const struct cpu_addrcost_table demeter_addrcost_table =
540 1, /* hi */
541 0, /* si */
542 0, /* di */
543 1, /* ti */
545 0, /* pre_modify */
546 0, /* post_modify */
547 2, /* post_modify_ld3_st3 */
548 2, /* post_modify_ld4_st4 */
549 0, /* register_offset */
550 0, /* register_sextend */
551 0, /* register_zextend */
552 0 /* imm_offset */
555 static const struct cpu_regmove_cost generic_regmove_cost =
557 1, /* GP2GP */
558 /* Avoid the use of slow int<->fp moves for spilling by setting
559 their cost higher than memmov_cost. */
560 5, /* GP2FP */
561 5, /* FP2GP */
562 2 /* FP2FP */
565 static const struct cpu_regmove_cost cortexa57_regmove_cost =
567 1, /* GP2GP */
568 /* Avoid the use of slow int<->fp moves for spilling by setting
569 their cost higher than memmov_cost. */
570 5, /* GP2FP */
571 5, /* FP2GP */
572 2 /* FP2FP */
575 static const struct cpu_regmove_cost cortexa53_regmove_cost =
577 1, /* GP2GP */
578 /* Avoid the use of slow int<->fp moves for spilling by setting
579 their cost higher than memmov_cost. */
580 5, /* GP2FP */
581 5, /* FP2GP */
582 2 /* FP2FP */
585 static const struct cpu_regmove_cost exynosm1_regmove_cost =
587 1, /* GP2GP */
588 /* Avoid the use of slow int<->fp moves for spilling by setting
589 their cost higher than memmov_cost (actual, 4 and 9). */
590 9, /* GP2FP */
591 9, /* FP2GP */
592 1 /* FP2FP */
595 static const struct cpu_regmove_cost thunderx_regmove_cost =
597 2, /* GP2GP */
598 2, /* GP2FP */
599 6, /* FP2GP */
600 4 /* FP2FP */
603 static const struct cpu_regmove_cost xgene1_regmove_cost =
605 1, /* GP2GP */
606 /* Avoid the use of slow int<->fp moves for spilling by setting
607 their cost higher than memmov_cost. */
608 8, /* GP2FP */
609 8, /* FP2GP */
610 2 /* FP2FP */
613 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
615 2, /* GP2GP */
616 /* Avoid the use of int<->fp moves for spilling. */
617 6, /* GP2FP */
618 6, /* FP2GP */
619 4 /* FP2FP */
622 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
624 1, /* GP2GP */
625 /* Avoid the use of int<->fp moves for spilling. */
626 5, /* GP2FP */
627 6, /* FP2GP */
628 3, /* FP2FP */
631 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
633 1, /* GP2GP */
634 /* Avoid the use of int<->fp moves for spilling. */
635 4, /* GP2FP */
636 5, /* FP2GP */
637 4 /* FP2FP */
640 static const struct cpu_regmove_cost tsv110_regmove_cost =
642 1, /* GP2GP */
643 /* Avoid the use of slow int<->fp moves for spilling by setting
644 their cost higher than memmov_cost. */
645 2, /* GP2FP */
646 3, /* FP2GP */
647 2 /* FP2FP */
650 static const struct cpu_regmove_cost a64fx_regmove_cost =
652 1, /* GP2GP */
653 /* Avoid the use of slow int<->fp moves for spilling by setting
654 their cost higher than memmov_cost. */
655 5, /* GP2FP */
656 7, /* FP2GP */
657 2 /* FP2FP */
660 static const struct cpu_regmove_cost neoversen2_regmove_cost =
662 1, /* GP2GP */
663 /* Spilling to int<->fp instead of memory is recommended so set
664 realistic costs compared to memmov_cost. */
665 3, /* GP2FP */
666 2, /* FP2GP */
667 2 /* FP2FP */
670 static const struct cpu_regmove_cost neoversev1_regmove_cost =
672 1, /* GP2GP */
673 /* Spilling to int<->fp instead of memory is recommended so set
674 realistic costs compared to memmov_cost. */
675 3, /* GP2FP */
676 2, /* FP2GP */
677 2 /* FP2FP */
680 static const struct cpu_regmove_cost demeter_regmove_cost =
682 1, /* GP2GP */
683 /* Spilling to int<->fp instead of memory is recommended so set
684 realistic costs compared to memmov_cost. */
685 3, /* GP2FP */
686 2, /* FP2GP */
687 2 /* FP2FP */
690 /* Generic costs for Advanced SIMD vector operations. */
691 static const advsimd_vec_cost generic_advsimd_vector_cost =
693 1, /* int_stmt_cost */
694 1, /* fp_stmt_cost */
695 0, /* ld2_st2_permute_cost */
696 0, /* ld3_st3_permute_cost */
697 0, /* ld4_st4_permute_cost */
698 2, /* permute_cost */
699 2, /* reduc_i8_cost */
700 2, /* reduc_i16_cost */
701 2, /* reduc_i32_cost */
702 2, /* reduc_i64_cost */
703 2, /* reduc_f16_cost */
704 2, /* reduc_f32_cost */
705 2, /* reduc_f64_cost */
706 2, /* store_elt_extra_cost */
707 2, /* vec_to_scalar_cost */
708 1, /* scalar_to_vec_cost */
709 1, /* align_load_cost */
710 1, /* unalign_load_cost */
711 1, /* unalign_store_cost */
712 1 /* store_cost */
715 /* Generic costs for SVE vector operations. */
716 static const sve_vec_cost generic_sve_vector_cost =
719 1, /* int_stmt_cost */
720 1, /* fp_stmt_cost */
721 0, /* ld2_st2_permute_cost */
722 0, /* ld3_st3_permute_cost */
723 0, /* ld4_st4_permute_cost */
724 2, /* permute_cost */
725 2, /* reduc_i8_cost */
726 2, /* reduc_i16_cost */
727 2, /* reduc_i32_cost */
728 2, /* reduc_i64_cost */
729 2, /* reduc_f16_cost */
730 2, /* reduc_f32_cost */
731 2, /* reduc_f64_cost */
732 2, /* store_elt_extra_cost */
733 2, /* vec_to_scalar_cost */
734 1, /* scalar_to_vec_cost */
735 1, /* align_load_cost */
736 1, /* unalign_load_cost */
737 1, /* unalign_store_cost */
738 1 /* store_cost */
740 2, /* clast_cost */
741 2, /* fadda_f16_cost */
742 2, /* fadda_f32_cost */
743 2, /* fadda_f64_cost */
744 4, /* gather_load_x32_cost */
745 2, /* gather_load_x64_cost */
746 1 /* scatter_store_elt_cost */
749 /* Generic costs for vector insn classes. */
750 static const struct cpu_vector_cost generic_vector_cost =
752 1, /* scalar_int_stmt_cost */
753 1, /* scalar_fp_stmt_cost */
754 1, /* scalar_load_cost */
755 1, /* scalar_store_cost */
756 3, /* cond_taken_branch_cost */
757 1, /* cond_not_taken_branch_cost */
758 &generic_advsimd_vector_cost, /* advsimd */
759 &generic_sve_vector_cost, /* sve */
760 nullptr /* issue_info */
763 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
765 2, /* int_stmt_cost */
766 5, /* fp_stmt_cost */
767 0, /* ld2_st2_permute_cost */
768 0, /* ld3_st3_permute_cost */
769 0, /* ld4_st4_permute_cost */
770 3, /* permute_cost */
771 13, /* reduc_i8_cost */
772 13, /* reduc_i16_cost */
773 13, /* reduc_i32_cost */
774 13, /* reduc_i64_cost */
775 13, /* reduc_f16_cost */
776 13, /* reduc_f32_cost */
777 13, /* reduc_f64_cost */
778 13, /* store_elt_extra_cost */
779 13, /* vec_to_scalar_cost */
780 4, /* scalar_to_vec_cost */
781 6, /* align_load_cost */
782 6, /* unalign_load_cost */
783 1, /* unalign_store_cost */
784 1 /* store_cost */
787 static const sve_vec_cost a64fx_sve_vector_cost =
790 2, /* int_stmt_cost */
791 5, /* fp_stmt_cost */
792 0, /* ld2_st2_permute_cost */
793 0, /* ld3_st3_permute_cost */
794 0, /* ld4_st4_permute_cost */
795 3, /* permute_cost */
796 13, /* reduc_i8_cost */
797 13, /* reduc_i16_cost */
798 13, /* reduc_i32_cost */
799 13, /* reduc_i64_cost */
800 13, /* reduc_f16_cost */
801 13, /* reduc_f32_cost */
802 13, /* reduc_f64_cost */
803 13, /* store_elt_extra_cost */
804 13, /* vec_to_scalar_cost */
805 4, /* scalar_to_vec_cost */
806 6, /* align_load_cost */
807 6, /* unalign_load_cost */
808 1, /* unalign_store_cost */
809 1 /* store_cost */
811 13, /* clast_cost */
812 13, /* fadda_f16_cost */
813 13, /* fadda_f32_cost */
814 13, /* fadda_f64_cost */
815 64, /* gather_load_x32_cost */
816 32, /* gather_load_x64_cost */
817 1 /* scatter_store_elt_cost */
820 static const struct cpu_vector_cost a64fx_vector_cost =
822 1, /* scalar_int_stmt_cost */
823 5, /* scalar_fp_stmt_cost */
824 4, /* scalar_load_cost */
825 1, /* scalar_store_cost */
826 3, /* cond_taken_branch_cost */
827 1, /* cond_not_taken_branch_cost */
828 &a64fx_advsimd_vector_cost, /* advsimd */
829 &a64fx_sve_vector_cost, /* sve */
830 nullptr /* issue_info */
833 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
835 1, /* int_stmt_cost */
836 3, /* fp_stmt_cost */
837 0, /* ld2_st2_permute_cost */
838 0, /* ld3_st3_permute_cost */
839 0, /* ld4_st4_permute_cost */
840 2, /* permute_cost */
841 1, /* reduc_i8_cost */
842 1, /* reduc_i16_cost */
843 1, /* reduc_i32_cost */
844 1, /* reduc_i64_cost */
845 1, /* reduc_f16_cost */
846 1, /* reduc_f32_cost */
847 1, /* reduc_f64_cost */
848 1, /* store_elt_extra_cost */
849 1, /* vec_to_scalar_cost */
850 1, /* scalar_to_vec_cost */
851 1, /* align_load_cost */
852 1, /* unalign_load_cost */
853 1, /* unalign_store_cost */
854 1 /* store_cost */
857 /* QDF24XX costs for vector insn classes. */
858 static const struct cpu_vector_cost qdf24xx_vector_cost =
860 1, /* scalar_int_stmt_cost */
861 1, /* scalar_fp_stmt_cost */
862 1, /* scalar_load_cost */
863 1, /* scalar_store_cost */
864 3, /* cond_taken_branch_cost */
865 1, /* cond_not_taken_branch_cost */
866 &qdf24xx_advsimd_vector_cost, /* advsimd */
867 nullptr, /* sve */
868 nullptr /* issue_info */
872 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
874 4, /* int_stmt_cost */
875 1, /* fp_stmt_cost */
876 0, /* ld2_st2_permute_cost */
877 0, /* ld3_st3_permute_cost */
878 0, /* ld4_st4_permute_cost */
879 4, /* permute_cost */
880 2, /* reduc_i8_cost */
881 2, /* reduc_i16_cost */
882 2, /* reduc_i32_cost */
883 2, /* reduc_i64_cost */
884 2, /* reduc_f16_cost */
885 2, /* reduc_f32_cost */
886 2, /* reduc_f64_cost */
887 2, /* store_elt_extra_cost */
888 2, /* vec_to_scalar_cost */
889 2, /* scalar_to_vec_cost */
890 3, /* align_load_cost */
891 5, /* unalign_load_cost */
892 5, /* unalign_store_cost */
893 1 /* store_cost */
896 /* ThunderX costs for vector insn classes. */
897 static const struct cpu_vector_cost thunderx_vector_cost =
899 1, /* scalar_int_stmt_cost */
900 1, /* scalar_fp_stmt_cost */
901 3, /* scalar_load_cost */
902 1, /* scalar_store_cost */
903 3, /* cond_taken_branch_cost */
904 3, /* cond_not_taken_branch_cost */
905 &thunderx_advsimd_vector_cost, /* advsimd */
906 nullptr, /* sve */
907 nullptr /* issue_info */
910 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
912 2, /* int_stmt_cost */
913 2, /* fp_stmt_cost */
914 0, /* ld2_st2_permute_cost */
915 0, /* ld3_st3_permute_cost */
916 0, /* ld4_st4_permute_cost */
917 2, /* permute_cost */
918 3, /* reduc_i8_cost */
919 3, /* reduc_i16_cost */
920 3, /* reduc_i32_cost */
921 3, /* reduc_i64_cost */
922 3, /* reduc_f16_cost */
923 3, /* reduc_f32_cost */
924 3, /* reduc_f64_cost */
925 3, /* store_elt_extra_cost */
926 3, /* vec_to_scalar_cost */
927 2, /* scalar_to_vec_cost */
928 5, /* align_load_cost */
929 5, /* unalign_load_cost */
930 1, /* unalign_store_cost */
931 1 /* store_cost */
934 static const struct cpu_vector_cost tsv110_vector_cost =
936 1, /* scalar_int_stmt_cost */
937 1, /* scalar_fp_stmt_cost */
938 5, /* scalar_load_cost */
939 1, /* scalar_store_cost */
940 1, /* cond_taken_branch_cost */
941 1, /* cond_not_taken_branch_cost */
942 &tsv110_advsimd_vector_cost, /* advsimd */
943 nullptr, /* sve */
944 nullptr /* issue_info */
947 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
949 2, /* int_stmt_cost */
950 2, /* fp_stmt_cost */
951 0, /* ld2_st2_permute_cost */
952 0, /* ld3_st3_permute_cost */
953 0, /* ld4_st4_permute_cost */
954 3, /* permute_cost */
955 8, /* reduc_i8_cost */
956 8, /* reduc_i16_cost */
957 8, /* reduc_i32_cost */
958 8, /* reduc_i64_cost */
959 8, /* reduc_f16_cost */
960 8, /* reduc_f32_cost */
961 8, /* reduc_f64_cost */
962 8, /* store_elt_extra_cost */
963 8, /* vec_to_scalar_cost */
964 8, /* scalar_to_vec_cost */
965 4, /* align_load_cost */
966 4, /* unalign_load_cost */
967 1, /* unalign_store_cost */
968 1 /* store_cost */
971 /* Cortex-A57 costs for vector insn classes. */
972 static const struct cpu_vector_cost cortexa57_vector_cost =
974 1, /* scalar_int_stmt_cost */
975 1, /* scalar_fp_stmt_cost */
976 4, /* scalar_load_cost */
977 1, /* scalar_store_cost */
978 1, /* cond_taken_branch_cost */
979 1, /* cond_not_taken_branch_cost */
980 &cortexa57_advsimd_vector_cost, /* advsimd */
981 nullptr, /* sve */
982 nullptr /* issue_info */
985 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
987 3, /* int_stmt_cost */
988 3, /* fp_stmt_cost */
989 0, /* ld2_st2_permute_cost */
990 0, /* ld3_st3_permute_cost */
991 0, /* ld4_st4_permute_cost */
992 3, /* permute_cost */
993 3, /* reduc_i8_cost */
994 3, /* reduc_i16_cost */
995 3, /* reduc_i32_cost */
996 3, /* reduc_i64_cost */
997 3, /* reduc_f16_cost */
998 3, /* reduc_f32_cost */
999 3, /* reduc_f64_cost */
1000 3, /* store_elt_extra_cost */
1001 3, /* vec_to_scalar_cost */
1002 3, /* scalar_to_vec_cost */
1003 5, /* align_load_cost */
1004 5, /* unalign_load_cost */
1005 1, /* unalign_store_cost */
1006 1 /* store_cost */
1009 static const struct cpu_vector_cost exynosm1_vector_cost =
1011 1, /* scalar_int_stmt_cost */
1012 1, /* scalar_fp_stmt_cost */
1013 5, /* scalar_load_cost */
1014 1, /* scalar_store_cost */
1015 1, /* cond_taken_branch_cost */
1016 1, /* cond_not_taken_branch_cost */
1017 &exynosm1_advsimd_vector_cost, /* advsimd */
1018 nullptr, /* sve */
1019 nullptr /* issue_info */
1022 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1024 2, /* int_stmt_cost */
1025 2, /* fp_stmt_cost */
1026 0, /* ld2_st2_permute_cost */
1027 0, /* ld3_st3_permute_cost */
1028 0, /* ld4_st4_permute_cost */
1029 2, /* permute_cost */
1030 4, /* reduc_i8_cost */
1031 4, /* reduc_i16_cost */
1032 4, /* reduc_i32_cost */
1033 4, /* reduc_i64_cost */
1034 4, /* reduc_f16_cost */
1035 4, /* reduc_f32_cost */
1036 4, /* reduc_f64_cost */
1037 4, /* store_elt_extra_cost */
1038 4, /* vec_to_scalar_cost */
1039 4, /* scalar_to_vec_cost */
1040 10, /* align_load_cost */
1041 10, /* unalign_load_cost */
1042 2, /* unalign_store_cost */
1043 2 /* store_cost */
1046 /* Generic costs for vector insn classes. */
1047 static const struct cpu_vector_cost xgene1_vector_cost =
1049 1, /* scalar_int_stmt_cost */
1050 1, /* scalar_fp_stmt_cost */
1051 5, /* scalar_load_cost */
1052 1, /* scalar_store_cost */
1053 2, /* cond_taken_branch_cost */
1054 1, /* cond_not_taken_branch_cost */
1055 &xgene1_advsimd_vector_cost, /* advsimd */
1056 nullptr, /* sve */
1057 nullptr /* issue_info */
1060 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1062 4, /* int_stmt_cost */
1063 5, /* fp_stmt_cost */
1064 0, /* ld2_st2_permute_cost */
1065 0, /* ld3_st3_permute_cost */
1066 0, /* ld4_st4_permute_cost */
1067 10, /* permute_cost */
1068 6, /* reduc_i8_cost */
1069 6, /* reduc_i16_cost */
1070 6, /* reduc_i32_cost */
1071 6, /* reduc_i64_cost */
1072 6, /* reduc_f16_cost */
1073 6, /* reduc_f32_cost */
1074 6, /* reduc_f64_cost */
1075 6, /* store_elt_extra_cost */
1076 6, /* vec_to_scalar_cost */
1077 5, /* scalar_to_vec_cost */
1078 4, /* align_load_cost */
1079 4, /* unalign_load_cost */
1080 1, /* unalign_store_cost */
1081 1 /* store_cost */
1084 /* Costs for vector insn classes for Vulcan. */
1085 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1087 1, /* scalar_int_stmt_cost */
1088 6, /* scalar_fp_stmt_cost */
1089 4, /* scalar_load_cost */
1090 1, /* scalar_store_cost */
1091 2, /* cond_taken_branch_cost */
1092 1, /* cond_not_taken_branch_cost */
1093 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1094 nullptr, /* sve */
1095 nullptr /* issue_info */
1098 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1100 5, /* int_stmt_cost */
1101 5, /* fp_stmt_cost */
1102 0, /* ld2_st2_permute_cost */
1103 0, /* ld3_st3_permute_cost */
1104 0, /* ld4_st4_permute_cost */
1105 10, /* permute_cost */
1106 5, /* reduc_i8_cost */
1107 5, /* reduc_i16_cost */
1108 5, /* reduc_i32_cost */
1109 5, /* reduc_i64_cost */
1110 5, /* reduc_f16_cost */
1111 5, /* reduc_f32_cost */
1112 5, /* reduc_f64_cost */
1113 5, /* store_elt_extra_cost */
1114 5, /* vec_to_scalar_cost */
1115 5, /* scalar_to_vec_cost */
1116 4, /* align_load_cost */
1117 4, /* unalign_load_cost */
1118 4, /* unalign_store_cost */
1119 4 /* store_cost */
1122 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1124 1, /* scalar_int_stmt_cost */
1125 5, /* scalar_fp_stmt_cost */
1126 4, /* scalar_load_cost */
1127 1, /* scalar_store_cost */
1128 2, /* cond_taken_branch_cost */
1129 1, /* cond_not_taken_branch_cost */
1130 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1131 nullptr, /* sve */
1132 nullptr /* issue_info */
1135 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1137 3, /* int_stmt_cost */
1138 3, /* fp_stmt_cost */
1139 0, /* ld2_st2_permute_cost */
1140 0, /* ld3_st3_permute_cost */
1141 0, /* ld4_st4_permute_cost */
1142 2, /* permute_cost */
1143 12, /* reduc_i8_cost */
1144 9, /* reduc_i16_cost */
1145 6, /* reduc_i32_cost */
1146 5, /* reduc_i64_cost */
1147 9, /* reduc_f16_cost */
1148 6, /* reduc_f32_cost */
1149 5, /* reduc_f64_cost */
1150 8, /* store_elt_extra_cost */
1151 6, /* vec_to_scalar_cost */
1152 7, /* scalar_to_vec_cost */
1153 5, /* align_load_cost */
1154 5, /* unalign_load_cost */
1155 2, /* unalign_store_cost */
1156 2 /* store_cost */
1159 /* Ampere-1 costs for vector insn classes. */
1160 static const struct cpu_vector_cost ampere1_vector_cost =
1162 1, /* scalar_int_stmt_cost */
1163 1, /* scalar_fp_stmt_cost */
1164 4, /* scalar_load_cost */
1165 1, /* scalar_store_cost */
1166 1, /* cond_taken_branch_cost */
1167 1, /* cond_not_taken_branch_cost */
1168 &ampere1_advsimd_vector_cost, /* advsimd */
1169 nullptr, /* sve */
1170 nullptr /* issue_info */
1173 /* Generic costs for branch instructions. */
1174 static const struct cpu_branch_cost generic_branch_cost =
1176 1, /* Predictable. */
1177 3 /* Unpredictable. */
1180 /* Generic approximation modes. */
1181 static const cpu_approx_modes generic_approx_modes =
1183 AARCH64_APPROX_NONE, /* division */
1184 AARCH64_APPROX_NONE, /* sqrt */
1185 AARCH64_APPROX_NONE /* recip_sqrt */
1188 /* Approximation modes for Exynos M1. */
1189 static const cpu_approx_modes exynosm1_approx_modes =
1191 AARCH64_APPROX_NONE, /* division */
1192 AARCH64_APPROX_ALL, /* sqrt */
1193 AARCH64_APPROX_ALL /* recip_sqrt */
1196 /* Approximation modes for X-Gene 1. */
1197 static const cpu_approx_modes xgene1_approx_modes =
1199 AARCH64_APPROX_NONE, /* division */
1200 AARCH64_APPROX_NONE, /* sqrt */
1201 AARCH64_APPROX_ALL /* recip_sqrt */
1204 /* Generic prefetch settings (which disable prefetch). */
1205 static const cpu_prefetch_tune generic_prefetch_tune =
1207 0, /* num_slots */
1208 -1, /* l1_cache_size */
1209 -1, /* l1_cache_line_size */
1210 -1, /* l2_cache_size */
1211 true, /* prefetch_dynamic_strides */
1212 -1, /* minimum_stride */
1213 -1 /* default_opt_level */
1216 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1218 0, /* num_slots */
1219 -1, /* l1_cache_size */
1220 64, /* l1_cache_line_size */
1221 -1, /* l2_cache_size */
1222 true, /* prefetch_dynamic_strides */
1223 -1, /* minimum_stride */
1224 -1 /* default_opt_level */
1227 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1229 4, /* num_slots */
1230 32, /* l1_cache_size */
1231 64, /* l1_cache_line_size */
1232 512, /* l2_cache_size */
1233 false, /* prefetch_dynamic_strides */
1234 2048, /* minimum_stride */
1235 3 /* default_opt_level */
1238 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1240 8, /* num_slots */
1241 32, /* l1_cache_size */
1242 128, /* l1_cache_line_size */
1243 16*1024, /* l2_cache_size */
1244 true, /* prefetch_dynamic_strides */
1245 -1, /* minimum_stride */
1246 3 /* default_opt_level */
1249 static const cpu_prefetch_tune thunderx_prefetch_tune =
1251 8, /* num_slots */
1252 32, /* l1_cache_size */
1253 128, /* l1_cache_line_size */
1254 -1, /* l2_cache_size */
1255 true, /* prefetch_dynamic_strides */
1256 -1, /* minimum_stride */
1257 -1 /* default_opt_level */
1260 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1262 8, /* num_slots */
1263 32, /* l1_cache_size */
1264 64, /* l1_cache_line_size */
1265 256, /* l2_cache_size */
1266 true, /* prefetch_dynamic_strides */
1267 -1, /* minimum_stride */
1268 -1 /* default_opt_level */
1271 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1273 8, /* num_slots */
1274 32, /* l1_cache_size */
1275 64, /* l1_cache_line_size */
1276 256, /* l2_cache_size */
1277 true, /* prefetch_dynamic_strides */
1278 -1, /* minimum_stride */
1279 -1 /* default_opt_level */
1282 static const cpu_prefetch_tune tsv110_prefetch_tune =
1284 0, /* num_slots */
1285 64, /* l1_cache_size */
1286 64, /* l1_cache_line_size */
1287 512, /* l2_cache_size */
1288 true, /* prefetch_dynamic_strides */
1289 -1, /* minimum_stride */
1290 -1 /* default_opt_level */
1293 static const cpu_prefetch_tune xgene1_prefetch_tune =
1295 8, /* num_slots */
1296 32, /* l1_cache_size */
1297 64, /* l1_cache_line_size */
1298 256, /* l2_cache_size */
1299 true, /* prefetch_dynamic_strides */
1300 -1, /* minimum_stride */
1301 -1 /* default_opt_level */
1304 static const cpu_prefetch_tune a64fx_prefetch_tune =
1306 8, /* num_slots */
1307 64, /* l1_cache_size */
1308 256, /* l1_cache_line_size */
1309 32768, /* l2_cache_size */
1310 true, /* prefetch_dynamic_strides */
1311 -1, /* minimum_stride */
1312 -1 /* default_opt_level */
1315 static const cpu_prefetch_tune ampere1_prefetch_tune =
1317 0, /* num_slots */
1318 64, /* l1_cache_size */
1319 64, /* l1_cache_line_size */
1320 2048, /* l2_cache_size */
1321 true, /* prefetch_dynamic_strides */
1322 -1, /* minimum_stride */
1323 -1 /* default_opt_level */
1326 static const struct tune_params generic_tunings =
1328 &cortexa57_extra_costs,
1329 &generic_addrcost_table,
1330 &generic_regmove_cost,
1331 &generic_vector_cost,
1332 &generic_branch_cost,
1333 &generic_approx_modes,
1334 SVE_NOT_IMPLEMENTED, /* sve_width */
1335 { 4, /* load_int. */
1336 4, /* store_int. */
1337 4, /* load_fp. */
1338 4, /* store_fp. */
1339 4, /* load_pred. */
1340 4 /* store_pred. */
1341 }, /* memmov_cost. */
1342 2, /* issue_rate */
1343 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1344 "16:12", /* function_align. */
1345 "4", /* jump_align. */
1346 "8", /* loop_align. */
1347 2, /* int_reassoc_width. */
1348 4, /* fp_reassoc_width. */
1349 1, /* vec_reassoc_width. */
1350 2, /* min_div_recip_mul_sf. */
1351 2, /* min_div_recip_mul_df. */
1352 0, /* max_case_values. */
1353 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1354 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1355 Neoverse V1. It does not have a noticeable effect on A64FX and should
1356 have at most a very minor effect on SVE2 cores. */
1357 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1358 &generic_prefetch_tune
1361 static const struct tune_params cortexa35_tunings =
1363 &cortexa53_extra_costs,
1364 &generic_addrcost_table,
1365 &cortexa53_regmove_cost,
1366 &generic_vector_cost,
1367 &generic_branch_cost,
1368 &generic_approx_modes,
1369 SVE_NOT_IMPLEMENTED, /* sve_width */
1370 { 4, /* load_int. */
1371 4, /* store_int. */
1372 4, /* load_fp. */
1373 4, /* store_fp. */
1374 4, /* load_pred. */
1375 4 /* store_pred. */
1376 }, /* memmov_cost. */
1377 1, /* issue_rate */
1378 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1379 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1380 "16", /* function_align. */
1381 "4", /* jump_align. */
1382 "8", /* loop_align. */
1383 2, /* int_reassoc_width. */
1384 4, /* fp_reassoc_width. */
1385 1, /* vec_reassoc_width. */
1386 2, /* min_div_recip_mul_sf. */
1387 2, /* min_div_recip_mul_df. */
1388 0, /* max_case_values. */
1389 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1390 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1391 &generic_prefetch_tune
1394 static const struct tune_params cortexa53_tunings =
1396 &cortexa53_extra_costs,
1397 &generic_addrcost_table,
1398 &cortexa53_regmove_cost,
1399 &generic_vector_cost,
1400 &generic_branch_cost,
1401 &generic_approx_modes,
1402 SVE_NOT_IMPLEMENTED, /* sve_width */
1403 { 4, /* load_int. */
1404 4, /* store_int. */
1405 4, /* load_fp. */
1406 4, /* store_fp. */
1407 4, /* load_pred. */
1408 4 /* store_pred. */
1409 }, /* memmov_cost. */
1410 2, /* issue_rate */
1411 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1412 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1413 "16", /* function_align. */
1414 "4", /* jump_align. */
1415 "8", /* loop_align. */
1416 2, /* int_reassoc_width. */
1417 4, /* fp_reassoc_width. */
1418 1, /* vec_reassoc_width. */
1419 2, /* min_div_recip_mul_sf. */
1420 2, /* min_div_recip_mul_df. */
1421 0, /* max_case_values. */
1422 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1423 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1424 &generic_prefetch_tune
1427 static const struct tune_params cortexa57_tunings =
1429 &cortexa57_extra_costs,
1430 &generic_addrcost_table,
1431 &cortexa57_regmove_cost,
1432 &cortexa57_vector_cost,
1433 &generic_branch_cost,
1434 &generic_approx_modes,
1435 SVE_NOT_IMPLEMENTED, /* sve_width */
1436 { 4, /* load_int. */
1437 4, /* store_int. */
1438 4, /* load_fp. */
1439 4, /* store_fp. */
1440 4, /* load_pred. */
1441 4 /* store_pred. */
1442 }, /* memmov_cost. */
1443 3, /* issue_rate */
1444 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1445 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1446 "16", /* function_align. */
1447 "4", /* jump_align. */
1448 "8", /* loop_align. */
1449 2, /* int_reassoc_width. */
1450 4, /* fp_reassoc_width. */
1451 1, /* vec_reassoc_width. */
1452 2, /* min_div_recip_mul_sf. */
1453 2, /* min_div_recip_mul_df. */
1454 0, /* max_case_values. */
1455 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1456 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1457 &generic_prefetch_tune
1460 static const struct tune_params cortexa72_tunings =
1462 &cortexa57_extra_costs,
1463 &generic_addrcost_table,
1464 &cortexa57_regmove_cost,
1465 &cortexa57_vector_cost,
1466 &generic_branch_cost,
1467 &generic_approx_modes,
1468 SVE_NOT_IMPLEMENTED, /* sve_width */
1469 { 4, /* load_int. */
1470 4, /* store_int. */
1471 4, /* load_fp. */
1472 4, /* store_fp. */
1473 4, /* load_pred. */
1474 4 /* store_pred. */
1475 }, /* memmov_cost. */
1476 3, /* issue_rate */
1477 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1478 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1479 "16", /* function_align. */
1480 "4", /* jump_align. */
1481 "8", /* loop_align. */
1482 2, /* int_reassoc_width. */
1483 4, /* fp_reassoc_width. */
1484 1, /* vec_reassoc_width. */
1485 2, /* min_div_recip_mul_sf. */
1486 2, /* min_div_recip_mul_df. */
1487 0, /* max_case_values. */
1488 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1489 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1490 &generic_prefetch_tune
1493 static const struct tune_params cortexa73_tunings =
1495 &cortexa57_extra_costs,
1496 &generic_addrcost_table,
1497 &cortexa57_regmove_cost,
1498 &cortexa57_vector_cost,
1499 &generic_branch_cost,
1500 &generic_approx_modes,
1501 SVE_NOT_IMPLEMENTED, /* sve_width */
1502 { 4, /* load_int. */
1503 4, /* store_int. */
1504 4, /* load_fp. */
1505 4, /* store_fp. */
1506 4, /* load_pred. */
1507 4 /* store_pred. */
1508 }, /* memmov_cost. */
1509 2, /* issue_rate. */
1510 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1511 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1512 "16", /* function_align. */
1513 "4", /* jump_align. */
1514 "8", /* loop_align. */
1515 2, /* int_reassoc_width. */
1516 4, /* fp_reassoc_width. */
1517 1, /* vec_reassoc_width. */
1518 2, /* min_div_recip_mul_sf. */
1519 2, /* min_div_recip_mul_df. */
1520 0, /* max_case_values. */
1521 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1522 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1523 &generic_prefetch_tune
1528 static const struct tune_params exynosm1_tunings =
1530 &exynosm1_extra_costs,
1531 &exynosm1_addrcost_table,
1532 &exynosm1_regmove_cost,
1533 &exynosm1_vector_cost,
1534 &generic_branch_cost,
1535 &exynosm1_approx_modes,
1536 SVE_NOT_IMPLEMENTED, /* sve_width */
1537 { 4, /* load_int. */
1538 4, /* store_int. */
1539 4, /* load_fp. */
1540 4, /* store_fp. */
1541 4, /* load_pred. */
1542 4 /* store_pred. */
1543 }, /* memmov_cost. */
1544 3, /* issue_rate */
1545 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1546 "4", /* function_align. */
1547 "4", /* jump_align. */
1548 "4", /* loop_align. */
1549 2, /* int_reassoc_width. */
1550 4, /* fp_reassoc_width. */
1551 1, /* vec_reassoc_width. */
1552 2, /* min_div_recip_mul_sf. */
1553 2, /* min_div_recip_mul_df. */
1554 48, /* max_case_values. */
1555 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1556 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1557 &exynosm1_prefetch_tune
1560 static const struct tune_params thunderxt88_tunings =
1562 &thunderx_extra_costs,
1563 &generic_addrcost_table,
1564 &thunderx_regmove_cost,
1565 &thunderx_vector_cost,
1566 &generic_branch_cost,
1567 &generic_approx_modes,
1568 SVE_NOT_IMPLEMENTED, /* sve_width */
1569 { 6, /* load_int. */
1570 6, /* store_int. */
1571 6, /* load_fp. */
1572 6, /* store_fp. */
1573 6, /* load_pred. */
1574 6 /* store_pred. */
1575 }, /* memmov_cost. */
1576 2, /* issue_rate */
1577 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1578 "8", /* function_align. */
1579 "8", /* jump_align. */
1580 "8", /* loop_align. */
1581 2, /* int_reassoc_width. */
1582 4, /* fp_reassoc_width. */
1583 1, /* vec_reassoc_width. */
1584 2, /* min_div_recip_mul_sf. */
1585 2, /* min_div_recip_mul_df. */
1586 0, /* max_case_values. */
1587 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1588 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1589 &thunderxt88_prefetch_tune
1592 static const struct tune_params thunderx_tunings =
1594 &thunderx_extra_costs,
1595 &generic_addrcost_table,
1596 &thunderx_regmove_cost,
1597 &thunderx_vector_cost,
1598 &generic_branch_cost,
1599 &generic_approx_modes,
1600 SVE_NOT_IMPLEMENTED, /* sve_width */
1601 { 6, /* load_int. */
1602 6, /* store_int. */
1603 6, /* load_fp. */
1604 6, /* store_fp. */
1605 6, /* load_pred. */
1606 6 /* store_pred. */
1607 }, /* memmov_cost. */
1608 2, /* issue_rate */
1609 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1610 "8", /* function_align. */
1611 "8", /* jump_align. */
1612 "8", /* loop_align. */
1613 2, /* int_reassoc_width. */
1614 4, /* fp_reassoc_width. */
1615 1, /* vec_reassoc_width. */
1616 2, /* min_div_recip_mul_sf. */
1617 2, /* min_div_recip_mul_df. */
1618 0, /* max_case_values. */
1619 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1620 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1621 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1622 &thunderx_prefetch_tune
1625 static const struct tune_params tsv110_tunings =
1627 &tsv110_extra_costs,
1628 &tsv110_addrcost_table,
1629 &tsv110_regmove_cost,
1630 &tsv110_vector_cost,
1631 &generic_branch_cost,
1632 &generic_approx_modes,
1633 SVE_NOT_IMPLEMENTED, /* sve_width */
1634 { 4, /* load_int. */
1635 4, /* store_int. */
1636 4, /* load_fp. */
1637 4, /* store_fp. */
1638 4, /* load_pred. */
1639 4 /* store_pred. */
1640 }, /* memmov_cost. */
1641 4, /* issue_rate */
1642 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1643 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1644 "16", /* function_align. */
1645 "4", /* jump_align. */
1646 "8", /* loop_align. */
1647 2, /* int_reassoc_width. */
1648 4, /* fp_reassoc_width. */
1649 1, /* vec_reassoc_width. */
1650 2, /* min_div_recip_mul_sf. */
1651 2, /* min_div_recip_mul_df. */
1652 0, /* max_case_values. */
1653 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1654 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1655 &tsv110_prefetch_tune
1658 static const struct tune_params xgene1_tunings =
1660 &xgene1_extra_costs,
1661 &xgene1_addrcost_table,
1662 &xgene1_regmove_cost,
1663 &xgene1_vector_cost,
1664 &generic_branch_cost,
1665 &xgene1_approx_modes,
1666 SVE_NOT_IMPLEMENTED, /* sve_width */
1667 { 6, /* load_int. */
1668 6, /* store_int. */
1669 6, /* load_fp. */
1670 6, /* store_fp. */
1671 6, /* load_pred. */
1672 6 /* store_pred. */
1673 }, /* memmov_cost. */
1674 4, /* issue_rate */
1675 AARCH64_FUSE_NOTHING, /* fusible_ops */
1676 "16", /* function_align. */
1677 "16", /* jump_align. */
1678 "16", /* loop_align. */
1679 2, /* int_reassoc_width. */
1680 4, /* fp_reassoc_width. */
1681 1, /* vec_reassoc_width. */
1682 2, /* min_div_recip_mul_sf. */
1683 2, /* min_div_recip_mul_df. */
1684 17, /* max_case_values. */
1685 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1686 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1687 &xgene1_prefetch_tune
1690 static const struct tune_params emag_tunings =
1692 &xgene1_extra_costs,
1693 &xgene1_addrcost_table,
1694 &xgene1_regmove_cost,
1695 &xgene1_vector_cost,
1696 &generic_branch_cost,
1697 &xgene1_approx_modes,
1698 SVE_NOT_IMPLEMENTED,
1699 { 6, /* load_int. */
1700 6, /* store_int. */
1701 6, /* load_fp. */
1702 6, /* store_fp. */
1703 6, /* load_pred. */
1704 6 /* store_pred. */
1705 }, /* memmov_cost. */
1706 4, /* issue_rate */
1707 AARCH64_FUSE_NOTHING, /* fusible_ops */
1708 "16", /* function_align. */
1709 "16", /* jump_align. */
1710 "16", /* loop_align. */
1711 2, /* int_reassoc_width. */
1712 4, /* fp_reassoc_width. */
1713 1, /* vec_reassoc_width. */
1714 2, /* min_div_recip_mul_sf. */
1715 2, /* min_div_recip_mul_df. */
1716 17, /* max_case_values. */
1717 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1718 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1719 &xgene1_prefetch_tune
1722 static const struct tune_params qdf24xx_tunings =
1724 &qdf24xx_extra_costs,
1725 &qdf24xx_addrcost_table,
1726 &qdf24xx_regmove_cost,
1727 &qdf24xx_vector_cost,
1728 &generic_branch_cost,
1729 &generic_approx_modes,
1730 SVE_NOT_IMPLEMENTED, /* sve_width */
1731 { 4, /* load_int. */
1732 4, /* store_int. */
1733 4, /* load_fp. */
1734 4, /* store_fp. */
1735 4, /* load_pred. */
1736 4 /* store_pred. */
1737 }, /* memmov_cost. */
1738 4, /* issue_rate */
1739 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1740 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
1741 "16", /* function_align. */
1742 "8", /* jump_align. */
1743 "16", /* loop_align. */
1744 2, /* int_reassoc_width. */
1745 4, /* fp_reassoc_width. */
1746 1, /* vec_reassoc_width. */
1747 2, /* min_div_recip_mul_sf. */
1748 2, /* min_div_recip_mul_df. */
1749 0, /* max_case_values. */
1750 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1751 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1752 &qdf24xx_prefetch_tune
1755 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1756 for now. */
1757 static const struct tune_params saphira_tunings =
1759 &generic_extra_costs,
1760 &generic_addrcost_table,
1761 &generic_regmove_cost,
1762 &generic_vector_cost,
1763 &generic_branch_cost,
1764 &generic_approx_modes,
1765 SVE_NOT_IMPLEMENTED, /* sve_width */
1766 { 4, /* load_int. */
1767 4, /* store_int. */
1768 4, /* load_fp. */
1769 4, /* store_fp. */
1770 4, /* load_pred. */
1771 4 /* store_pred. */
1772 }, /* memmov_cost. */
1773 4, /* issue_rate */
1774 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1775 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
1776 "16", /* function_align. */
1777 "8", /* jump_align. */
1778 "16", /* loop_align. */
1779 2, /* int_reassoc_width. */
1780 4, /* fp_reassoc_width. */
1781 1, /* vec_reassoc_width. */
1782 2, /* min_div_recip_mul_sf. */
1783 2, /* min_div_recip_mul_df. */
1784 0, /* max_case_values. */
1785 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1786 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1787 &generic_prefetch_tune
1790 static const struct tune_params thunderx2t99_tunings =
1792 &thunderx2t99_extra_costs,
1793 &thunderx2t99_addrcost_table,
1794 &thunderx2t99_regmove_cost,
1795 &thunderx2t99_vector_cost,
1796 &generic_branch_cost,
1797 &generic_approx_modes,
1798 SVE_NOT_IMPLEMENTED, /* sve_width */
1799 { 4, /* load_int. */
1800 4, /* store_int. */
1801 4, /* load_fp. */
1802 4, /* store_fp. */
1803 4, /* load_pred. */
1804 4 /* store_pred. */
1805 }, /* memmov_cost. */
1806 4, /* issue_rate. */
1807 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1808 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1809 "16", /* function_align. */
1810 "8", /* jump_align. */
1811 "16", /* loop_align. */
1812 3, /* int_reassoc_width. */
1813 2, /* fp_reassoc_width. */
1814 2, /* vec_reassoc_width. */
1815 2, /* min_div_recip_mul_sf. */
1816 2, /* min_div_recip_mul_df. */
1817 0, /* max_case_values. */
1818 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1819 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1820 &thunderx2t99_prefetch_tune
1823 static const struct tune_params thunderx3t110_tunings =
1825 &thunderx3t110_extra_costs,
1826 &thunderx3t110_addrcost_table,
1827 &thunderx3t110_regmove_cost,
1828 &thunderx3t110_vector_cost,
1829 &generic_branch_cost,
1830 &generic_approx_modes,
1831 SVE_NOT_IMPLEMENTED, /* sve_width */
1832 { 4, /* load_int. */
1833 4, /* store_int. */
1834 4, /* load_fp. */
1835 4, /* store_fp. */
1836 4, /* load_pred. */
1837 4 /* store_pred. */
1838 }, /* memmov_cost. */
1839 6, /* issue_rate. */
1840 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1841 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1842 "16", /* function_align. */
1843 "8", /* jump_align. */
1844 "16", /* loop_align. */
1845 3, /* int_reassoc_width. */
1846 2, /* fp_reassoc_width. */
1847 2, /* vec_reassoc_width. */
1848 2, /* min_div_recip_mul_sf. */
1849 2, /* min_div_recip_mul_df. */
1850 0, /* max_case_values. */
1851 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1853 &thunderx3t110_prefetch_tune
1856 static const struct tune_params neoversen1_tunings =
1858 &cortexa76_extra_costs,
1859 &generic_addrcost_table,
1860 &generic_regmove_cost,
1861 &cortexa57_vector_cost,
1862 &generic_branch_cost,
1863 &generic_approx_modes,
1864 SVE_NOT_IMPLEMENTED, /* sve_width */
1865 { 4, /* load_int. */
1866 2, /* store_int. */
1867 5, /* load_fp. */
1868 2, /* store_fp. */
1869 4, /* load_pred. */
1870 4 /* store_pred. */
1871 }, /* memmov_cost. */
1872 3, /* issue_rate */
1873 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1874 "32:16", /* function_align. */
1875 "4", /* jump_align. */
1876 "32:16", /* loop_align. */
1877 2, /* int_reassoc_width. */
1878 4, /* fp_reassoc_width. */
1879 2, /* vec_reassoc_width. */
1880 2, /* min_div_recip_mul_sf. */
1881 2, /* min_div_recip_mul_df. */
1882 0, /* max_case_values. */
1883 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1884 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1885 &generic_prefetch_tune
1888 static const struct tune_params ampere1_tunings =
1890 &ampere1_extra_costs,
1891 &generic_addrcost_table,
1892 &generic_regmove_cost,
1893 &ampere1_vector_cost,
1894 &generic_branch_cost,
1895 &generic_approx_modes,
1896 SVE_NOT_IMPLEMENTED, /* sve_width */
1897 { 4, /* load_int. */
1898 4, /* store_int. */
1899 4, /* load_fp. */
1900 4, /* store_fp. */
1901 4, /* load_pred. */
1902 4 /* store_pred. */
1903 }, /* memmov_cost. */
1904 4, /* issue_rate */
1905 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1906 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1907 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1908 AARCH64_FUSE_CMP_BRANCH),
1909 /* fusible_ops */
1910 "32", /* function_align. */
1911 "4", /* jump_align. */
1912 "32:16", /* loop_align. */
1913 2, /* int_reassoc_width. */
1914 4, /* fp_reassoc_width. */
1915 2, /* vec_reassoc_width. */
1916 2, /* min_div_recip_mul_sf. */
1917 2, /* min_div_recip_mul_df. */
1918 0, /* max_case_values. */
1919 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1920 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1921 &ampere1_prefetch_tune
1924 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1926 2, /* int_stmt_cost */
1927 2, /* fp_stmt_cost */
1928 4, /* ld2_st2_permute_cost */
1929 4, /* ld3_st3_permute_cost */
1930 5, /* ld4_st4_permute_cost */
1931 3, /* permute_cost */
1932 4, /* reduc_i8_cost */
1933 4, /* reduc_i16_cost */
1934 2, /* reduc_i32_cost */
1935 2, /* reduc_i64_cost */
1936 6, /* reduc_f16_cost */
1937 3, /* reduc_f32_cost */
1938 2, /* reduc_f64_cost */
1939 2, /* store_elt_extra_cost */
1940 /* This value is just inherited from the Cortex-A57 table. */
1941 8, /* vec_to_scalar_cost */
1942 /* This depends very much on what the scalar value is and
1943 where it comes from. E.g. some constants take two dependent
1944 instructions or a load, while others might be moved from a GPR.
1945 4 seems to be a reasonable compromise in practice. */
1946 4, /* scalar_to_vec_cost */
1947 4, /* align_load_cost */
1948 4, /* unalign_load_cost */
1949 /* Although stores have a latency of 2 and compete for the
1950 vector pipes, in practice it's better not to model that. */
1951 1, /* unalign_store_cost */
1952 1 /* store_cost */
1955 static const sve_vec_cost neoversev1_sve_vector_cost =
1958 2, /* int_stmt_cost */
1959 2, /* fp_stmt_cost */
1960 4, /* ld2_st2_permute_cost */
1961 7, /* ld3_st3_permute_cost */
1962 8, /* ld4_st4_permute_cost */
1963 3, /* permute_cost */
1964 /* Theoretically, a reduction involving 31 scalar ADDs could
1965 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
1966 completes in 14 cycles, so give it a cost of 31 + 5. */
1967 36, /* reduc_i8_cost */
1968 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
1969 22, /* reduc_i16_cost */
1970 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
1971 14, /* reduc_i32_cost */
1972 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
1973 11, /* reduc_i64_cost */
1974 /* Theoretically, a reduction involving 15 scalar FADDs could
1975 complete in ~9 cycles and would have a cost of 30. FADDV
1976 completes in 13 cycles, so give it a cost of 30 + 4. */
1977 34, /* reduc_f16_cost */
1978 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
1979 19, /* reduc_f32_cost */
1980 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
1981 11, /* reduc_f64_cost */
1982 2, /* store_elt_extra_cost */
1983 /* This value is just inherited from the Cortex-A57 table. */
1984 8, /* vec_to_scalar_cost */
1985 /* See the comment above the Advanced SIMD versions. */
1986 4, /* scalar_to_vec_cost */
1987 4, /* align_load_cost */
1988 4, /* unalign_load_cost */
1989 /* Although stores have a latency of 2 and compete for the
1990 vector pipes, in practice it's better not to model that. */
1991 1, /* unalign_store_cost */
1992 1 /* store_cost */
1994 3, /* clast_cost */
1995 19, /* fadda_f16_cost */
1996 11, /* fadda_f32_cost */
1997 8, /* fadda_f64_cost */
1998 32, /* gather_load_x32_cost */
1999 16, /* gather_load_x64_cost */
2000 3 /* scatter_store_elt_cost */
2003 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2005 3, /* loads_stores_per_cycle */
2006 2, /* stores_per_cycle */
2007 4, /* general_ops_per_cycle */
2008 0, /* fp_simd_load_general_ops */
2009 1 /* fp_simd_store_general_ops */
2012 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2015 3, /* loads_stores_per_cycle */
2016 2, /* stores_per_cycle */
2017 4, /* general_ops_per_cycle */
2018 0, /* fp_simd_load_general_ops */
2019 1 /* fp_simd_store_general_ops */
2021 2, /* ld2_st2_general_ops */
2022 2, /* ld3_st3_general_ops */
2023 3 /* ld4_st4_general_ops */
2026 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2030 2, /* loads_per_cycle */
2031 2, /* stores_per_cycle */
2032 2, /* general_ops_per_cycle */
2033 0, /* fp_simd_load_general_ops */
2034 1 /* fp_simd_store_general_ops */
2036 2, /* ld2_st2_general_ops */
2037 2, /* ld3_st3_general_ops */
2038 3 /* ld4_st4_general_ops */
2040 1, /* pred_ops_per_cycle */
2041 2, /* while_pred_ops */
2042 2, /* int_cmp_pred_ops */
2043 1, /* fp_cmp_pred_ops */
2044 1, /* gather_scatter_pair_general_ops */
2045 1 /* gather_scatter_pair_pred_ops */
2048 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2050 &neoversev1_scalar_issue_info,
2051 &neoversev1_advsimd_issue_info,
2052 &neoversev1_sve_issue_info
2055 /* Neoverse V1 costs for vector insn classes. */
2056 static const struct cpu_vector_cost neoversev1_vector_cost =
2058 1, /* scalar_int_stmt_cost */
2059 2, /* scalar_fp_stmt_cost */
2060 4, /* scalar_load_cost */
2061 1, /* scalar_store_cost */
2062 1, /* cond_taken_branch_cost */
2063 1, /* cond_not_taken_branch_cost */
2064 &neoversev1_advsimd_vector_cost, /* advsimd */
2065 &neoversev1_sve_vector_cost, /* sve */
2066 &neoversev1_vec_issue_info /* issue_info */
2069 static const struct tune_params neoversev1_tunings =
2071 &cortexa76_extra_costs,
2072 &neoversev1_addrcost_table,
2073 &neoversev1_regmove_cost,
2074 &neoversev1_vector_cost,
2075 &generic_branch_cost,
2076 &generic_approx_modes,
2077 SVE_256, /* sve_width */
2078 { 4, /* load_int. */
2079 2, /* store_int. */
2080 6, /* load_fp. */
2081 2, /* store_fp. */
2082 6, /* load_pred. */
2083 1 /* store_pred. */
2084 }, /* memmov_cost. */
2085 3, /* issue_rate */
2086 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2087 "32:16", /* function_align. */
2088 "4", /* jump_align. */
2089 "32:16", /* loop_align. */
2090 2, /* int_reassoc_width. */
2091 4, /* fp_reassoc_width. */
2092 2, /* vec_reassoc_width. */
2093 2, /* min_div_recip_mul_sf. */
2094 2, /* min_div_recip_mul_df. */
2095 0, /* max_case_values. */
2096 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2097 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2098 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2099 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2100 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
2101 &generic_prefetch_tune
2104 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2107 2, /* int_stmt_cost */
2108 2, /* fp_stmt_cost */
2109 4, /* ld2_st2_permute_cost */
2110 5, /* ld3_st3_permute_cost */
2111 5, /* ld4_st4_permute_cost */
2112 3, /* permute_cost */
2113 /* Theoretically, a reduction involving 15 scalar ADDs could
2114 complete in ~5 cycles and would have a cost of 15. Assume that
2115 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2116 21, /* reduc_i8_cost */
2117 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2118 13, /* reduc_i16_cost */
2119 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2120 9, /* reduc_i32_cost */
2121 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2122 8, /* reduc_i64_cost */
2123 /* Theoretically, a reduction involving 7 scalar FADDs could
2124 complete in ~6 cycles and would have a cost of 14. Assume that
2125 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2126 16, /* reduc_f16_cost */
2127 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2128 8, /* reduc_f32_cost */
2129 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2130 4, /* reduc_f64_cost */
2131 2, /* store_elt_extra_cost */
2132 /* This value is just inherited from the Cortex-A57 table. */
2133 8, /* vec_to_scalar_cost */
2134 /* This depends very much on what the scalar value is and
2135 where it comes from. E.g. some constants take two dependent
2136 instructions or a load, while others might be moved from a GPR.
2137 4 seems to be a reasonable compromise in practice. */
2138 4, /* scalar_to_vec_cost */
2139 4, /* align_load_cost */
2140 4, /* unalign_load_cost */
2141 /* Although stores generally have a latency of 2 and compete for the
2142 vector pipes, in practice it's better not to model that. */
2143 1, /* unalign_store_cost */
2144 1 /* store_cost */
2146 3, /* clast_cost */
2147 10, /* fadda_f16_cost */
2148 6, /* fadda_f32_cost */
2149 4, /* fadda_f64_cost */
2150 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2151 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2152 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2153 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2154 (cost 2) to that, to avoid the difference being lost in rounding.
2156 There is no easy comparison between a strided Advanced SIMD x32 load
2157 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2158 operation more than a 64-bit gather. */
2159 14, /* gather_load_x32_cost */
2160 12, /* gather_load_x64_cost */
2161 3 /* scatter_store_elt_cost */
2164 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2168 3, /* loads_per_cycle */
2169 2, /* stores_per_cycle */
2170 4, /* general_ops_per_cycle */
2171 0, /* fp_simd_load_general_ops */
2172 1 /* fp_simd_store_general_ops */
2174 2, /* ld2_st2_general_ops */
2175 2, /* ld3_st3_general_ops */
2176 3 /* ld4_st4_general_ops */
2178 2, /* pred_ops_per_cycle */
2179 2, /* while_pred_ops */
2180 2, /* int_cmp_pred_ops */
2181 1, /* fp_cmp_pred_ops */
2182 1, /* gather_scatter_pair_general_ops */
2183 1 /* gather_scatter_pair_pred_ops */
2186 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2188 &neoversev1_scalar_issue_info,
2189 &neoversev1_advsimd_issue_info,
2190 &neoverse512tvb_sve_issue_info
2193 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2195 1, /* scalar_int_stmt_cost */
2196 2, /* scalar_fp_stmt_cost */
2197 4, /* scalar_load_cost */
2198 1, /* scalar_store_cost */
2199 1, /* cond_taken_branch_cost */
2200 1, /* cond_not_taken_branch_cost */
2201 &neoversev1_advsimd_vector_cost, /* advsimd */
2202 &neoverse512tvb_sve_vector_cost, /* sve */
2203 &neoverse512tvb_vec_issue_info /* issue_info */
2206 static const struct tune_params neoverse512tvb_tunings =
2208 &cortexa76_extra_costs,
2209 &neoversev1_addrcost_table,
2210 &neoversev1_regmove_cost,
2211 &neoverse512tvb_vector_cost,
2212 &generic_branch_cost,
2213 &generic_approx_modes,
2214 SVE_128 | SVE_256, /* sve_width */
2215 { 4, /* load_int. */
2216 2, /* store_int. */
2217 6, /* load_fp. */
2218 2, /* store_fp. */
2219 6, /* load_pred. */
2220 1 /* store_pred. */
2221 }, /* memmov_cost. */
2222 3, /* issue_rate */
2223 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2224 "32:16", /* function_align. */
2225 "4", /* jump_align. */
2226 "32:16", /* loop_align. */
2227 2, /* int_reassoc_width. */
2228 4, /* fp_reassoc_width. */
2229 2, /* vec_reassoc_width. */
2230 2, /* min_div_recip_mul_sf. */
2231 2, /* min_div_recip_mul_df. */
2232 0, /* max_case_values. */
2233 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2234 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2235 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2236 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2237 &generic_prefetch_tune
2240 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2242 2, /* int_stmt_cost */
2243 2, /* fp_stmt_cost */
2244 2, /* ld2_st2_permute_cost */
2245 2, /* ld3_st3_permute_cost */
2246 3, /* ld4_st4_permute_cost */
2247 3, /* permute_cost */
2248 4, /* reduc_i8_cost */
2249 4, /* reduc_i16_cost */
2250 2, /* reduc_i32_cost */
2251 2, /* reduc_i64_cost */
2252 6, /* reduc_f16_cost */
2253 4, /* reduc_f32_cost */
2254 2, /* reduc_f64_cost */
2255 2, /* store_elt_extra_cost */
2256 /* This value is just inherited from the Cortex-A57 table. */
2257 8, /* vec_to_scalar_cost */
2258 /* This depends very much on what the scalar value is and
2259 where it comes from. E.g. some constants take two dependent
2260 instructions or a load, while others might be moved from a GPR.
2261 4 seems to be a reasonable compromise in practice. */
2262 4, /* scalar_to_vec_cost */
2263 4, /* align_load_cost */
2264 4, /* unalign_load_cost */
2265 /* Although stores have a latency of 2 and compete for the
2266 vector pipes, in practice it's better not to model that. */
2267 1, /* unalign_store_cost */
2268 1 /* store_cost */
2271 static const sve_vec_cost neoversen2_sve_vector_cost =
2274 2, /* int_stmt_cost */
2275 2, /* fp_stmt_cost */
2276 3, /* ld2_st2_permute_cost */
2277 4, /* ld3_st3_permute_cost */
2278 4, /* ld4_st4_permute_cost */
2279 3, /* permute_cost */
2280 /* Theoretically, a reduction involving 15 scalar ADDs could
2281 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2282 completes in 11 cycles, so give it a cost of 15 + 6. */
2283 21, /* reduc_i8_cost */
2284 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2285 13, /* reduc_i16_cost */
2286 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2287 9, /* reduc_i32_cost */
2288 /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */
2289 2, /* reduc_i64_cost */
2290 /* Theoretically, a reduction involving 7 scalar FADDs could
2291 complete in ~8 cycles and would have a cost of 14. FADDV
2292 completes in 6 cycles, so give it a cost of 14 - 2. */
2293 12, /* reduc_f16_cost */
2294 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2295 6, /* reduc_f32_cost */
2296 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2297 2, /* reduc_f64_cost */
2298 2, /* store_elt_extra_cost */
2299 /* This value is just inherited from the Cortex-A57 table. */
2300 8, /* vec_to_scalar_cost */
2301 /* See the comment above the Advanced SIMD versions. */
2302 4, /* scalar_to_vec_cost */
2303 4, /* align_load_cost */
2304 4, /* unalign_load_cost */
2305 /* Although stores have a latency of 2 and compete for the
2306 vector pipes, in practice it's better not to model that. */
2307 1, /* unalign_store_cost */
2308 1 /* store_cost */
2310 3, /* clast_cost */
2311 10, /* fadda_f16_cost */
2312 6, /* fadda_f32_cost */
2313 4, /* fadda_f64_cost */
2314 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2315 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2316 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2317 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2318 (cost 2) to that, to avoid the difference being lost in rounding.
2320 There is no easy comparison between a strided Advanced SIMD x32 load
2321 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2322 operation more than a 64-bit gather. */
2323 14, /* gather_load_x32_cost */
2324 12, /* gather_load_x64_cost */
2325 3 /* scatter_store_elt_cost */
2328 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2330 3, /* loads_stores_per_cycle */
2331 2, /* stores_per_cycle */
2332 4, /* general_ops_per_cycle */
2333 0, /* fp_simd_load_general_ops */
2334 1 /* fp_simd_store_general_ops */
2337 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2340 3, /* loads_stores_per_cycle */
2341 2, /* stores_per_cycle */
2342 2, /* general_ops_per_cycle */
2343 0, /* fp_simd_load_general_ops */
2344 1 /* fp_simd_store_general_ops */
2346 2, /* ld2_st2_general_ops */
2347 2, /* ld3_st3_general_ops */
2348 3 /* ld4_st4_general_ops */
2351 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2355 3, /* loads_per_cycle */
2356 2, /* stores_per_cycle */
2357 2, /* general_ops_per_cycle */
2358 0, /* fp_simd_load_general_ops */
2359 1 /* fp_simd_store_general_ops */
2361 2, /* ld2_st2_general_ops */
2362 3, /* ld3_st3_general_ops */
2363 3 /* ld4_st4_general_ops */
2365 2, /* pred_ops_per_cycle */
2366 2, /* while_pred_ops */
2367 2, /* int_cmp_pred_ops */
2368 1, /* fp_cmp_pred_ops */
2369 1, /* gather_scatter_pair_general_ops */
2370 1 /* gather_scatter_pair_pred_ops */
2373 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2375 &neoversen2_scalar_issue_info,
2376 &neoversen2_advsimd_issue_info,
2377 &neoversen2_sve_issue_info
2380 /* Neoverse N2 costs for vector insn classes. */
2381 static const struct cpu_vector_cost neoversen2_vector_cost =
2383 1, /* scalar_int_stmt_cost */
2384 2, /* scalar_fp_stmt_cost */
2385 4, /* scalar_load_cost */
2386 1, /* scalar_store_cost */
2387 1, /* cond_taken_branch_cost */
2388 1, /* cond_not_taken_branch_cost */
2389 &neoversen2_advsimd_vector_cost, /* advsimd */
2390 &neoversen2_sve_vector_cost, /* sve */
2391 &neoversen2_vec_issue_info /* issue_info */
2394 static const struct tune_params neoversen2_tunings =
2396 &cortexa76_extra_costs,
2397 &neoversen2_addrcost_table,
2398 &neoversen2_regmove_cost,
2399 &neoversen2_vector_cost,
2400 &generic_branch_cost,
2401 &generic_approx_modes,
2402 SVE_128, /* sve_width */
2403 { 4, /* load_int. */
2404 1, /* store_int. */
2405 6, /* load_fp. */
2406 2, /* store_fp. */
2407 6, /* load_pred. */
2408 1 /* store_pred. */
2409 }, /* memmov_cost. */
2410 3, /* issue_rate */
2411 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2412 "32:16", /* function_align. */
2413 "4", /* jump_align. */
2414 "32:16", /* loop_align. */
2415 2, /* int_reassoc_width. */
2416 4, /* fp_reassoc_width. */
2417 2, /* vec_reassoc_width. */
2418 2, /* min_div_recip_mul_sf. */
2419 2, /* min_div_recip_mul_df. */
2420 0, /* max_case_values. */
2421 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2422 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2423 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2424 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2425 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2426 &generic_prefetch_tune
2429 static const advsimd_vec_cost demeter_advsimd_vector_cost =
2431 2, /* int_stmt_cost */
2432 2, /* fp_stmt_cost */
2433 2, /* ld2_st2_permute_cost */
2434 2, /* ld3_st3_permute_cost */
2435 3, /* ld4_st4_permute_cost */
2436 3, /* permute_cost */
2437 4, /* reduc_i8_cost */
2438 4, /* reduc_i16_cost */
2439 2, /* reduc_i32_cost */
2440 2, /* reduc_i64_cost */
2441 6, /* reduc_f16_cost */
2442 3, /* reduc_f32_cost */
2443 2, /* reduc_f64_cost */
2444 2, /* store_elt_extra_cost */
2445 /* This value is just inherited from the Cortex-A57 table. */
2446 8, /* vec_to_scalar_cost */
2447 /* This depends very much on what the scalar value is and
2448 where it comes from. E.g. some constants take two dependent
2449 instructions or a load, while others might be moved from a GPR.
2450 4 seems to be a reasonable compromise in practice. */
2451 4, /* scalar_to_vec_cost */
2452 4, /* align_load_cost */
2453 4, /* unalign_load_cost */
2454 /* Although stores have a latency of 2 and compete for the
2455 vector pipes, in practice it's better not to model that. */
2456 1, /* unalign_store_cost */
2457 1 /* store_cost */
2460 static const sve_vec_cost demeter_sve_vector_cost =
2463 2, /* int_stmt_cost */
2464 2, /* fp_stmt_cost */
2465 3, /* ld2_st2_permute_cost */
2466 3, /* ld3_st3_permute_cost */
2467 4, /* ld4_st4_permute_cost */
2468 3, /* permute_cost */
2469 /* Theoretically, a reduction involving 15 scalar ADDs could
2470 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2471 completes in 11 cycles, so give it a cost of 15 + 8. */
2472 21, /* reduc_i8_cost */
2473 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2474 14, /* reduc_i16_cost */
2475 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2476 7, /* reduc_i32_cost */
2477 /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */
2478 2, /* reduc_i64_cost */
2479 /* Theoretically, a reduction involving 7 scalar FADDs could
2480 complete in ~6 cycles and would have a cost of 14. FADDV
2481 completes in 8 cycles, so give it a cost of 14 + 2. */
2482 16, /* reduc_f16_cost */
2483 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2484 8, /* reduc_f32_cost */
2485 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2486 4, /* reduc_f64_cost */
2487 2, /* store_elt_extra_cost */
2488 /* This value is just inherited from the Cortex-A57 table. */
2489 8, /* vec_to_scalar_cost */
2490 /* See the comment above the Advanced SIMD versions. */
2491 4, /* scalar_to_vec_cost */
2492 4, /* align_load_cost */
2493 4, /* unalign_load_cost */
2494 /* Although stores have a latency of 2 and compete for the
2495 vector pipes, in practice it's better not to model that. */
2496 1, /* unalign_store_cost */
2497 1 /* store_cost */
2499 3, /* clast_cost */
2500 10, /* fadda_f16_cost */
2501 6, /* fadda_f32_cost */
2502 4, /* fadda_f64_cost */
2503 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2504 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2505 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2506 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2507 (cost 2) to that, to avoid the difference being lost in rounding.
2509 There is no easy comparison between a strided Advanced SIMD x32 load
2510 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2511 operation more than a 64-bit gather. */
2512 14, /* gather_load_x32_cost */
2513 12, /* gather_load_x64_cost */
2514 3 /* scatter_store_elt_cost */
2517 static const aarch64_scalar_vec_issue_info demeter_scalar_issue_info =
2519 3, /* loads_stores_per_cycle */
2520 2, /* stores_per_cycle */
2521 6, /* general_ops_per_cycle */
2522 0, /* fp_simd_load_general_ops */
2523 1 /* fp_simd_store_general_ops */
2526 static const aarch64_advsimd_vec_issue_info demeter_advsimd_issue_info =
2529 3, /* loads_stores_per_cycle */
2530 2, /* stores_per_cycle */
2531 4, /* general_ops_per_cycle */
2532 0, /* fp_simd_load_general_ops */
2533 1 /* fp_simd_store_general_ops */
2535 2, /* ld2_st2_general_ops */
2536 2, /* ld3_st3_general_ops */
2537 3 /* ld4_st4_general_ops */
2540 static const aarch64_sve_vec_issue_info demeter_sve_issue_info =
2544 3, /* loads_per_cycle */
2545 2, /* stores_per_cycle */
2546 4, /* general_ops_per_cycle */
2547 0, /* fp_simd_load_general_ops */
2548 1 /* fp_simd_store_general_ops */
2550 2, /* ld2_st2_general_ops */
2551 3, /* ld3_st3_general_ops */
2552 3 /* ld4_st4_general_ops */
2554 2, /* pred_ops_per_cycle */
2555 2, /* while_pred_ops */
2556 2, /* int_cmp_pred_ops */
2557 1, /* fp_cmp_pred_ops */
2558 1, /* gather_scatter_pair_general_ops */
2559 1 /* gather_scatter_pair_pred_ops */
2562 static const aarch64_vec_issue_info demeter_vec_issue_info =
2564 &demeter_scalar_issue_info,
2565 &demeter_advsimd_issue_info,
2566 &demeter_sve_issue_info
2569 /* Demeter costs for vector insn classes. */
2570 static const struct cpu_vector_cost demeter_vector_cost =
2572 1, /* scalar_int_stmt_cost */
2573 2, /* scalar_fp_stmt_cost */
2574 4, /* scalar_load_cost */
2575 1, /* scalar_store_cost */
2576 1, /* cond_taken_branch_cost */
2577 1, /* cond_not_taken_branch_cost */
2578 &demeter_advsimd_vector_cost, /* advsimd */
2579 &demeter_sve_vector_cost, /* sve */
2580 &demeter_vec_issue_info /* issue_info */
2583 static const struct tune_params demeter_tunings =
2585 &cortexa76_extra_costs,
2586 &demeter_addrcost_table,
2587 &demeter_regmove_cost,
2588 &demeter_vector_cost,
2589 &generic_branch_cost,
2590 &generic_approx_modes,
2591 SVE_128, /* sve_width */
2592 { 4, /* load_int. */
2593 2, /* store_int. */
2594 6, /* load_fp. */
2595 1, /* store_fp. */
2596 6, /* load_pred. */
2597 2 /* store_pred. */
2598 }, /* memmov_cost. */
2599 5, /* issue_rate */
2600 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2601 "32:16", /* function_align. */
2602 "4", /* jump_align. */
2603 "32:16", /* loop_align. */
2604 3, /* int_reassoc_width. */
2605 6, /* fp_reassoc_width. */
2606 3, /* vec_reassoc_width. */
2607 2, /* min_div_recip_mul_sf. */
2608 2, /* min_div_recip_mul_df. */
2609 0, /* max_case_values. */
2610 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2611 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2612 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2613 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2614 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2615 &generic_prefetch_tune
2618 static const struct tune_params a64fx_tunings =
2620 &a64fx_extra_costs,
2621 &a64fx_addrcost_table,
2622 &a64fx_regmove_cost,
2623 &a64fx_vector_cost,
2624 &generic_branch_cost,
2625 &generic_approx_modes,
2626 SVE_512, /* sve_width */
2627 { 4, /* load_int. */
2628 4, /* store_int. */
2629 4, /* load_fp. */
2630 4, /* store_fp. */
2631 4, /* load_pred. */
2632 4 /* store_pred. */
2633 }, /* memmov_cost. */
2634 7, /* issue_rate */
2635 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2636 "32", /* function_align. */
2637 "16", /* jump_align. */
2638 "32", /* loop_align. */
2639 4, /* int_reassoc_width. */
2640 2, /* fp_reassoc_width. */
2641 2, /* vec_reassoc_width. */
2642 2, /* min_div_recip_mul_sf. */
2643 2, /* min_div_recip_mul_df. */
2644 0, /* max_case_values. */
2645 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2646 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2647 &a64fx_prefetch_tune
2650 /* Support for fine-grained override of the tuning structures. */
2651 struct aarch64_tuning_override_function
2653 const char* name;
2654 void (*parse_override)(const char*, struct tune_params*);
2657 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2658 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2659 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2661 static const struct aarch64_tuning_override_function
2662 aarch64_tuning_override_functions[] =
2664 { "fuse", aarch64_parse_fuse_string },
2665 { "tune", aarch64_parse_tune_string },
2666 { "sve_width", aarch64_parse_sve_width_string },
2667 { NULL, NULL }
2670 /* A processor implementing AArch64. */
2671 struct processor
2673 const char *const name;
2674 enum aarch64_processor ident;
2675 enum aarch64_processor sched_core;
2676 enum aarch64_arch arch;
2677 const uint64_t flags;
2678 const struct tune_params *const tune;
2681 /* Architectures implementing AArch64. */
2682 static const struct processor all_architectures[] =
2684 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
2685 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, FLAGS, NULL},
2686 #include "aarch64-arches.def"
2687 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2690 /* Processor cores implementing AArch64. */
2691 static const struct processor all_cores[] =
2693 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
2694 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2695 FLAGS, &COSTS##_tunings},
2696 #include "aarch64-cores.def"
2697 {"generic", generic, cortexa53, AARCH64_ARCH_8A,
2698 AARCH64_FL_FOR_ARCH8, &generic_tunings},
2699 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2702 /* The current tuning set. */
2703 struct tune_params aarch64_tune_params = generic_tunings;
2705 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2707 static tree
2708 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2709 int, bool *no_add_attrs)
2711 /* Since we set fn_type_req to true, the caller should have checked
2712 this for us. */
2713 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2714 switch ((arm_pcs) fntype_abi (*node).id ())
2716 case ARM_PCS_AAPCS64:
2717 case ARM_PCS_SIMD:
2718 return NULL_TREE;
2720 case ARM_PCS_SVE:
2721 error ("the %qE attribute cannot be applied to an SVE function type",
2722 name);
2723 *no_add_attrs = true;
2724 return NULL_TREE;
2726 case ARM_PCS_TLSDESC:
2727 case ARM_PCS_UNKNOWN:
2728 break;
2730 gcc_unreachable ();
2733 /* Table of machine attributes. */
2734 static const struct attribute_spec aarch64_attribute_table[] =
2736 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2737 affects_type_identity, handler, exclude } */
2738 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2739 handle_aarch64_vector_pcs_attribute, NULL },
2740 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2741 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2742 NULL },
2743 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
2744 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
2745 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
2746 { NULL, 0, 0, false, false, false, false, NULL, NULL }
2749 /* An ISA extension in the co-processor and main instruction set space. */
2750 struct aarch64_option_extension
2752 const char *const name;
2753 const unsigned long flags_on;
2754 const unsigned long flags_off;
2757 typedef enum aarch64_cond_code
2759 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2760 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2761 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2763 aarch64_cc;
2765 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
2767 struct aarch64_branch_protect_type
2769 /* The type's name that the user passes to the branch-protection option
2770 string. */
2771 const char* name;
2772 /* Function to handle the protection type and set global variables.
2773 First argument is the string token corresponding with this type and the
2774 second argument is the next token in the option string.
2775 Return values:
2776 * AARCH64_PARSE_OK: Handling was sucessful.
2777 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
2778 should print an error.
2779 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
2780 own error. */
2781 enum aarch64_parse_opt_result (*handler)(char*, char*);
2782 /* A list of types that can follow this type in the option string. */
2783 const aarch64_branch_protect_type* subtypes;
2784 unsigned int num_subtypes;
2787 static enum aarch64_parse_opt_result
2788 aarch64_handle_no_branch_protection (char* str, char* rest)
2790 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2791 aarch64_enable_bti = 0;
2792 if (rest)
2794 error ("unexpected %<%s%> after %<%s%>", rest, str);
2795 return AARCH64_PARSE_INVALID_FEATURE;
2797 return AARCH64_PARSE_OK;
2800 static enum aarch64_parse_opt_result
2801 aarch64_handle_standard_branch_protection (char* str, char* rest)
2803 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2804 aarch64_ra_sign_key = AARCH64_KEY_A;
2805 aarch64_enable_bti = 1;
2806 if (rest)
2808 error ("unexpected %<%s%> after %<%s%>", rest, str);
2809 return AARCH64_PARSE_INVALID_FEATURE;
2811 return AARCH64_PARSE_OK;
2814 static enum aarch64_parse_opt_result
2815 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2816 char* rest ATTRIBUTE_UNUSED)
2818 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2819 aarch64_ra_sign_key = AARCH64_KEY_A;
2820 return AARCH64_PARSE_OK;
2823 static enum aarch64_parse_opt_result
2824 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2825 char* rest ATTRIBUTE_UNUSED)
2827 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2828 return AARCH64_PARSE_OK;
2831 static enum aarch64_parse_opt_result
2832 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2833 char* rest ATTRIBUTE_UNUSED)
2835 aarch64_ra_sign_key = AARCH64_KEY_B;
2836 return AARCH64_PARSE_OK;
2839 static enum aarch64_parse_opt_result
2840 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2841 char* rest ATTRIBUTE_UNUSED)
2843 aarch64_enable_bti = 1;
2844 return AARCH64_PARSE_OK;
2847 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2848 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2849 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2850 { NULL, NULL, NULL, 0 }
2853 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2854 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2855 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2856 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2857 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2858 { "bti", aarch64_handle_bti_protection, NULL, 0 },
2859 { NULL, NULL, NULL, 0 }
2862 /* The condition codes of the processor, and the inverse function. */
2863 static const char * const aarch64_condition_codes[] =
2865 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2866 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2869 /* The preferred condition codes for SVE conditions. */
2870 static const char *const aarch64_sve_condition_codes[] =
2872 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2873 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2876 /* Return the assembly token for svpattern value VALUE. */
2878 static const char *
2879 svpattern_token (enum aarch64_svpattern pattern)
2881 switch (pattern)
2883 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2884 AARCH64_FOR_SVPATTERN (CASE)
2885 #undef CASE
2886 case AARCH64_NUM_SVPATTERNS:
2887 break;
2889 gcc_unreachable ();
2892 /* Return the location of a piece that is known to be passed or returned
2893 in registers. FIRST_ZR is the first unused vector argument register
2894 and FIRST_PR is the first unused predicate argument register. */
2897 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2898 unsigned int first_pr) const
2900 gcc_assert (VECTOR_MODE_P (mode)
2901 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2902 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2904 if (num_zr > 0 && num_pr == 0)
2905 return gen_rtx_REG (mode, first_zr);
2907 if (num_zr == 0 && num_pr == 1)
2908 return gen_rtx_REG (mode, first_pr);
2910 gcc_unreachable ();
2913 /* Return the total number of vector registers required by the PST. */
2915 unsigned int
2916 pure_scalable_type_info::num_zr () const
2918 unsigned int res = 0;
2919 for (unsigned int i = 0; i < pieces.length (); ++i)
2920 res += pieces[i].num_zr;
2921 return res;
2924 /* Return the total number of predicate registers required by the PST. */
2926 unsigned int
2927 pure_scalable_type_info::num_pr () const
2929 unsigned int res = 0;
2930 for (unsigned int i = 0; i < pieces.length (); ++i)
2931 res += pieces[i].num_pr;
2932 return res;
2935 /* Return the location of a PST that is known to be passed or returned
2936 in registers. FIRST_ZR is the first unused vector argument register
2937 and FIRST_PR is the first unused predicate argument register. */
2940 pure_scalable_type_info::get_rtx (machine_mode mode,
2941 unsigned int first_zr,
2942 unsigned int first_pr) const
2944 /* Try to return a single REG if possible. This leads to better
2945 code generation; it isn't required for correctness. */
2946 if (mode == pieces[0].mode)
2948 gcc_assert (pieces.length () == 1);
2949 return pieces[0].get_rtx (first_zr, first_pr);
2952 /* Build up a PARALLEL that contains the individual pieces. */
2953 rtvec rtxes = rtvec_alloc (pieces.length ());
2954 for (unsigned int i = 0; i < pieces.length (); ++i)
2956 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
2957 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
2958 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
2959 first_zr += pieces[i].num_zr;
2960 first_pr += pieces[i].num_pr;
2962 return gen_rtx_PARALLEL (mode, rtxes);
2965 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
2966 in the AAPCS64. */
2968 pure_scalable_type_info::analysis_result
2969 pure_scalable_type_info::analyze (const_tree type)
2971 /* Prevent accidental reuse. */
2972 gcc_assert (pieces.is_empty ());
2974 /* No code will be generated for erroneous types, so we won't establish
2975 an ABI mapping. */
2976 if (type == error_mark_node)
2977 return NO_ABI_IDENTITY;
2979 /* Zero-sized types disappear in the language->ABI mapping. */
2980 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2981 return NO_ABI_IDENTITY;
2983 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
2984 piece p = {};
2985 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
2987 machine_mode mode = TYPE_MODE_RAW (type);
2988 gcc_assert (VECTOR_MODE_P (mode)
2989 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
2991 p.mode = p.orig_mode = mode;
2992 add_piece (p);
2993 return IS_PST;
2996 /* Check for user-defined PSTs. */
2997 if (TREE_CODE (type) == ARRAY_TYPE)
2998 return analyze_array (type);
2999 if (TREE_CODE (type) == RECORD_TYPE)
3000 return analyze_record (type);
3002 return ISNT_PST;
3005 /* Analyze a type that is known not to be passed or returned in memory.
3006 Return true if it has an ABI identity and is a Pure Scalable Type. */
3008 bool
3009 pure_scalable_type_info::analyze_registers (const_tree type)
3011 analysis_result result = analyze (type);
3012 gcc_assert (result != DOESNT_MATTER);
3013 return result == IS_PST;
3016 /* Subroutine of analyze for handling ARRAY_TYPEs. */
3018 pure_scalable_type_info::analysis_result
3019 pure_scalable_type_info::analyze_array (const_tree type)
3021 /* Analyze the element type. */
3022 pure_scalable_type_info element_info;
3023 analysis_result result = element_info.analyze (TREE_TYPE (type));
3024 if (result != IS_PST)
3025 return result;
3027 /* An array of unknown, flexible or variable length will be passed and
3028 returned by reference whatever we do. */
3029 tree nelts_minus_one = array_type_nelts (type);
3030 if (!tree_fits_uhwi_p (nelts_minus_one))
3031 return DOESNT_MATTER;
3033 /* Likewise if the array is constant-sized but too big to be interesting.
3034 The double checks against MAX_PIECES are to protect against overflow. */
3035 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
3036 if (count > MAX_PIECES)
3037 return DOESNT_MATTER;
3038 count += 1;
3039 if (count * element_info.pieces.length () > MAX_PIECES)
3040 return DOESNT_MATTER;
3042 /* The above checks should have weeded out elements of unknown size. */
3043 poly_uint64 element_bytes;
3044 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3045 gcc_unreachable ();
3047 /* Build up the list of individual vectors and predicates. */
3048 gcc_assert (!element_info.pieces.is_empty ());
3049 for (unsigned int i = 0; i < count; ++i)
3050 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3052 piece p = element_info.pieces[j];
3053 p.offset += i * element_bytes;
3054 add_piece (p);
3056 return IS_PST;
3059 /* Subroutine of analyze for handling RECORD_TYPEs. */
3061 pure_scalable_type_info::analysis_result
3062 pure_scalable_type_info::analyze_record (const_tree type)
3064 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3066 if (TREE_CODE (field) != FIELD_DECL)
3067 continue;
3069 /* Zero-sized fields disappear in the language->ABI mapping. */
3070 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3071 continue;
3073 /* All fields with an ABI identity must be PSTs for the record as
3074 a whole to be a PST. If any individual field is too big to be
3075 interesting then the record is too. */
3076 pure_scalable_type_info field_info;
3077 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3078 if (subresult == NO_ABI_IDENTITY)
3079 continue;
3080 if (subresult != IS_PST)
3081 return subresult;
3083 /* Since all previous fields are PSTs, we ought to be able to track
3084 the field offset using poly_ints. */
3085 tree bitpos = bit_position (field);
3086 gcc_assert (poly_int_tree_p (bitpos));
3088 /* For the same reason, it shouldn't be possible to create a PST field
3089 whose offset isn't byte-aligned. */
3090 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3091 BITS_PER_UNIT);
3093 /* Punt if the record is too big to be interesting. */
3094 poly_uint64 bytepos;
3095 if (!wide_bytepos.to_uhwi (&bytepos)
3096 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3097 return DOESNT_MATTER;
3099 /* Add the individual vectors and predicates in the field to the
3100 record's list. */
3101 gcc_assert (!field_info.pieces.is_empty ());
3102 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3104 piece p = field_info.pieces[i];
3105 p.offset += bytepos;
3106 add_piece (p);
3109 /* Empty structures disappear in the language->ABI mapping. */
3110 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3113 /* Add P to the list of pieces in the type. */
3115 void
3116 pure_scalable_type_info::add_piece (const piece &p)
3118 /* Try to fold the new piece into the previous one to form a
3119 single-mode PST. For example, if we see three consecutive vectors
3120 of the same mode, we can represent them using the corresponding
3121 3-tuple mode.
3123 This is purely an optimization. */
3124 if (!pieces.is_empty ())
3126 piece &prev = pieces.last ();
3127 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3128 unsigned int nelems1, nelems2;
3129 if (prev.orig_mode == p.orig_mode
3130 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3131 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3132 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3133 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3134 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3135 && targetm.array_mode (p.orig_mode,
3136 nelems1 + nelems2).exists (&prev.mode))
3138 prev.num_zr += p.num_zr;
3139 prev.num_pr += p.num_pr;
3140 return;
3143 pieces.quick_push (p);
3146 /* Return true if at least one possible value of type TYPE includes at
3147 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3149 This is a relatively expensive test for some types, so it should
3150 generally be made as late as possible. */
3152 static bool
3153 aarch64_some_values_include_pst_objects_p (const_tree type)
3155 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3156 return false;
3158 if (aarch64_sve::builtin_type_p (type))
3159 return true;
3161 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3162 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3164 if (RECORD_OR_UNION_TYPE_P (type))
3165 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3166 if (TREE_CODE (field) == FIELD_DECL
3167 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3168 return true;
3170 return false;
3173 /* Return the descriptor of the SIMD ABI. */
3175 static const predefined_function_abi &
3176 aarch64_simd_abi (void)
3178 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3179 if (!simd_abi.initialized_p ())
3181 HARD_REG_SET full_reg_clobbers
3182 = default_function_abi.full_reg_clobbers ();
3183 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3184 if (FP_SIMD_SAVED_REGNUM_P (regno))
3185 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3186 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3188 return simd_abi;
3191 /* Return the descriptor of the SVE PCS. */
3193 static const predefined_function_abi &
3194 aarch64_sve_abi (void)
3196 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3197 if (!sve_abi.initialized_p ())
3199 HARD_REG_SET full_reg_clobbers
3200 = default_function_abi.full_reg_clobbers ();
3201 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3202 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3203 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3204 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3205 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3207 return sve_abi;
3210 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3211 wraps, otherwise return X itself. */
3213 static rtx
3214 strip_salt (rtx x)
3216 rtx search = x;
3217 if (GET_CODE (search) == CONST)
3218 search = XEXP (search, 0);
3219 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3220 x = XVECEXP (search, 0, 0);
3221 return x;
3224 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3225 expression. */
3227 static rtx
3228 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3230 return strip_salt (strip_offset (addr, offset));
3233 /* Generate code to enable conditional branches in functions over 1 MiB. */
3234 const char *
3235 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3236 const char * branch_format)
3238 rtx_code_label * tmp_label = gen_label_rtx ();
3239 char label_buf[256];
3240 char buffer[128];
3241 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3242 CODE_LABEL_NUMBER (tmp_label));
3243 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3244 rtx dest_label = operands[pos_label];
3245 operands[pos_label] = tmp_label;
3247 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3248 output_asm_insn (buffer, operands);
3250 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3251 operands[pos_label] = dest_label;
3252 output_asm_insn (buffer, operands);
3253 return "";
3256 void
3257 aarch64_err_no_fpadvsimd (machine_mode mode)
3259 if (TARGET_GENERAL_REGS_ONLY)
3260 if (FLOAT_MODE_P (mode))
3261 error ("%qs is incompatible with the use of floating-point types",
3262 "-mgeneral-regs-only");
3263 else
3264 error ("%qs is incompatible with the use of vector types",
3265 "-mgeneral-regs-only");
3266 else
3267 if (FLOAT_MODE_P (mode))
3268 error ("%qs feature modifier is incompatible with the use of"
3269 " floating-point types", "+nofp");
3270 else
3271 error ("%qs feature modifier is incompatible with the use of"
3272 " vector types", "+nofp");
3275 /* Report when we try to do something that requires SVE when SVE is disabled.
3276 This is an error of last resort and isn't very high-quality. It usually
3277 involves attempts to measure the vector length in some way. */
3278 static void
3279 aarch64_report_sve_required (void)
3281 static bool reported_p = false;
3283 /* Avoid reporting a slew of messages for a single oversight. */
3284 if (reported_p)
3285 return;
3287 error ("this operation requires the SVE ISA extension");
3288 inform (input_location, "you can enable SVE using the command-line"
3289 " option %<-march%>, or by using the %<target%>"
3290 " attribute or pragma");
3291 reported_p = true;
3294 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3295 registers. */
3296 inline bool
3297 pr_or_ffr_regnum_p (unsigned int regno)
3299 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3302 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3303 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3304 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3305 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3306 and GENERAL_REGS is lower than the memory cost (in this case the best class
3307 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
3308 cost results in bad allocations with many redundant int<->FP moves which
3309 are expensive on various cores.
3310 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3311 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3312 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3313 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
3314 The result of this is that it is no longer inefficient to have a higher
3315 memory move cost than the register move cost.
3318 static reg_class_t
3319 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3320 reg_class_t best_class)
3322 machine_mode mode;
3324 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3325 || !reg_class_subset_p (FP_REGS, allocno_class))
3326 return allocno_class;
3328 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3329 || !reg_class_subset_p (FP_REGS, best_class))
3330 return best_class;
3332 mode = PSEUDO_REGNO_MODE (regno);
3333 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
3336 static unsigned int
3337 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3339 if (GET_MODE_UNIT_SIZE (mode) == 4)
3340 return aarch64_tune_params.min_div_recip_mul_sf;
3341 return aarch64_tune_params.min_div_recip_mul_df;
3344 /* Return the reassociation width of treeop OPC with mode MODE. */
3345 static int
3346 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3348 if (VECTOR_MODE_P (mode))
3349 return aarch64_tune_params.vec_reassoc_width;
3350 if (INTEGRAL_MODE_P (mode))
3351 return aarch64_tune_params.int_reassoc_width;
3352 /* Avoid reassociating floating point addition so we emit more FMAs. */
3353 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
3354 return aarch64_tune_params.fp_reassoc_width;
3355 return 1;
3358 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
3359 unsigned
3360 aarch64_dbx_register_number (unsigned regno)
3362 if (GP_REGNUM_P (regno))
3363 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3364 else if (regno == SP_REGNUM)
3365 return AARCH64_DWARF_SP;
3366 else if (FP_REGNUM_P (regno))
3367 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3368 else if (PR_REGNUM_P (regno))
3369 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3370 else if (regno == VG_REGNUM)
3371 return AARCH64_DWARF_VG;
3373 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3374 equivalent DWARF register. */
3375 return DWARF_FRAME_REGISTERS;
3378 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3379 integer, otherwise return X unmodified. */
3380 static rtx
3381 aarch64_bit_representation (rtx x)
3383 if (CONST_DOUBLE_P (x))
3384 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3385 return x;
3388 /* Return an estimate for the number of quadwords in an SVE vector. This is
3389 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3390 static unsigned int
3391 aarch64_estimated_sve_vq ()
3393 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3396 /* Return true if MODE is an SVE predicate mode. */
3397 static bool
3398 aarch64_sve_pred_mode_p (machine_mode mode)
3400 return (TARGET_SVE
3401 && (mode == VNx16BImode
3402 || mode == VNx8BImode
3403 || mode == VNx4BImode
3404 || mode == VNx2BImode));
3407 /* Three mutually-exclusive flags describing a vector or predicate type. */
3408 const unsigned int VEC_ADVSIMD = 1;
3409 const unsigned int VEC_SVE_DATA = 2;
3410 const unsigned int VEC_SVE_PRED = 4;
3411 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3412 a structure of 2, 3 or 4 vectors. */
3413 const unsigned int VEC_STRUCT = 8;
3414 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3415 vector has fewer significant bytes than a full SVE vector. */
3416 const unsigned int VEC_PARTIAL = 16;
3417 /* Useful combinations of the above. */
3418 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3419 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3421 /* Return a set of flags describing the vector properties of mode MODE.
3422 Ignore modes that are not supported by the current target. */
3423 static unsigned int
3424 aarch64_classify_vector_mode (machine_mode mode)
3426 if (aarch64_sve_pred_mode_p (mode))
3427 return VEC_SVE_PRED;
3429 /* Make the decision based on the mode's enum value rather than its
3430 properties, so that we keep the correct classification regardless
3431 of -msve-vector-bits. */
3432 switch (mode)
3434 /* Partial SVE QI vectors. */
3435 case E_VNx2QImode:
3436 case E_VNx4QImode:
3437 case E_VNx8QImode:
3438 /* Partial SVE HI vectors. */
3439 case E_VNx2HImode:
3440 case E_VNx4HImode:
3441 /* Partial SVE SI vector. */
3442 case E_VNx2SImode:
3443 /* Partial SVE HF vectors. */
3444 case E_VNx2HFmode:
3445 case E_VNx4HFmode:
3446 /* Partial SVE BF vectors. */
3447 case E_VNx2BFmode:
3448 case E_VNx4BFmode:
3449 /* Partial SVE SF vector. */
3450 case E_VNx2SFmode:
3451 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3453 case E_VNx16QImode:
3454 case E_VNx8HImode:
3455 case E_VNx4SImode:
3456 case E_VNx2DImode:
3457 case E_VNx8BFmode:
3458 case E_VNx8HFmode:
3459 case E_VNx4SFmode:
3460 case E_VNx2DFmode:
3461 return TARGET_SVE ? VEC_SVE_DATA : 0;
3463 /* x2 SVE vectors. */
3464 case E_VNx32QImode:
3465 case E_VNx16HImode:
3466 case E_VNx8SImode:
3467 case E_VNx4DImode:
3468 case E_VNx16BFmode:
3469 case E_VNx16HFmode:
3470 case E_VNx8SFmode:
3471 case E_VNx4DFmode:
3472 /* x3 SVE vectors. */
3473 case E_VNx48QImode:
3474 case E_VNx24HImode:
3475 case E_VNx12SImode:
3476 case E_VNx6DImode:
3477 case E_VNx24BFmode:
3478 case E_VNx24HFmode:
3479 case E_VNx12SFmode:
3480 case E_VNx6DFmode:
3481 /* x4 SVE vectors. */
3482 case E_VNx64QImode:
3483 case E_VNx32HImode:
3484 case E_VNx16SImode:
3485 case E_VNx8DImode:
3486 case E_VNx32BFmode:
3487 case E_VNx32HFmode:
3488 case E_VNx16SFmode:
3489 case E_VNx8DFmode:
3490 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3492 case E_OImode:
3493 case E_CImode:
3494 case E_XImode:
3495 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
3497 /* Structures of 64-bit Advanced SIMD vectors. */
3498 case E_V2x8QImode:
3499 case E_V2x4HImode:
3500 case E_V2x2SImode:
3501 case E_V2x1DImode:
3502 case E_V2x4BFmode:
3503 case E_V2x4HFmode:
3504 case E_V2x2SFmode:
3505 case E_V2x1DFmode:
3506 case E_V3x8QImode:
3507 case E_V3x4HImode:
3508 case E_V3x2SImode:
3509 case E_V3x1DImode:
3510 case E_V3x4BFmode:
3511 case E_V3x4HFmode:
3512 case E_V3x2SFmode:
3513 case E_V3x1DFmode:
3514 case E_V4x8QImode:
3515 case E_V4x4HImode:
3516 case E_V4x2SImode:
3517 case E_V4x1DImode:
3518 case E_V4x4BFmode:
3519 case E_V4x4HFmode:
3520 case E_V4x2SFmode:
3521 case E_V4x1DFmode:
3522 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3524 /* Structures of 128-bit Advanced SIMD vectors. */
3525 case E_V2x16QImode:
3526 case E_V2x8HImode:
3527 case E_V2x4SImode:
3528 case E_V2x2DImode:
3529 case E_V2x8BFmode:
3530 case E_V2x8HFmode:
3531 case E_V2x4SFmode:
3532 case E_V2x2DFmode:
3533 case E_V3x16QImode:
3534 case E_V3x8HImode:
3535 case E_V3x4SImode:
3536 case E_V3x2DImode:
3537 case E_V3x8BFmode:
3538 case E_V3x8HFmode:
3539 case E_V3x4SFmode:
3540 case E_V3x2DFmode:
3541 case E_V4x16QImode:
3542 case E_V4x8HImode:
3543 case E_V4x4SImode:
3544 case E_V4x2DImode:
3545 case E_V4x8BFmode:
3546 case E_V4x8HFmode:
3547 case E_V4x4SFmode:
3548 case E_V4x2DFmode:
3549 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
3551 /* 64-bit Advanced SIMD vectors. */
3552 case E_V8QImode:
3553 case E_V4HImode:
3554 case E_V2SImode:
3555 /* ...E_V1DImode doesn't exist. */
3556 case E_V4HFmode:
3557 case E_V4BFmode:
3558 case E_V2SFmode:
3559 case E_V1DFmode:
3560 /* 128-bit Advanced SIMD vectors. */
3561 case E_V16QImode:
3562 case E_V8HImode:
3563 case E_V4SImode:
3564 case E_V2DImode:
3565 case E_V8HFmode:
3566 case E_V8BFmode:
3567 case E_V4SFmode:
3568 case E_V2DFmode:
3569 return TARGET_SIMD ? VEC_ADVSIMD : 0;
3571 default:
3572 return 0;
3576 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3577 bool
3578 aarch64_advsimd_struct_mode_p (machine_mode mode)
3580 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3581 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3584 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3585 static bool
3586 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3588 return (aarch64_classify_vector_mode (mode)
3589 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3592 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3593 static bool
3594 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3596 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3599 /* Return true if MODE is any of the data vector modes, including
3600 structure modes. */
3601 static bool
3602 aarch64_vector_data_mode_p (machine_mode mode)
3604 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3607 /* Return true if MODE is any form of SVE mode, including predicates,
3608 vectors and structures. */
3609 bool
3610 aarch64_sve_mode_p (machine_mode mode)
3612 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3615 /* Return true if MODE is an SVE data vector mode; either a single vector
3616 or a structure of vectors. */
3617 static bool
3618 aarch64_sve_data_mode_p (machine_mode mode)
3620 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3623 /* Return the number of defined bytes in one constituent vector of
3624 SVE mode MODE, which has vector flags VEC_FLAGS. */
3625 static poly_int64
3626 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3628 if (vec_flags & VEC_PARTIAL)
3629 /* A single partial vector. */
3630 return GET_MODE_SIZE (mode);
3632 if (vec_flags & VEC_SVE_DATA)
3633 /* A single vector or a tuple. */
3634 return BYTES_PER_SVE_VECTOR;
3636 /* A single predicate. */
3637 gcc_assert (vec_flags & VEC_SVE_PRED);
3638 return BYTES_PER_SVE_PRED;
3641 /* If MODE holds an array of vectors, return the number of vectors
3642 in the array, otherwise return 1. */
3644 static unsigned int
3645 aarch64_ldn_stn_vectors (machine_mode mode)
3647 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3648 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3649 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3650 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3651 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3652 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3653 return exact_div (GET_MODE_SIZE (mode),
3654 BYTES_PER_SVE_VECTOR).to_constant ();
3655 return 1;
3658 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3659 corresponding vector structure mode. */
3660 static opt_machine_mode
3661 aarch64_advsimd_vector_array_mode (machine_mode mode,
3662 unsigned HOST_WIDE_INT nelems)
3664 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3665 if (known_eq (GET_MODE_SIZE (mode), 8))
3666 flags |= VEC_PARTIAL;
3668 machine_mode struct_mode;
3669 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3670 if (aarch64_classify_vector_mode (struct_mode) == flags
3671 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3672 && known_eq (GET_MODE_NUNITS (struct_mode),
3673 GET_MODE_NUNITS (mode) * nelems))
3674 return struct_mode;
3675 return opt_machine_mode ();
3678 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3680 opt_machine_mode
3681 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3683 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3684 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3685 machine_mode mode;
3686 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3687 if (inner_mode == GET_MODE_INNER (mode)
3688 && known_eq (nunits, GET_MODE_NUNITS (mode))
3689 && aarch64_sve_data_mode_p (mode))
3690 return mode;
3691 return opt_machine_mode ();
3694 /* Implement target hook TARGET_ARRAY_MODE. */
3695 static opt_machine_mode
3696 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3698 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3699 && IN_RANGE (nelems, 2, 4))
3700 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3701 GET_MODE_NUNITS (mode) * nelems);
3702 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3703 && IN_RANGE (nelems, 2, 4))
3704 return aarch64_advsimd_vector_array_mode (mode, nelems);
3706 return opt_machine_mode ();
3709 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3710 static bool
3711 aarch64_array_mode_supported_p (machine_mode mode,
3712 unsigned HOST_WIDE_INT nelems)
3714 if (TARGET_SIMD
3715 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3716 || AARCH64_VALID_SIMD_DREG_MODE (mode))
3717 && (nelems >= 2 && nelems <= 4))
3718 return true;
3720 return false;
3723 /* MODE is some form of SVE vector mode. For data modes, return the number
3724 of vector register bits that each element of MODE occupies, such as 64
3725 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3726 in a 64-bit container). For predicate modes, return the number of
3727 data bits controlled by each significant predicate bit. */
3729 static unsigned int
3730 aarch64_sve_container_bits (machine_mode mode)
3732 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3733 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3734 ? BITS_PER_SVE_VECTOR
3735 : GET_MODE_BITSIZE (mode));
3736 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3739 /* Return the SVE predicate mode to use for elements that have
3740 ELEM_NBYTES bytes, if such a mode exists. */
3742 opt_machine_mode
3743 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3745 if (TARGET_SVE)
3747 if (elem_nbytes == 1)
3748 return VNx16BImode;
3749 if (elem_nbytes == 2)
3750 return VNx8BImode;
3751 if (elem_nbytes == 4)
3752 return VNx4BImode;
3753 if (elem_nbytes == 8)
3754 return VNx2BImode;
3756 return opt_machine_mode ();
3759 /* Return the SVE predicate mode that should be used to control
3760 SVE mode MODE. */
3762 machine_mode
3763 aarch64_sve_pred_mode (machine_mode mode)
3765 unsigned int bits = aarch64_sve_container_bits (mode);
3766 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3769 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3771 static opt_machine_mode
3772 aarch64_get_mask_mode (machine_mode mode)
3774 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3775 if (vec_flags & VEC_SVE_DATA)
3776 return aarch64_sve_pred_mode (mode);
3778 return default_get_mask_mode (mode);
3781 /* Return the integer element mode associated with SVE mode MODE. */
3783 static scalar_int_mode
3784 aarch64_sve_element_int_mode (machine_mode mode)
3786 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3787 ? BITS_PER_SVE_VECTOR
3788 : GET_MODE_BITSIZE (mode));
3789 unsigned int elt_bits = vector_element_size (vector_bits,
3790 GET_MODE_NUNITS (mode));
3791 return int_mode_for_size (elt_bits, 0).require ();
3794 /* Return an integer element mode that contains exactly
3795 aarch64_sve_container_bits (MODE) bits. This is wider than
3796 aarch64_sve_element_int_mode if MODE is a partial vector,
3797 otherwise it's the same. */
3799 static scalar_int_mode
3800 aarch64_sve_container_int_mode (machine_mode mode)
3802 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3805 /* Return the integer vector mode associated with SVE mode MODE.
3806 Unlike related_int_vector_mode, this can handle the case in which
3807 MODE is a predicate (and thus has a different total size). */
3809 machine_mode
3810 aarch64_sve_int_mode (machine_mode mode)
3812 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3813 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3816 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3818 static opt_machine_mode
3819 aarch64_vectorize_related_mode (machine_mode vector_mode,
3820 scalar_mode element_mode,
3821 poly_uint64 nunits)
3823 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3825 /* If we're operating on SVE vectors, try to return an SVE mode. */
3826 poly_uint64 sve_nunits;
3827 if ((vec_flags & VEC_SVE_DATA)
3828 && multiple_p (BYTES_PER_SVE_VECTOR,
3829 GET_MODE_SIZE (element_mode), &sve_nunits))
3831 machine_mode sve_mode;
3832 if (maybe_ne (nunits, 0U))
3834 /* Try to find a full or partial SVE mode with exactly
3835 NUNITS units. */
3836 if (multiple_p (sve_nunits, nunits)
3837 && aarch64_sve_data_mode (element_mode,
3838 nunits).exists (&sve_mode))
3839 return sve_mode;
3841 else
3843 /* Take the preferred number of units from the number of bytes
3844 that fit in VECTOR_MODE. We always start by "autodetecting"
3845 a full vector mode with preferred_simd_mode, so vectors
3846 chosen here will also be full vector modes. Then
3847 autovectorize_vector_modes tries smaller starting modes
3848 and thus smaller preferred numbers of units. */
3849 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3850 if (aarch64_sve_data_mode (element_mode,
3851 sve_nunits).exists (&sve_mode))
3852 return sve_mode;
3856 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3857 if ((vec_flags & VEC_ADVSIMD)
3858 && known_eq (nunits, 0U)
3859 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3860 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3861 * GET_MODE_NUNITS (vector_mode), 128U))
3863 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3864 if (VECTOR_MODE_P (res))
3865 return res;
3868 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3871 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3872 prefer to use the first arithmetic operand as the else value if
3873 the else value doesn't matter, since that exactly matches the SVE
3874 destructive merging form. For ternary operations we could either
3875 pick the first operand and use FMAD-like instructions or the last
3876 operand and use FMLA-like instructions; the latter seems more
3877 natural. */
3879 static tree
3880 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3882 return nops == 3 ? ops[2] : ops[0];
3885 /* Implement TARGET_HARD_REGNO_NREGS. */
3887 static unsigned int
3888 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3890 /* ??? Logically we should only need to provide a value when
3891 HARD_REGNO_MODE_OK says that the combination is valid,
3892 but at the moment we need to handle all modes. Just ignore
3893 any runtime parts for registers that can't store them. */
3894 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3895 switch (aarch64_regno_regclass (regno))
3897 case FP_REGS:
3898 case FP_LO_REGS:
3899 case FP_LO8_REGS:
3901 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3902 if (vec_flags & VEC_SVE_DATA)
3903 return exact_div (GET_MODE_SIZE (mode),
3904 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3905 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3906 return GET_MODE_SIZE (mode).to_constant () / 8;
3907 return CEIL (lowest_size, UNITS_PER_VREG);
3909 case PR_REGS:
3910 case PR_LO_REGS:
3911 case PR_HI_REGS:
3912 case FFR_REGS:
3913 case PR_AND_FFR_REGS:
3914 return 1;
3915 default:
3916 return CEIL (lowest_size, UNITS_PER_WORD);
3918 gcc_unreachable ();
3921 /* Implement TARGET_HARD_REGNO_MODE_OK. */
3923 static bool
3924 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3926 if (mode == V8DImode)
3927 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
3928 && multiple_p (regno - R0_REGNUM, 2);
3930 if (GET_MODE_CLASS (mode) == MODE_CC)
3931 return regno == CC_REGNUM;
3933 if (regno == VG_REGNUM)
3934 /* This must have the same size as _Unwind_Word. */
3935 return mode == DImode;
3937 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3938 if (vec_flags & VEC_SVE_PRED)
3939 return pr_or_ffr_regnum_p (regno);
3941 if (pr_or_ffr_regnum_p (regno))
3942 return false;
3944 if (regno == SP_REGNUM)
3945 /* The purpose of comparing with ptr_mode is to support the
3946 global register variable associated with the stack pointer
3947 register via the syntax of asm ("wsp") in ILP32. */
3948 return mode == Pmode || mode == ptr_mode;
3950 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
3951 return mode == Pmode;
3953 if (GP_REGNUM_P (regno))
3955 if (vec_flags & VEC_ANY_SVE)
3956 return false;
3957 if (known_le (GET_MODE_SIZE (mode), 8))
3958 return true;
3959 if (known_le (GET_MODE_SIZE (mode), 16))
3960 return (regno & 1) == 0;
3962 else if (FP_REGNUM_P (regno))
3964 if (vec_flags & VEC_STRUCT)
3965 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
3966 else
3967 return !VECTOR_MODE_P (mode) || vec_flags != 0;
3970 return false;
3973 /* Return true if a function with type FNTYPE returns its value in
3974 SVE vector or predicate registers. */
3976 static bool
3977 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
3979 tree return_type = TREE_TYPE (fntype);
3981 pure_scalable_type_info pst_info;
3982 switch (pst_info.analyze (return_type))
3984 case pure_scalable_type_info::IS_PST:
3985 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
3986 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
3988 case pure_scalable_type_info::DOESNT_MATTER:
3989 gcc_assert (aarch64_return_in_memory_1 (return_type));
3990 return false;
3992 case pure_scalable_type_info::NO_ABI_IDENTITY:
3993 case pure_scalable_type_info::ISNT_PST:
3994 return false;
3996 gcc_unreachable ();
3999 /* Return true if a function with type FNTYPE takes arguments in
4000 SVE vector or predicate registers. */
4002 static bool
4003 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
4005 CUMULATIVE_ARGS args_so_far_v;
4006 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
4007 NULL_TREE, 0, true);
4008 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4010 for (tree chain = TYPE_ARG_TYPES (fntype);
4011 chain && chain != void_list_node;
4012 chain = TREE_CHAIN (chain))
4014 tree arg_type = TREE_VALUE (chain);
4015 if (arg_type == error_mark_node)
4016 return false;
4018 function_arg_info arg (arg_type, /*named=*/true);
4019 apply_pass_by_reference_rules (&args_so_far_v, arg);
4020 pure_scalable_type_info pst_info;
4021 if (pst_info.analyze_registers (arg.type))
4023 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4024 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4025 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4026 return true;
4029 targetm.calls.function_arg_advance (args_so_far, arg);
4031 return false;
4034 /* Implement TARGET_FNTYPE_ABI. */
4036 static const predefined_function_abi &
4037 aarch64_fntype_abi (const_tree fntype)
4039 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4040 return aarch64_simd_abi ();
4042 if (aarch64_returns_value_in_sve_regs_p (fntype)
4043 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4044 return aarch64_sve_abi ();
4046 return default_function_abi;
4049 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4051 static bool
4052 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4054 return (aarch64_sve::builtin_type_p (type1)
4055 == aarch64_sve::builtin_type_p (type2));
4058 /* Return true if we should emit CFI for register REGNO. */
4060 static bool
4061 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4063 return (GP_REGNUM_P (regno)
4064 || !default_function_abi.clobbers_full_reg_p (regno));
4067 /* Return the mode we should use to save and restore register REGNO. */
4069 static machine_mode
4070 aarch64_reg_save_mode (unsigned int regno)
4072 if (GP_REGNUM_P (regno))
4073 return DImode;
4075 if (FP_REGNUM_P (regno))
4076 switch (crtl->abi->id ())
4078 case ARM_PCS_AAPCS64:
4079 /* Only the low 64 bits are saved by the base PCS. */
4080 return DFmode;
4082 case ARM_PCS_SIMD:
4083 /* The vector PCS saves the low 128 bits (which is the full
4084 register on non-SVE targets). */
4085 return TFmode;
4087 case ARM_PCS_SVE:
4088 /* Use vectors of DImode for registers that need frame
4089 information, so that the first 64 bytes of the save slot
4090 are always the equivalent of what storing D<n> would give. */
4091 if (aarch64_emit_cfi_for_reg_p (regno))
4092 return VNx2DImode;
4094 /* Use vectors of bytes otherwise, so that the layout is
4095 endian-agnostic, and so that we can use LDR and STR for
4096 big-endian targets. */
4097 return VNx16QImode;
4099 case ARM_PCS_TLSDESC:
4100 case ARM_PCS_UNKNOWN:
4101 break;
4104 if (PR_REGNUM_P (regno))
4105 /* Save the full predicate register. */
4106 return VNx16BImode;
4108 gcc_unreachable ();
4111 /* Implement TARGET_INSN_CALLEE_ABI. */
4113 const predefined_function_abi &
4114 aarch64_insn_callee_abi (const rtx_insn *insn)
4116 rtx pat = PATTERN (insn);
4117 gcc_assert (GET_CODE (pat) == PARALLEL);
4118 rtx unspec = XVECEXP (pat, 0, 1);
4119 gcc_assert (GET_CODE (unspec) == UNSPEC
4120 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4121 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4124 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4125 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4126 clobbers the top 64 bits when restoring the bottom 64 bits. */
4128 static bool
4129 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4130 unsigned int regno,
4131 machine_mode mode)
4133 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4135 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4136 unsigned int nregs = hard_regno_nregs (regno, mode);
4137 if (nregs > 1)
4138 per_register_size = exact_div (per_register_size, nregs);
4139 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4140 return maybe_gt (per_register_size, 16);
4141 return maybe_gt (per_register_size, 8);
4143 return false;
4146 /* Implement REGMODE_NATURAL_SIZE. */
4147 poly_uint64
4148 aarch64_regmode_natural_size (machine_mode mode)
4150 /* The natural size for SVE data modes is one SVE data vector,
4151 and similarly for predicates. We can't independently modify
4152 anything smaller than that. */
4153 /* ??? For now, only do this for variable-width SVE registers.
4154 Doing it for constant-sized registers breaks lower-subreg.cc. */
4155 /* ??? And once that's fixed, we should probably have similar
4156 code for Advanced SIMD. */
4157 if (!aarch64_sve_vg.is_constant ())
4159 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4160 if (vec_flags & VEC_SVE_PRED)
4161 return BYTES_PER_SVE_PRED;
4162 if (vec_flags & VEC_SVE_DATA)
4163 return BYTES_PER_SVE_VECTOR;
4165 return UNITS_PER_WORD;
4168 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
4169 machine_mode
4170 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4171 machine_mode mode)
4173 /* The predicate mode determines which bits are significant and
4174 which are "don't care". Decreasing the number of lanes would
4175 lose data while increasing the number of lanes would make bits
4176 unnecessarily significant. */
4177 if (PR_REGNUM_P (regno))
4178 return mode;
4179 if (known_ge (GET_MODE_SIZE (mode), 4))
4180 return mode;
4181 else
4182 return SImode;
4185 /* Return true if I's bits are consecutive ones from the MSB. */
4186 bool
4187 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4189 return exact_log2 (-i) != HOST_WIDE_INT_M1;
4192 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4193 that strcpy from constants will be faster. */
4195 static HOST_WIDE_INT
4196 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4198 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4199 return MAX (align, BITS_PER_WORD);
4200 return align;
4203 /* Return true if calls to DECL should be treated as
4204 long-calls (ie called via a register). */
4205 static bool
4206 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4208 return false;
4211 /* Return true if calls to symbol-ref SYM should be treated as
4212 long-calls (ie called via a register). */
4213 bool
4214 aarch64_is_long_call_p (rtx sym)
4216 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4219 /* Return true if calls to symbol-ref SYM should not go through
4220 plt stubs. */
4222 bool
4223 aarch64_is_noplt_call_p (rtx sym)
4225 const_tree decl = SYMBOL_REF_DECL (sym);
4227 if (flag_pic
4228 && decl
4229 && (!flag_plt
4230 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4231 && !targetm.binds_local_p (decl))
4232 return true;
4234 return false;
4237 /* Emit an insn that's a simple single-set. Both the operands must be
4238 known to be valid. */
4239 inline static rtx_insn *
4240 emit_set_insn (rtx x, rtx y)
4242 return emit_insn (gen_rtx_SET (x, y));
4245 /* X and Y are two things to compare using CODE. Emit the compare insn and
4246 return the rtx for register 0 in the proper mode. */
4248 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4250 machine_mode cmp_mode = GET_MODE (x);
4251 machine_mode cc_mode;
4252 rtx cc_reg;
4254 if (cmp_mode == TImode)
4256 gcc_assert (code == NE);
4258 cc_mode = CCmode;
4259 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4261 rtx x_lo = operand_subword (x, 0, 0, TImode);
4262 rtx y_lo = operand_subword (y, 0, 0, TImode);
4263 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4265 rtx x_hi = operand_subword (x, 1, 0, TImode);
4266 rtx y_hi = operand_subword (y, 1, 0, TImode);
4267 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4268 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4269 GEN_INT (AARCH64_EQ)));
4271 else
4273 cc_mode = SELECT_CC_MODE (code, x, y);
4274 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4275 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4277 return cc_reg;
4280 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4282 static rtx
4283 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4284 machine_mode y_mode)
4286 if (y_mode == E_QImode || y_mode == E_HImode)
4288 if (CONST_INT_P (y))
4290 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4291 y_mode = SImode;
4293 else
4295 rtx t, cc_reg;
4296 machine_mode cc_mode;
4298 t = gen_rtx_ZERO_EXTEND (SImode, y);
4299 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4300 cc_mode = CC_SWPmode;
4301 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4302 emit_set_insn (cc_reg, t);
4303 return cc_reg;
4307 if (!aarch64_plus_operand (y, y_mode))
4308 y = force_reg (y_mode, y);
4310 return aarch64_gen_compare_reg (code, x, y);
4313 /* Consider the operation:
4315 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4317 where:
4319 - CODE is [SU]MAX or [SU]MIN
4320 - OPERANDS[2] and OPERANDS[3] are constant integers
4321 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4322 - all operands have mode MODE
4324 Decide whether it is possible to implement the operation using:
4326 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4328 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4330 followed by:
4332 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4334 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4335 If GENERATE_P is true, also update OPERANDS as follows:
4337 OPERANDS[4] = -OPERANDS[3]
4338 OPERANDS[5] = the rtl condition representing <cond>
4339 OPERANDS[6] = <tmp>
4340 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
4341 bool
4342 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4344 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4345 rtx dst = operands[0];
4346 rtx maxmin_op = operands[2];
4347 rtx add_op = operands[3];
4348 machine_mode mode = GET_MODE (dst);
4350 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4351 == (x >= y ? x : y) - z
4352 == (x > y ? x : y) - z
4353 == (x > y - 1 ? x : y) - z
4355 min (x, y) - z == (x <= y - 1 ? x : y) - z
4356 == (x <= y ? x : y) - z
4357 == (x < y ? x : y) - z
4358 == (x < y + 1 ? x : y) - z
4360 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4361 which x is compared with z. Set DIFF to y - z. Thus the supported
4362 combinations are as follows, with DIFF being the value after the ":":
4364 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4365 == x >= y ? x - y : 0 [z == y]
4366 == x > y ? x - y : 0 [z == y]
4367 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4369 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4370 == x <= y ? x - y : 0 [z == y]
4371 == x < y ? x - y : 0 [z == y]
4372 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4373 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4374 auto add_val = rtx_mode_t (add_op, mode);
4375 auto sub_val = wi::neg (add_val);
4376 auto diff = wi::sub (maxmin_val, sub_val);
4377 if (!(diff == 0
4378 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4379 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4380 return false;
4382 if (!generate_p)
4383 return true;
4385 rtx_code cmp;
4386 switch (code)
4388 case SMAX:
4389 cmp = diff == 1 ? GT : GE;
4390 break;
4391 case UMAX:
4392 cmp = diff == 1 ? GTU : GEU;
4393 break;
4394 case SMIN:
4395 cmp = diff == -1 ? LT : LE;
4396 break;
4397 case UMIN:
4398 cmp = diff == -1 ? LTU : LEU;
4399 break;
4400 default:
4401 gcc_unreachable ();
4403 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4405 operands[4] = immed_wide_int_const (sub_val, mode);
4406 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4407 if (can_create_pseudo_p ())
4408 operands[6] = gen_reg_rtx (mode);
4409 else
4410 operands[6] = dst;
4411 operands[7] = immed_wide_int_const (diff, mode);
4413 return true;
4417 /* Build the SYMBOL_REF for __tls_get_addr. */
4419 static GTY(()) rtx tls_get_addr_libfunc;
4422 aarch64_tls_get_addr (void)
4424 if (!tls_get_addr_libfunc)
4425 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4426 return tls_get_addr_libfunc;
4429 /* Return the TLS model to use for ADDR. */
4431 static enum tls_model
4432 tls_symbolic_operand_type (rtx addr)
4434 enum tls_model tls_kind = TLS_MODEL_NONE;
4435 poly_int64 offset;
4436 addr = strip_offset_and_salt (addr, &offset);
4437 if (SYMBOL_REF_P (addr))
4438 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4440 return tls_kind;
4443 /* We'll allow lo_sum's in addresses in our legitimate addresses
4444 so that combine would take care of combining addresses where
4445 necessary, but for generation purposes, we'll generate the address
4446 as :
4447 RTL Absolute
4448 tmp = hi (symbol_ref); adrp x1, foo
4449 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
4452 PIC TLS
4453 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4454 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4455 bl __tls_get_addr
4458 Load TLS symbol, depending on TLS mechanism and TLS access model.
4460 Global Dynamic - Traditional TLS:
4461 adrp tmp, :tlsgd:imm
4462 add dest, tmp, #:tlsgd_lo12:imm
4463 bl __tls_get_addr
4465 Global Dynamic - TLS Descriptors:
4466 adrp dest, :tlsdesc:imm
4467 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4468 add dest, dest, #:tlsdesc_lo12:imm
4469 blr tmp
4470 mrs tp, tpidr_el0
4471 add dest, dest, tp
4473 Initial Exec:
4474 mrs tp, tpidr_el0
4475 adrp tmp, :gottprel:imm
4476 ldr dest, [tmp, #:gottprel_lo12:imm]
4477 add dest, dest, tp
4479 Local Exec:
4480 mrs tp, tpidr_el0
4481 add t0, tp, #:tprel_hi12:imm, lsl #12
4482 add t0, t0, #:tprel_lo12_nc:imm
4485 static void
4486 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4487 enum aarch64_symbol_type type)
4489 switch (type)
4491 case SYMBOL_SMALL_ABSOLUTE:
4493 /* In ILP32, the mode of dest can be either SImode or DImode. */
4494 rtx tmp_reg = dest;
4495 machine_mode mode = GET_MODE (dest);
4497 gcc_assert (mode == Pmode || mode == ptr_mode);
4499 if (can_create_pseudo_p ())
4500 tmp_reg = gen_reg_rtx (mode);
4502 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4503 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4504 return;
4507 case SYMBOL_TINY_ABSOLUTE:
4508 emit_insn (gen_rtx_SET (dest, imm));
4509 return;
4511 case SYMBOL_SMALL_GOT_28K:
4513 machine_mode mode = GET_MODE (dest);
4514 rtx gp_rtx = pic_offset_table_rtx;
4515 rtx insn;
4516 rtx mem;
4518 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4519 here before rtl expand. Tree IVOPT will generate rtl pattern to
4520 decide rtx costs, in which case pic_offset_table_rtx is not
4521 initialized. For that case no need to generate the first adrp
4522 instruction as the final cost for global variable access is
4523 one instruction. */
4524 if (gp_rtx != NULL)
4526 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
4527 using the page base as GOT base, the first page may be wasted,
4528 in the worst scenario, there is only 28K space for GOT).
4530 The generate instruction sequence for accessing global variable
4533 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4535 Only one instruction needed. But we must initialize
4536 pic_offset_table_rtx properly. We generate initialize insn for
4537 every global access, and allow CSE to remove all redundant.
4539 The final instruction sequences will look like the following
4540 for multiply global variables access.
4542 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4544 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4545 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4546 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4547 ... */
4549 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4550 crtl->uses_pic_offset_table = 1;
4551 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4553 if (mode != GET_MODE (gp_rtx))
4554 gp_rtx = gen_lowpart (mode, gp_rtx);
4558 if (mode == ptr_mode)
4560 if (mode == DImode)
4561 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4562 else
4563 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4565 mem = XVECEXP (SET_SRC (insn), 0, 0);
4567 else
4569 gcc_assert (mode == Pmode);
4571 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4572 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4575 /* The operand is expected to be MEM. Whenever the related insn
4576 pattern changed, above code which calculate mem should be
4577 updated. */
4578 gcc_assert (MEM_P (mem));
4579 MEM_READONLY_P (mem) = 1;
4580 MEM_NOTRAP_P (mem) = 1;
4581 emit_insn (insn);
4582 return;
4585 case SYMBOL_SMALL_GOT_4G:
4586 emit_insn (gen_rtx_SET (dest, imm));
4587 return;
4589 case SYMBOL_SMALL_TLSGD:
4591 rtx_insn *insns;
4592 /* The return type of __tls_get_addr is the C pointer type
4593 so use ptr_mode. */
4594 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4595 rtx tmp_reg = dest;
4597 if (GET_MODE (dest) != ptr_mode)
4598 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4600 start_sequence ();
4601 if (ptr_mode == SImode)
4602 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4603 else
4604 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4605 insns = get_insns ();
4606 end_sequence ();
4608 RTL_CONST_CALL_P (insns) = 1;
4609 emit_libcall_block (insns, tmp_reg, result, imm);
4610 /* Convert back to the mode of the dest adding a zero_extend
4611 from SImode (ptr_mode) to DImode (Pmode). */
4612 if (dest != tmp_reg)
4613 convert_move (dest, tmp_reg, true);
4614 return;
4617 case SYMBOL_SMALL_TLSDESC:
4619 machine_mode mode = GET_MODE (dest);
4620 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4621 rtx tp;
4623 gcc_assert (mode == Pmode || mode == ptr_mode);
4625 /* In ILP32, the got entry is always of SImode size. Unlike
4626 small GOT, the dest is fixed at reg 0. */
4627 if (TARGET_ILP32)
4628 emit_insn (gen_tlsdesc_small_si (imm));
4629 else
4630 emit_insn (gen_tlsdesc_small_di (imm));
4631 tp = aarch64_load_tp (NULL);
4633 if (mode != Pmode)
4634 tp = gen_lowpart (mode, tp);
4636 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4637 if (REG_P (dest))
4638 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4639 return;
4642 case SYMBOL_SMALL_TLSIE:
4644 /* In ILP32, the mode of dest can be either SImode or DImode,
4645 while the got entry is always of SImode size. The mode of
4646 dest depends on how dest is used: if dest is assigned to a
4647 pointer (e.g. in the memory), it has SImode; it may have
4648 DImode if dest is dereferenced to access the memeory.
4649 This is why we have to handle three different tlsie_small
4650 patterns here (two patterns for ILP32). */
4651 machine_mode mode = GET_MODE (dest);
4652 rtx tmp_reg = gen_reg_rtx (mode);
4653 rtx tp = aarch64_load_tp (NULL);
4655 if (mode == ptr_mode)
4657 if (mode == DImode)
4658 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4659 else
4661 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4662 tp = gen_lowpart (mode, tp);
4665 else
4667 gcc_assert (mode == Pmode);
4668 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4671 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4672 if (REG_P (dest))
4673 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4674 return;
4677 case SYMBOL_TLSLE12:
4678 case SYMBOL_TLSLE24:
4679 case SYMBOL_TLSLE32:
4680 case SYMBOL_TLSLE48:
4682 machine_mode mode = GET_MODE (dest);
4683 rtx tp = aarch64_load_tp (NULL);
4685 if (mode != Pmode)
4686 tp = gen_lowpart (mode, tp);
4688 switch (type)
4690 case SYMBOL_TLSLE12:
4691 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4692 (dest, tp, imm));
4693 break;
4694 case SYMBOL_TLSLE24:
4695 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4696 (dest, tp, imm));
4697 break;
4698 case SYMBOL_TLSLE32:
4699 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4700 (dest, imm));
4701 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4702 (dest, dest, tp));
4703 break;
4704 case SYMBOL_TLSLE48:
4705 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4706 (dest, imm));
4707 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4708 (dest, dest, tp));
4709 break;
4710 default:
4711 gcc_unreachable ();
4714 if (REG_P (dest))
4715 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4716 return;
4719 case SYMBOL_TINY_GOT:
4721 rtx insn;
4722 machine_mode mode = GET_MODE (dest);
4724 if (mode == ptr_mode)
4725 insn = gen_ldr_got_tiny (mode, dest, imm);
4726 else
4728 gcc_assert (mode == Pmode);
4729 insn = gen_ldr_got_tiny_sidi (dest, imm);
4732 emit_insn (insn);
4733 return;
4736 case SYMBOL_TINY_TLSIE:
4738 machine_mode mode = GET_MODE (dest);
4739 rtx tp = aarch64_load_tp (NULL);
4741 if (mode == ptr_mode)
4743 if (mode == DImode)
4744 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4745 else
4747 tp = gen_lowpart (mode, tp);
4748 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4751 else
4753 gcc_assert (mode == Pmode);
4754 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4757 if (REG_P (dest))
4758 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4759 return;
4762 default:
4763 gcc_unreachable ();
4767 /* Emit a move from SRC to DEST. Assume that the move expanders can
4768 handle all moves if !can_create_pseudo_p (). The distinction is
4769 important because, unlike emit_move_insn, the move expanders know
4770 how to force Pmode objects into the constant pool even when the
4771 constant pool address is not itself legitimate. */
4772 static rtx
4773 aarch64_emit_move (rtx dest, rtx src)
4775 return (can_create_pseudo_p ()
4776 ? emit_move_insn (dest, src)
4777 : emit_move_insn_1 (dest, src));
4780 /* Apply UNOPTAB to OP and store the result in DEST. */
4782 static void
4783 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4785 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4786 if (dest != tmp)
4787 emit_move_insn (dest, tmp);
4790 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4792 static void
4793 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4795 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4796 OPTAB_DIRECT);
4797 if (dest != tmp)
4798 emit_move_insn (dest, tmp);
4801 /* Split a 128-bit move operation into two 64-bit move operations,
4802 taking care to handle partial overlap of register to register
4803 copies. Special cases are needed when moving between GP regs and
4804 FP regs. SRC can be a register, constant or memory; DST a register
4805 or memory. If either operand is memory it must not have any side
4806 effects. */
4807 void
4808 aarch64_split_128bit_move (rtx dst, rtx src)
4810 rtx dst_lo, dst_hi;
4811 rtx src_lo, src_hi;
4813 machine_mode mode = GET_MODE (dst);
4815 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
4816 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4817 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4819 if (REG_P (dst) && REG_P (src))
4821 int src_regno = REGNO (src);
4822 int dst_regno = REGNO (dst);
4824 /* Handle FP <-> GP regs. */
4825 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4827 src_lo = gen_lowpart (word_mode, src);
4828 src_hi = gen_highpart (word_mode, src);
4830 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4831 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4832 return;
4834 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4836 dst_lo = gen_lowpart (word_mode, dst);
4837 dst_hi = gen_highpart (word_mode, dst);
4839 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4840 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4841 return;
4845 dst_lo = gen_lowpart (word_mode, dst);
4846 dst_hi = gen_highpart (word_mode, dst);
4847 src_lo = gen_lowpart (word_mode, src);
4848 src_hi = gen_highpart_mode (word_mode, mode, src);
4850 /* At most one pairing may overlap. */
4851 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4853 aarch64_emit_move (dst_hi, src_hi);
4854 aarch64_emit_move (dst_lo, src_lo);
4856 else
4858 aarch64_emit_move (dst_lo, src_lo);
4859 aarch64_emit_move (dst_hi, src_hi);
4863 /* Return true if we should split a move from 128-bit value SRC
4864 to 128-bit register DEST. */
4866 bool
4867 aarch64_split_128bit_move_p (rtx dst, rtx src)
4869 if (FP_REGNUM_P (REGNO (dst)))
4870 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4871 /* All moves to GPRs need to be split. */
4872 return true;
4875 /* Split a complex SIMD move. */
4877 void
4878 aarch64_split_simd_move (rtx dst, rtx src)
4880 machine_mode src_mode = GET_MODE (src);
4881 machine_mode dst_mode = GET_MODE (dst);
4883 gcc_assert (VECTOR_MODE_P (dst_mode));
4885 if (REG_P (dst) && REG_P (src))
4887 gcc_assert (VECTOR_MODE_P (src_mode));
4888 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
4892 bool
4893 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4894 machine_mode ymode, rtx y)
4896 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4897 gcc_assert (r != NULL);
4898 return rtx_equal_p (x, r);
4901 /* Return TARGET if it is nonnull and a register of mode MODE.
4902 Otherwise, return a fresh register of mode MODE if we can,
4903 or TARGET reinterpreted as MODE if we can't. */
4905 static rtx
4906 aarch64_target_reg (rtx target, machine_mode mode)
4908 if (target && REG_P (target) && GET_MODE (target) == mode)
4909 return target;
4910 if (!can_create_pseudo_p ())
4912 gcc_assert (target);
4913 return gen_lowpart (mode, target);
4915 return gen_reg_rtx (mode);
4918 /* Return a register that contains the constant in BUILDER, given that
4919 the constant is a legitimate move operand. Use TARGET as the register
4920 if it is nonnull and convenient. */
4922 static rtx
4923 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4925 rtx src = builder.build ();
4926 target = aarch64_target_reg (target, GET_MODE (src));
4927 emit_insn (gen_rtx_SET (target, src));
4928 return target;
4931 static rtx
4932 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
4934 if (can_create_pseudo_p ())
4935 return force_reg (mode, value);
4936 else
4938 gcc_assert (x);
4939 aarch64_emit_move (x, value);
4940 return x;
4944 /* Return true if predicate value X is a constant in which every element
4945 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
4946 value, i.e. as a predicate in which all bits are significant. */
4948 static bool
4949 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
4951 if (!CONST_VECTOR_P (x))
4952 return false;
4954 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
4955 GET_MODE_NUNITS (GET_MODE (x)));
4956 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
4957 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
4958 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
4960 unsigned int nelts = const_vector_encoded_nelts (x);
4961 for (unsigned int i = 0; i < nelts; ++i)
4963 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
4964 if (!CONST_INT_P (elt))
4965 return false;
4967 builder.quick_push (elt);
4968 for (unsigned int j = 1; j < factor; ++j)
4969 builder.quick_push (const0_rtx);
4971 builder.finalize ();
4972 return true;
4975 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
4976 widest predicate element size it can have (that is, the largest size
4977 for which each element would still be 0 or 1). */
4979 unsigned int
4980 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
4982 /* Start with the most optimistic assumption: that we only need
4983 one bit per pattern. This is what we will use if only the first
4984 bit in each pattern is ever set. */
4985 unsigned int mask = GET_MODE_SIZE (DImode);
4986 mask |= builder.npatterns ();
4988 /* Look for set bits. */
4989 unsigned int nelts = builder.encoded_nelts ();
4990 for (unsigned int i = 1; i < nelts; ++i)
4991 if (INTVAL (builder.elt (i)) != 0)
4993 if (i & 1)
4994 return 1;
4995 mask |= i;
4997 return mask & -mask;
5000 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5001 return that predicate mode, otherwise return opt_machine_mode (). */
5003 opt_machine_mode
5004 aarch64_ptrue_all_mode (rtx x)
5006 gcc_assert (GET_MODE (x) == VNx16BImode);
5007 if (!CONST_VECTOR_P (x)
5008 || !CONST_VECTOR_DUPLICATE_P (x)
5009 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5010 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5011 return opt_machine_mode ();
5013 unsigned int nelts = const_vector_encoded_nelts (x);
5014 for (unsigned int i = 1; i < nelts; ++i)
5015 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5016 return opt_machine_mode ();
5018 return aarch64_sve_pred_mode (nelts);
5021 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5022 that the constant would have with predicate element size ELT_SIZE
5023 (ignoring the upper bits in each element) and return:
5025 * -1 if all bits are set
5026 * N if the predicate has N leading set bits followed by all clear bits
5027 * 0 if the predicate does not have any of these forms. */
5030 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5031 unsigned int elt_size)
5033 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5034 followed by set bits. */
5035 if (builder.nelts_per_pattern () == 3)
5036 return 0;
5038 /* Skip over leading set bits. */
5039 unsigned int nelts = builder.encoded_nelts ();
5040 unsigned int i = 0;
5041 for (; i < nelts; i += elt_size)
5042 if (INTVAL (builder.elt (i)) == 0)
5043 break;
5044 unsigned int vl = i / elt_size;
5046 /* Check for the all-true case. */
5047 if (i == nelts)
5048 return -1;
5050 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5051 repeating pattern of set bits followed by clear bits. */
5052 if (builder.nelts_per_pattern () != 2)
5053 return 0;
5055 /* We have a "foreground" value and a duplicated "background" value.
5056 If the background might repeat and the last set bit belongs to it,
5057 we might have set bits followed by clear bits followed by set bits. */
5058 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5059 return 0;
5061 /* Make sure that the rest are all clear. */
5062 for (; i < nelts; i += elt_size)
5063 if (INTVAL (builder.elt (i)) != 0)
5064 return 0;
5066 return vl;
5069 /* See if there is an svpattern that encodes an SVE predicate of mode
5070 PRED_MODE in which the first VL bits are set and the rest are clear.
5071 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5072 A VL of -1 indicates an all-true vector. */
5074 aarch64_svpattern
5075 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5077 if (vl < 0)
5078 return AARCH64_SV_ALL;
5080 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5081 return AARCH64_NUM_SVPATTERNS;
5083 if (vl >= 1 && vl <= 8)
5084 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5086 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5087 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5089 int max_vl;
5090 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5092 if (vl == (max_vl / 3) * 3)
5093 return AARCH64_SV_MUL3;
5094 /* These would only trigger for non-power-of-2 lengths. */
5095 if (vl == (max_vl & -4))
5096 return AARCH64_SV_MUL4;
5097 if (vl == (1 << floor_log2 (max_vl)))
5098 return AARCH64_SV_POW2;
5099 if (vl == max_vl)
5100 return AARCH64_SV_ALL;
5102 return AARCH64_NUM_SVPATTERNS;
5105 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5106 bits has the lowest bit set and the upper bits clear. This is the
5107 VNx16BImode equivalent of a PTRUE for controlling elements of
5108 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5109 all bits are significant, even the upper zeros. */
5112 aarch64_ptrue_all (unsigned int elt_size)
5114 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5115 builder.quick_push (const1_rtx);
5116 for (unsigned int i = 1; i < elt_size; ++i)
5117 builder.quick_push (const0_rtx);
5118 return builder.build ();
5121 /* Return an all-true predicate register of mode MODE. */
5124 aarch64_ptrue_reg (machine_mode mode)
5126 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5127 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5128 return gen_lowpart (mode, reg);
5131 /* Return an all-false predicate register of mode MODE. */
5134 aarch64_pfalse_reg (machine_mode mode)
5136 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5137 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5138 return gen_lowpart (mode, reg);
5141 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5142 for it. PRED2[0] is the predicate for the instruction whose result
5143 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5144 for it. Return true if we can prove that the two predicates are
5145 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5146 with PRED1[0] without changing behavior. */
5148 bool
5149 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5151 machine_mode mode = GET_MODE (pred1[0]);
5152 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5153 && mode == GET_MODE (pred2[0])
5154 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5155 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5157 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5158 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5159 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5160 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5161 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5164 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
5165 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5166 Use TARGET as the target register if nonnull and convenient. */
5168 static rtx
5169 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5170 machine_mode data_mode, rtx op1, rtx op2)
5172 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5173 expand_operand ops[5];
5174 create_output_operand (&ops[0], target, pred_mode);
5175 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5176 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5177 create_input_operand (&ops[3], op1, data_mode);
5178 create_input_operand (&ops[4], op2, data_mode);
5179 expand_insn (icode, 5, ops);
5180 return ops[0].value;
5183 /* Use a comparison to convert integer vector SRC into MODE, which is
5184 the corresponding SVE predicate mode. Use TARGET for the result
5185 if it's nonnull and convenient. */
5188 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5190 machine_mode src_mode = GET_MODE (src);
5191 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5192 src, CONST0_RTX (src_mode));
5195 /* Return the assembly token for svprfop value PRFOP. */
5197 static const char *
5198 svprfop_token (enum aarch64_svprfop prfop)
5200 switch (prfop)
5202 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5203 AARCH64_FOR_SVPRFOP (CASE)
5204 #undef CASE
5205 case AARCH64_NUM_SVPRFOPS:
5206 break;
5208 gcc_unreachable ();
5211 /* Return the assembly string for an SVE prefetch operation with
5212 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5213 and that SUFFIX is the format for the remaining operands. */
5215 char *
5216 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5217 const char *suffix)
5219 static char buffer[128];
5220 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5221 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5222 mnemonic, svprfop_token (prfop), suffix);
5223 gcc_assert (written < sizeof (buffer));
5224 return buffer;
5227 /* Check whether we can calculate the number of elements in PATTERN
5228 at compile time, given that there are NELTS_PER_VQ elements per
5229 128-bit block. Return the value if so, otherwise return -1. */
5231 HOST_WIDE_INT
5232 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5234 unsigned int vl, const_vg;
5235 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5236 vl = 1 + (pattern - AARCH64_SV_VL1);
5237 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5238 vl = 16 << (pattern - AARCH64_SV_VL16);
5239 else if (aarch64_sve_vg.is_constant (&const_vg))
5241 /* There are two vector granules per quadword. */
5242 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5243 switch (pattern)
5245 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5246 case AARCH64_SV_MUL4: return nelts & -4;
5247 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5248 case AARCH64_SV_ALL: return nelts;
5249 default: gcc_unreachable ();
5252 else
5253 return -1;
5255 /* There are two vector granules per quadword. */
5256 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5257 if (known_le (vl, nelts_all))
5258 return vl;
5260 /* Requesting more elements than are available results in a PFALSE. */
5261 if (known_gt (vl, nelts_all))
5262 return 0;
5264 return -1;
5267 /* Return true if we can move VALUE into a register using a single
5268 CNT[BHWD] instruction. */
5270 static bool
5271 aarch64_sve_cnt_immediate_p (poly_int64 value)
5273 HOST_WIDE_INT factor = value.coeffs[0];
5274 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5275 return (value.coeffs[1] == factor
5276 && IN_RANGE (factor, 2, 16 * 16)
5277 && (factor & 1) == 0
5278 && factor <= 16 * (factor & -factor));
5281 /* Likewise for rtx X. */
5283 bool
5284 aarch64_sve_cnt_immediate_p (rtx x)
5286 poly_int64 value;
5287 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
5290 /* Return the asm string for an instruction with a CNT-like vector size
5291 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5292 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5293 first part of the operands template (the part that comes before the
5294 vector size itself). PATTERN is the pattern to use. FACTOR is the
5295 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5296 in each quadword. If it is zero, we can use any element size. */
5298 static char *
5299 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5300 aarch64_svpattern pattern,
5301 unsigned int factor,
5302 unsigned int nelts_per_vq)
5304 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5306 if (nelts_per_vq == 0)
5307 /* There is some overlap in the ranges of the four CNT instructions.
5308 Here we always use the smallest possible element size, so that the
5309 multiplier is 1 whereever possible. */
5310 nelts_per_vq = factor & -factor;
5311 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5312 gcc_assert (IN_RANGE (shift, 1, 4));
5313 char suffix = "dwhb"[shift - 1];
5315 factor >>= shift;
5316 unsigned int written;
5317 if (pattern == AARCH64_SV_ALL && factor == 1)
5318 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5319 prefix, suffix, operands);
5320 else if (factor == 1)
5321 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5322 prefix, suffix, operands, svpattern_token (pattern));
5323 else
5324 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5325 prefix, suffix, operands, svpattern_token (pattern),
5326 factor);
5327 gcc_assert (written < sizeof (buffer));
5328 return buffer;
5331 /* Return the asm string for an instruction with a CNT-like vector size
5332 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5333 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5334 first part of the operands template (the part that comes before the
5335 vector size itself). X is the value of the vector size operand,
5336 as a polynomial integer rtx; we need to convert this into an "all"
5337 pattern with a multiplier. */
5339 char *
5340 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5341 rtx x)
5343 poly_int64 value = rtx_to_poly_int64 (x);
5344 gcc_assert (aarch64_sve_cnt_immediate_p (value));
5345 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5346 value.coeffs[1], 0);
5349 /* Return the asm string for an instruction with a CNT-like vector size
5350 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5351 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5352 first part of the operands template (the part that comes before the
5353 vector size itself). CNT_PAT[0..2] are the operands of the
5354 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5356 char *
5357 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5358 const char *operands, rtx *cnt_pat)
5360 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5361 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5362 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5363 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5364 factor, nelts_per_vq);
5367 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5369 bool
5370 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5372 poly_int64 value;
5373 return (poly_int_rtx_p (x, &value)
5374 && (aarch64_sve_cnt_immediate_p (value)
5375 || aarch64_sve_cnt_immediate_p (-value)));
5378 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5379 operand 0. */
5381 char *
5382 aarch64_output_sve_scalar_inc_dec (rtx offset)
5384 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5385 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5386 if (offset_value.coeffs[1] > 0)
5387 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5388 offset_value.coeffs[1], 0);
5389 else
5390 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5391 -offset_value.coeffs[1], 0);
5394 /* Return true if we can add VALUE to a register using a single ADDVL
5395 or ADDPL instruction. */
5397 static bool
5398 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5400 HOST_WIDE_INT factor = value.coeffs[0];
5401 if (factor == 0 || value.coeffs[1] != factor)
5402 return false;
5403 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5404 and a value of 16 is one vector width. */
5405 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5406 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5409 /* Likewise for rtx X. */
5411 bool
5412 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5414 poly_int64 value;
5415 return (poly_int_rtx_p (x, &value)
5416 && aarch64_sve_addvl_addpl_immediate_p (value));
5419 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5420 to operand 1 and storing the result in operand 0. */
5422 char *
5423 aarch64_output_sve_addvl_addpl (rtx offset)
5425 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5426 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5427 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5429 int factor = offset_value.coeffs[1];
5430 if ((factor & 15) == 0)
5431 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5432 else
5433 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5434 return buffer;
5437 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5438 instruction. If it is, store the number of elements in each vector
5439 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5440 factor in *FACTOR_OUT (if nonnull). */
5442 bool
5443 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5444 unsigned int *nelts_per_vq_out)
5446 rtx elt;
5447 poly_int64 value;
5449 if (!const_vec_duplicate_p (x, &elt)
5450 || !poly_int_rtx_p (elt, &value))
5451 return false;
5453 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5454 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5455 /* There's no vector INCB. */
5456 return false;
5458 HOST_WIDE_INT factor = value.coeffs[0];
5459 if (value.coeffs[1] != factor)
5460 return false;
5462 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5463 if ((factor % nelts_per_vq) != 0
5464 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5465 return false;
5467 if (factor_out)
5468 *factor_out = factor;
5469 if (nelts_per_vq_out)
5470 *nelts_per_vq_out = nelts_per_vq;
5471 return true;
5474 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5475 instruction. */
5477 bool
5478 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5480 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5483 /* Return the asm template for an SVE vector INC or DEC instruction.
5484 OPERANDS gives the operands before the vector count and X is the
5485 value of the vector count operand itself. */
5487 char *
5488 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5490 int factor;
5491 unsigned int nelts_per_vq;
5492 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5493 gcc_unreachable ();
5494 if (factor < 0)
5495 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5496 -factor, nelts_per_vq);
5497 else
5498 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5499 factor, nelts_per_vq);
5502 static int
5503 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5504 scalar_int_mode mode)
5506 int i;
5507 unsigned HOST_WIDE_INT val, val2, mask;
5508 int one_match, zero_match;
5509 int num_insns;
5511 val = INTVAL (imm);
5513 if (aarch64_move_imm (val, mode))
5515 if (generate)
5516 emit_insn (gen_rtx_SET (dest, imm));
5517 return 1;
5520 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
5521 (with XXXX non-zero). In that case check to see if the move can be done in
5522 a smaller mode. */
5523 val2 = val & 0xffffffff;
5524 if (mode == DImode
5525 && aarch64_move_imm (val2, SImode)
5526 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
5528 if (generate)
5529 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5531 /* Check if we have to emit a second instruction by checking to see
5532 if any of the upper 32 bits of the original DI mode value is set. */
5533 if (val == val2)
5534 return 1;
5536 i = (val >> 48) ? 48 : 32;
5538 if (generate)
5539 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5540 GEN_INT ((val >> i) & 0xffff)));
5542 return 2;
5545 if ((val >> 32) == 0 || mode == SImode)
5547 if (generate)
5549 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5550 if (mode == SImode)
5551 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5552 GEN_INT ((val >> 16) & 0xffff)));
5553 else
5554 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5555 GEN_INT ((val >> 16) & 0xffff)));
5557 return 2;
5560 /* Remaining cases are all for DImode. */
5562 mask = 0xffff;
5563 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5564 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5565 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5566 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5568 if (zero_match != 2 && one_match != 2)
5570 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
5571 For a 64-bit bitmask try whether changing 16 bits to all ones or
5572 zeroes creates a valid bitmask. To check any repeated bitmask,
5573 try using 16 bits from the other 32-bit half of val. */
5575 for (i = 0; i < 64; i += 16, mask <<= 16)
5577 val2 = val & ~mask;
5578 if (val2 != val && aarch64_bitmask_imm (val2, mode))
5579 break;
5580 val2 = val | mask;
5581 if (val2 != val && aarch64_bitmask_imm (val2, mode))
5582 break;
5583 val2 = val2 & ~mask;
5584 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
5585 if (val2 != val && aarch64_bitmask_imm (val2, mode))
5586 break;
5588 if (i != 64)
5590 if (generate)
5592 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5593 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5594 GEN_INT ((val >> i) & 0xffff)));
5596 return 2;
5600 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5601 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5602 otherwise skip zero bits. */
5604 num_insns = 1;
5605 mask = 0xffff;
5606 val2 = one_match > zero_match ? ~val : val;
5607 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5609 if (generate)
5610 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5611 ? (val | ~(mask << i))
5612 : (val & (mask << i)))));
5613 for (i += 16; i < 64; i += 16)
5615 if ((val2 & (mask << i)) == 0)
5616 continue;
5617 if (generate)
5618 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5619 GEN_INT ((val >> i) & 0xffff)));
5620 num_insns ++;
5623 return num_insns;
5626 /* Return whether imm is a 128-bit immediate which is simple enough to
5627 expand inline. */
5628 bool
5629 aarch64_mov128_immediate (rtx imm)
5631 if (CONST_INT_P (imm))
5632 return true;
5634 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5636 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5637 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5639 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5640 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5644 /* Return the number of temporary registers that aarch64_add_offset_1
5645 would need to add OFFSET to a register. */
5647 static unsigned int
5648 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5650 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5653 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5654 a non-polynomial OFFSET. MODE is the mode of the addition.
5655 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5656 be set and CFA adjustments added to the generated instructions.
5658 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5659 temporary if register allocation is already complete. This temporary
5660 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5661 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5662 the immediate again.
5664 Since this function may be used to adjust the stack pointer, we must
5665 ensure that it cannot cause transient stack deallocation (for example
5666 by first incrementing SP and then decrementing when adjusting by a
5667 large immediate). */
5669 static void
5670 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5671 rtx src, HOST_WIDE_INT offset, rtx temp1,
5672 bool frame_related_p, bool emit_move_imm)
5674 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5675 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5677 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5678 rtx_insn *insn;
5680 if (!moffset)
5682 if (!rtx_equal_p (dest, src))
5684 insn = emit_insn (gen_rtx_SET (dest, src));
5685 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5687 return;
5690 /* Single instruction adjustment. */
5691 if (aarch64_uimm12_shift (moffset))
5693 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5694 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5695 return;
5698 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
5699 and either:
5701 a) the offset cannot be loaded by a 16-bit move or
5702 b) there is no spare register into which we can move it. */
5703 if (moffset < 0x1000000
5704 && ((!temp1 && !can_create_pseudo_p ())
5705 || !aarch64_move_imm (moffset, mode)))
5707 HOST_WIDE_INT low_off = moffset & 0xfff;
5709 low_off = offset < 0 ? -low_off : low_off;
5710 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
5711 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5712 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
5713 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5714 return;
5717 /* Emit a move immediate if required and an addition/subtraction. */
5718 if (emit_move_imm)
5720 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
5721 temp1 = aarch64_force_temporary (mode, temp1,
5722 gen_int_mode (moffset, mode));
5724 insn = emit_insn (offset < 0
5725 ? gen_sub3_insn (dest, src, temp1)
5726 : gen_add3_insn (dest, src, temp1));
5727 if (frame_related_p)
5729 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5730 rtx adj = plus_constant (mode, src, offset);
5731 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
5735 /* Return the number of temporary registers that aarch64_add_offset
5736 would need to move OFFSET into a register or add OFFSET to a register;
5737 ADD_P is true if we want the latter rather than the former. */
5739 static unsigned int
5740 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
5742 /* This follows the same structure as aarch64_add_offset. */
5743 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
5744 return 0;
5746 unsigned int count = 0;
5747 HOST_WIDE_INT factor = offset.coeffs[1];
5748 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5749 poly_int64 poly_offset (factor, factor);
5750 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5751 /* Need one register for the ADDVL/ADDPL result. */
5752 count += 1;
5753 else if (factor != 0)
5755 factor = abs (factor);
5756 if (factor > 16 * (factor & -factor))
5757 /* Need one register for the CNT result and one for the multiplication
5758 factor. If necessary, the second temporary can be reused for the
5759 constant part of the offset. */
5760 return 2;
5761 /* Need one register for the CNT result (which might then
5762 be shifted). */
5763 count += 1;
5765 return count + aarch64_add_offset_1_temporaries (constant);
5768 /* If X can be represented as a poly_int64, return the number
5769 of temporaries that are required to add it to a register.
5770 Return -1 otherwise. */
5773 aarch64_add_offset_temporaries (rtx x)
5775 poly_int64 offset;
5776 if (!poly_int_rtx_p (x, &offset))
5777 return -1;
5778 return aarch64_offset_temporaries (true, offset);
5781 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
5782 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5783 be set and CFA adjustments added to the generated instructions.
5785 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5786 temporary if register allocation is already complete. This temporary
5787 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
5788 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
5789 false to avoid emitting the immediate again.
5791 TEMP2, if nonnull, is a second temporary register that doesn't
5792 overlap either DEST or REG.
5794 Since this function may be used to adjust the stack pointer, we must
5795 ensure that it cannot cause transient stack deallocation (for example
5796 by first incrementing SP and then decrementing when adjusting by a
5797 large immediate). */
5799 static void
5800 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5801 poly_int64 offset, rtx temp1, rtx temp2,
5802 bool frame_related_p, bool emit_move_imm = true)
5804 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5805 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5806 gcc_assert (temp1 == NULL_RTX
5807 || !frame_related_p
5808 || !reg_overlap_mentioned_p (temp1, dest));
5809 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
5811 /* Try using ADDVL or ADDPL to add the whole value. */
5812 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
5814 rtx offset_rtx = gen_int_mode (offset, mode);
5815 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5816 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5817 return;
5820 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
5821 SVE vector register, over and above the minimum size of 128 bits.
5822 This is equivalent to half the value returned by CNTD with a
5823 vector shape of ALL. */
5824 HOST_WIDE_INT factor = offset.coeffs[1];
5825 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5827 /* Try using ADDVL or ADDPL to add the VG-based part. */
5828 poly_int64 poly_offset (factor, factor);
5829 if (src != const0_rtx
5830 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5832 rtx offset_rtx = gen_int_mode (poly_offset, mode);
5833 if (frame_related_p)
5835 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5836 RTX_FRAME_RELATED_P (insn) = true;
5837 src = dest;
5839 else
5841 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
5842 src = aarch64_force_temporary (mode, temp1, addr);
5843 temp1 = temp2;
5844 temp2 = NULL_RTX;
5847 /* Otherwise use a CNT-based sequence. */
5848 else if (factor != 0)
5850 /* Use a subtraction if we have a negative factor. */
5851 rtx_code code = PLUS;
5852 if (factor < 0)
5854 factor = -factor;
5855 code = MINUS;
5858 /* Calculate CNTD * FACTOR / 2. First try to fold the division
5859 into the multiplication. */
5860 rtx val;
5861 int shift = 0;
5862 if (factor & 1)
5863 /* Use a right shift by 1. */
5864 shift = -1;
5865 else
5866 factor /= 2;
5867 HOST_WIDE_INT low_bit = factor & -factor;
5868 if (factor <= 16 * low_bit)
5870 if (factor > 16 * 8)
5872 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
5873 the value with the minimum multiplier and shift it into
5874 position. */
5875 int extra_shift = exact_log2 (low_bit);
5876 shift += extra_shift;
5877 factor >>= extra_shift;
5879 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
5881 else
5883 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
5884 directly, since that should increase the chances of being
5885 able to use a shift and add sequence. If LOW_BIT itself
5886 is out of range, just use CNTD. */
5887 if (low_bit <= 16 * 8)
5888 factor /= low_bit;
5889 else
5890 low_bit = 1;
5892 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
5893 val = aarch64_force_temporary (mode, temp1, val);
5895 if (can_create_pseudo_p ())
5897 rtx coeff1 = gen_int_mode (factor, mode);
5898 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
5900 else
5902 /* Go back to using a negative multiplication factor if we have
5903 no register from which to subtract. */
5904 if (code == MINUS && src == const0_rtx)
5906 factor = -factor;
5907 code = PLUS;
5909 rtx coeff1 = gen_int_mode (factor, mode);
5910 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
5911 val = gen_rtx_MULT (mode, val, coeff1);
5915 if (shift > 0)
5917 /* Multiply by 1 << SHIFT. */
5918 val = aarch64_force_temporary (mode, temp1, val);
5919 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
5921 else if (shift == -1)
5923 /* Divide by 2. */
5924 val = aarch64_force_temporary (mode, temp1, val);
5925 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
5928 /* Calculate SRC +/- CNTD * FACTOR / 2. */
5929 if (src != const0_rtx)
5931 val = aarch64_force_temporary (mode, temp1, val);
5932 val = gen_rtx_fmt_ee (code, mode, src, val);
5934 else if (code == MINUS)
5936 val = aarch64_force_temporary (mode, temp1, val);
5937 val = gen_rtx_NEG (mode, val);
5940 if (constant == 0 || frame_related_p)
5942 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
5943 if (frame_related_p)
5945 RTX_FRAME_RELATED_P (insn) = true;
5946 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5947 gen_rtx_SET (dest, plus_constant (Pmode, src,
5948 poly_offset)));
5950 src = dest;
5951 if (constant == 0)
5952 return;
5954 else
5956 src = aarch64_force_temporary (mode, temp1, val);
5957 temp1 = temp2;
5958 temp2 = NULL_RTX;
5961 emit_move_imm = true;
5964 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
5965 frame_related_p, emit_move_imm);
5968 /* Like aarch64_add_offset, but the offset is given as an rtx rather
5969 than a poly_int64. */
5971 void
5972 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5973 rtx offset_rtx, rtx temp1, rtx temp2)
5975 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
5976 temp1, temp2, false);
5979 /* Add DELTA to the stack pointer, marking the instructions frame-related.
5980 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
5981 if TEMP1 already contains abs (DELTA). */
5983 static inline void
5984 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
5986 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
5987 temp1, temp2, true, emit_move_imm);
5990 /* Subtract DELTA from the stack pointer, marking the instructions
5991 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
5992 if nonnull. */
5994 static inline void
5995 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
5996 bool emit_move_imm = true)
5998 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
5999 temp1, temp2, frame_related_p, emit_move_imm);
6002 /* Set DEST to (vec_series BASE STEP). */
6004 static void
6005 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6007 machine_mode mode = GET_MODE (dest);
6008 scalar_mode inner = GET_MODE_INNER (mode);
6010 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6011 if (!aarch64_sve_index_immediate_p (base))
6012 base = force_reg (inner, base);
6013 if (!aarch64_sve_index_immediate_p (step))
6014 step = force_reg (inner, step);
6016 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
6019 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6020 register of mode MODE. Use TARGET for the result if it's nonnull
6021 and convenient.
6023 The two vector modes must have the same element mode. The behavior
6024 is to duplicate architectural lane N of SRC into architectural lanes
6025 N + I * STEP of the result. On big-endian targets, architectural
6026 lane 0 of an Advanced SIMD vector is the last element of the vector
6027 in memory layout, so for big-endian targets this operation has the
6028 effect of reversing SRC before duplicating it. Callers need to
6029 account for this. */
6032 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6034 machine_mode src_mode = GET_MODE (src);
6035 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6036 insn_code icode = (BYTES_BIG_ENDIAN
6037 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6038 : code_for_aarch64_vec_duplicate_vq_le (mode));
6040 unsigned int i = 0;
6041 expand_operand ops[3];
6042 create_output_operand (&ops[i++], target, mode);
6043 create_output_operand (&ops[i++], src, src_mode);
6044 if (BYTES_BIG_ENDIAN)
6046 /* Create a PARALLEL describing the reversal of SRC. */
6047 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6048 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6049 nelts_per_vq - 1, -1);
6050 create_fixed_operand (&ops[i++], sel);
6052 expand_insn (icode, i, ops);
6053 return ops[0].value;
6056 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6057 the memory image into DEST. Return true on success. */
6059 static bool
6060 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6062 src = force_const_mem (GET_MODE (src), src);
6063 if (!src)
6064 return false;
6066 /* Make sure that the address is legitimate. */
6067 if (!aarch64_sve_ld1rq_operand_p (src))
6069 rtx addr = force_reg (Pmode, XEXP (src, 0));
6070 src = replace_equiv_address (src, addr);
6073 machine_mode mode = GET_MODE (dest);
6074 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6075 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6076 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6077 return true;
6080 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6081 by N "background" values. Try to move it into TARGET using:
6083 PTRUE PRED.<T>, VL<N>
6084 MOV TRUE.<T>, #<foreground>
6085 MOV FALSE.<T>, #<background>
6086 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6088 The PTRUE is always a single instruction but the MOVs might need a
6089 longer sequence. If the background value is zero (as it often is),
6090 the sequence can sometimes collapse to a PTRUE followed by a
6091 zero-predicated move.
6093 Return the target on success, otherwise return null. */
6095 static rtx
6096 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6098 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6100 /* Make sure that the PTRUE is valid. */
6101 machine_mode mode = GET_MODE (src);
6102 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6103 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6104 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6105 == AARCH64_NUM_SVPATTERNS)
6106 return NULL_RTX;
6108 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6109 rtx_vector_builder true_builder (mode, npatterns, 1);
6110 rtx_vector_builder false_builder (mode, npatterns, 1);
6111 for (unsigned int i = 0; i < npatterns; ++i)
6113 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6114 pred_builder.quick_push (CONST1_RTX (BImode));
6116 for (unsigned int i = 0; i < npatterns; ++i)
6118 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6119 pred_builder.quick_push (CONST0_RTX (BImode));
6121 expand_operand ops[4];
6122 create_output_operand (&ops[0], target, mode);
6123 create_input_operand (&ops[1], true_builder.build (), mode);
6124 create_input_operand (&ops[2], false_builder.build (), mode);
6125 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6126 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6127 return target;
6130 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6131 SVE data mode and isn't a legitimate constant. Use TARGET for the
6132 result if convenient.
6134 The returned register can have whatever mode seems most natural
6135 given the contents of SRC. */
6137 static rtx
6138 aarch64_expand_sve_const_vector (rtx target, rtx src)
6140 machine_mode mode = GET_MODE (src);
6141 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6142 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6143 scalar_mode elt_mode = GET_MODE_INNER (mode);
6144 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6145 unsigned int container_bits = aarch64_sve_container_bits (mode);
6146 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6148 if (nelts_per_pattern == 1
6149 && encoded_bits <= 128
6150 && container_bits != elt_bits)
6152 /* We have a partial vector mode and a constant whose full-vector
6153 equivalent would occupy a repeating 128-bit sequence. Build that
6154 full-vector equivalent instead, so that we have the option of
6155 using LD1RQ and Advanced SIMD operations. */
6156 unsigned int repeat = container_bits / elt_bits;
6157 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6158 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6159 for (unsigned int i = 0; i < npatterns; ++i)
6160 for (unsigned int j = 0; j < repeat; ++j)
6161 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6162 target = aarch64_target_reg (target, full_mode);
6163 return aarch64_expand_sve_const_vector (target, builder.build ());
6166 if (nelts_per_pattern == 1 && encoded_bits == 128)
6168 /* The constant is a duplicated quadword but can't be narrowed
6169 beyond a quadword. Get the memory image of the first quadword
6170 as a 128-bit vector and try using LD1RQ to load it from memory.
6172 The effect for both endiannesses is to load memory lane N into
6173 architectural lanes N + I * STEP of the result. On big-endian
6174 targets, the layout of the 128-bit vector in an Advanced SIMD
6175 register would be different from its layout in an SVE register,
6176 but this 128-bit vector is a memory value only. */
6177 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6178 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6179 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6180 return target;
6183 if (nelts_per_pattern == 1 && encoded_bits < 128)
6185 /* The vector is a repeating sequence of 64 bits or fewer.
6186 See if we can load them using an Advanced SIMD move and then
6187 duplicate it to fill a vector. This is better than using a GPR
6188 move because it keeps everything in the same register file. */
6189 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6190 rtx_vector_builder builder (vq_mode, npatterns, 1);
6191 for (unsigned int i = 0; i < npatterns; ++i)
6193 /* We want memory lane N to go into architectural lane N,
6194 so reverse for big-endian targets. The DUP .Q pattern
6195 has a compensating reverse built-in. */
6196 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6197 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6199 rtx vq_src = builder.build ();
6200 if (aarch64_simd_valid_immediate (vq_src, NULL))
6202 vq_src = force_reg (vq_mode, vq_src);
6203 return aarch64_expand_sve_dupq (target, mode, vq_src);
6206 /* Get an integer representation of the repeating part of Advanced
6207 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6208 which for big-endian targets is lane-swapped wrt a normal
6209 Advanced SIMD vector. This means that for both endiannesses,
6210 memory lane N of SVE vector SRC corresponds to architectural
6211 lane N of a register holding VQ_SRC. This in turn means that
6212 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6213 as a single 128-bit value) and thus that memory lane 0 of SRC is
6214 in the lsb of the integer. Duplicating the integer therefore
6215 ensures that memory lane N of SRC goes into architectural lane
6216 N + I * INDEX of the SVE register. */
6217 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6218 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6219 if (elt_value)
6221 /* Pretend that we had a vector of INT_MODE to start with. */
6222 elt_mode = int_mode;
6223 mode = aarch64_full_sve_mode (int_mode).require ();
6225 /* If the integer can be moved into a general register by a
6226 single instruction, do that and duplicate the result. */
6227 if (CONST_INT_P (elt_value)
6228 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
6230 elt_value = force_reg (elt_mode, elt_value);
6231 return expand_vector_broadcast (mode, elt_value);
6234 else if (npatterns == 1)
6235 /* We're duplicating a single value, but can't do better than
6236 force it to memory and load from there. This handles things
6237 like symbolic constants. */
6238 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6240 if (elt_value)
6242 /* Load the element from memory if we can, otherwise move it into
6243 a register and use a DUP. */
6244 rtx op = force_const_mem (elt_mode, elt_value);
6245 if (!op)
6246 op = force_reg (elt_mode, elt_value);
6247 return expand_vector_broadcast (mode, op);
6251 /* Try using INDEX. */
6252 rtx base, step;
6253 if (const_vec_series_p (src, &base, &step))
6255 aarch64_expand_vec_series (target, base, step);
6256 return target;
6259 /* From here on, it's better to force the whole constant to memory
6260 if we can. */
6261 if (GET_MODE_NUNITS (mode).is_constant ())
6262 return NULL_RTX;
6264 if (nelts_per_pattern == 2)
6265 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6266 return res;
6268 /* Expand each pattern individually. */
6269 gcc_assert (npatterns > 1);
6270 rtx_vector_builder builder;
6271 auto_vec<rtx, 16> vectors (npatterns);
6272 for (unsigned int i = 0; i < npatterns; ++i)
6274 builder.new_vector (mode, 1, nelts_per_pattern);
6275 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6276 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6277 vectors.quick_push (force_reg (mode, builder.build ()));
6280 /* Use permutes to interleave the separate vectors. */
6281 while (npatterns > 1)
6283 npatterns /= 2;
6284 for (unsigned int i = 0; i < npatterns; ++i)
6286 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6287 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6288 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6289 vectors[i] = tmp;
6292 gcc_assert (vectors[0] == target);
6293 return target;
6296 /* Use WHILE to set a predicate register of mode MODE in which the first
6297 VL bits are set and the rest are clear. Use TARGET for the register
6298 if it's nonnull and convenient. */
6300 static rtx
6301 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6302 unsigned int vl)
6304 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6305 target = aarch64_target_reg (target, mode);
6306 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6307 target, const0_rtx, limit));
6308 return target;
6311 static rtx
6312 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6314 /* BUILDER is a constant predicate in which the index of every set bit
6315 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6316 by inverting every element at a multiple of ELT_SIZE and EORing the
6317 result with an ELT_SIZE PTRUE.
6319 Return a register that contains the constant on success, otherwise
6320 return null. Use TARGET as the register if it is nonnull and
6321 convenient. */
6323 static rtx
6324 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6325 unsigned int elt_size)
6327 /* Invert every element at a multiple of ELT_SIZE, keeping the
6328 other bits zero. */
6329 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6330 builder.nelts_per_pattern ());
6331 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6332 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6333 inv_builder.quick_push (const1_rtx);
6334 else
6335 inv_builder.quick_push (const0_rtx);
6336 inv_builder.finalize ();
6338 /* See if we can load the constant cheaply. */
6339 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6340 if (!inv)
6341 return NULL_RTX;
6343 /* EOR the result with an ELT_SIZE PTRUE. */
6344 rtx mask = aarch64_ptrue_all (elt_size);
6345 mask = force_reg (VNx16BImode, mask);
6346 inv = gen_lowpart (VNx16BImode, inv);
6347 target = aarch64_target_reg (target, VNx16BImode);
6348 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6349 return target;
6352 /* BUILDER is a constant predicate in which the index of every set bit
6353 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6354 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6355 register on success, otherwise return null. Use TARGET as the register
6356 if nonnull and convenient. */
6358 static rtx
6359 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6360 unsigned int elt_size,
6361 unsigned int permute_size)
6363 /* We're going to split the constant into two new constants A and B,
6364 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6365 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6367 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6368 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6370 where _ indicates elements that will be discarded by the permute.
6372 First calculate the ELT_SIZEs for A and B. */
6373 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6374 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6375 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6376 if (INTVAL (builder.elt (i)) != 0)
6378 if (i & permute_size)
6379 b_elt_size |= i - permute_size;
6380 else
6381 a_elt_size |= i;
6383 a_elt_size &= -a_elt_size;
6384 b_elt_size &= -b_elt_size;
6386 /* Now construct the vectors themselves. */
6387 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6388 builder.nelts_per_pattern ());
6389 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6390 builder.nelts_per_pattern ());
6391 unsigned int nelts = builder.encoded_nelts ();
6392 for (unsigned int i = 0; i < nelts; ++i)
6393 if (i & (elt_size - 1))
6395 a_builder.quick_push (const0_rtx);
6396 b_builder.quick_push (const0_rtx);
6398 else if ((i & permute_size) == 0)
6400 /* The A and B elements are significant. */
6401 a_builder.quick_push (builder.elt (i));
6402 b_builder.quick_push (builder.elt (i + permute_size));
6404 else
6406 /* The A and B elements are going to be discarded, so pick whatever
6407 is likely to give a nice constant. We are targeting element
6408 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6409 with the aim of each being a sequence of ones followed by
6410 a sequence of zeros. So:
6412 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6413 duplicate the last X_ELT_SIZE element, to extend the
6414 current sequence of ones or zeros.
6416 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6417 zero, so that the constant really does have X_ELT_SIZE and
6418 not a smaller size. */
6419 if (a_elt_size > permute_size)
6420 a_builder.quick_push (const0_rtx);
6421 else
6422 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6423 if (b_elt_size > permute_size)
6424 b_builder.quick_push (const0_rtx);
6425 else
6426 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6428 a_builder.finalize ();
6429 b_builder.finalize ();
6431 /* Try loading A into a register. */
6432 rtx_insn *last = get_last_insn ();
6433 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6434 if (!a)
6435 return NULL_RTX;
6437 /* Try loading B into a register. */
6438 rtx b = a;
6439 if (a_builder != b_builder)
6441 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6442 if (!b)
6444 delete_insns_since (last);
6445 return NULL_RTX;
6449 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6450 operands but permutes them as though they had mode MODE. */
6451 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6452 target = aarch64_target_reg (target, GET_MODE (a));
6453 rtx type_reg = CONST0_RTX (mode);
6454 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6455 return target;
6458 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6459 constant in BUILDER into an SVE predicate register. Return the register
6460 on success, otherwise return null. Use TARGET for the register if
6461 nonnull and convenient.
6463 ALLOW_RECURSE_P is true if we can use methods that would call this
6464 function recursively. */
6466 static rtx
6467 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6468 bool allow_recurse_p)
6470 if (builder.encoded_nelts () == 1)
6471 /* A PFALSE or a PTRUE .B ALL. */
6472 return aarch64_emit_set_immediate (target, builder);
6474 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6475 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6477 /* If we can load the constant using PTRUE, use it as-is. */
6478 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6479 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6480 return aarch64_emit_set_immediate (target, builder);
6482 /* Otherwise use WHILE to set the first VL bits. */
6483 return aarch64_sve_move_pred_via_while (target, mode, vl);
6486 if (!allow_recurse_p)
6487 return NULL_RTX;
6489 /* Try inverting the vector in element size ELT_SIZE and then EORing
6490 the result with an ELT_SIZE PTRUE. */
6491 if (INTVAL (builder.elt (0)) == 0)
6492 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6493 elt_size))
6494 return res;
6496 /* Try using TRN1 to permute two simpler constants. */
6497 for (unsigned int i = elt_size; i <= 8; i *= 2)
6498 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6499 elt_size, i))
6500 return res;
6502 return NULL_RTX;
6505 /* Return an SVE predicate register that contains the VNx16BImode
6506 constant in BUILDER, without going through the move expanders.
6508 The returned register can have whatever mode seems most natural
6509 given the contents of BUILDER. Use TARGET for the result if
6510 convenient. */
6512 static rtx
6513 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6515 /* Try loading the constant using pure predicate operations. */
6516 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6517 return res;
6519 /* Try forcing the constant to memory. */
6520 if (builder.full_nelts ().is_constant ())
6521 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6523 target = aarch64_target_reg (target, VNx16BImode);
6524 emit_move_insn (target, mem);
6525 return target;
6528 /* The last resort is to load the constant as an integer and then
6529 compare it against zero. Use -1 for set bits in order to increase
6530 the changes of using SVE DUPM or an Advanced SIMD byte mask. */
6531 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6532 builder.nelts_per_pattern ());
6533 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6534 int_builder.quick_push (INTVAL (builder.elt (i))
6535 ? constm1_rtx : const0_rtx);
6536 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6537 int_builder.build ());
6540 /* Set DEST to immediate IMM. */
6542 void
6543 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6545 machine_mode mode = GET_MODE (dest);
6547 /* Check on what type of symbol it is. */
6548 scalar_int_mode int_mode;
6549 if ((SYMBOL_REF_P (imm)
6550 || LABEL_REF_P (imm)
6551 || GET_CODE (imm) == CONST
6552 || GET_CODE (imm) == CONST_POLY_INT)
6553 && is_a <scalar_int_mode> (mode, &int_mode))
6555 rtx mem;
6556 poly_int64 offset;
6557 HOST_WIDE_INT const_offset;
6558 enum aarch64_symbol_type sty;
6560 /* If we have (const (plus symbol offset)), separate out the offset
6561 before we start classifying the symbol. */
6562 rtx base = strip_offset (imm, &offset);
6564 /* We must always add an offset involving VL separately, rather than
6565 folding it into the relocation. */
6566 if (!offset.is_constant (&const_offset))
6568 if (!TARGET_SVE)
6570 aarch64_report_sve_required ();
6571 return;
6573 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6574 emit_insn (gen_rtx_SET (dest, imm));
6575 else
6577 /* Do arithmetic on 32-bit values if the result is smaller
6578 than that. */
6579 if (partial_subreg_p (int_mode, SImode))
6581 /* It is invalid to do symbol calculations in modes
6582 narrower than SImode. */
6583 gcc_assert (base == const0_rtx);
6584 dest = gen_lowpart (SImode, dest);
6585 int_mode = SImode;
6587 if (base != const0_rtx)
6589 base = aarch64_force_temporary (int_mode, dest, base);
6590 aarch64_add_offset (int_mode, dest, base, offset,
6591 NULL_RTX, NULL_RTX, false);
6593 else
6594 aarch64_add_offset (int_mode, dest, base, offset,
6595 dest, NULL_RTX, false);
6597 return;
6600 sty = aarch64_classify_symbol (base, const_offset);
6601 switch (sty)
6603 case SYMBOL_FORCE_TO_MEM:
6604 if (int_mode != ptr_mode)
6605 imm = convert_memory_address (ptr_mode, imm);
6607 if (const_offset != 0
6608 && targetm.cannot_force_const_mem (ptr_mode, imm))
6610 gcc_assert (can_create_pseudo_p ());
6611 base = aarch64_force_temporary (int_mode, dest, base);
6612 aarch64_add_offset (int_mode, dest, base, const_offset,
6613 NULL_RTX, NULL_RTX, false);
6614 return;
6617 mem = force_const_mem (ptr_mode, imm);
6618 gcc_assert (mem);
6620 /* If we aren't generating PC relative literals, then
6621 we need to expand the literal pool access carefully.
6622 This is something that needs to be done in a number
6623 of places, so could well live as a separate function. */
6624 if (!aarch64_pcrelative_literal_loads)
6626 gcc_assert (can_create_pseudo_p ());
6627 base = gen_reg_rtx (ptr_mode);
6628 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6629 if (ptr_mode != Pmode)
6630 base = convert_memory_address (Pmode, base);
6631 mem = gen_rtx_MEM (ptr_mode, base);
6634 if (int_mode != ptr_mode)
6635 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6637 emit_insn (gen_rtx_SET (dest, mem));
6639 return;
6641 case SYMBOL_SMALL_TLSGD:
6642 case SYMBOL_SMALL_TLSDESC:
6643 case SYMBOL_SMALL_TLSIE:
6644 case SYMBOL_SMALL_GOT_28K:
6645 case SYMBOL_SMALL_GOT_4G:
6646 case SYMBOL_TINY_GOT:
6647 case SYMBOL_TINY_TLSIE:
6648 if (const_offset != 0)
6650 gcc_assert(can_create_pseudo_p ());
6651 base = aarch64_force_temporary (int_mode, dest, base);
6652 aarch64_add_offset (int_mode, dest, base, const_offset,
6653 NULL_RTX, NULL_RTX, false);
6654 return;
6656 /* FALLTHRU */
6658 case SYMBOL_SMALL_ABSOLUTE:
6659 case SYMBOL_TINY_ABSOLUTE:
6660 case SYMBOL_TLSLE12:
6661 case SYMBOL_TLSLE24:
6662 case SYMBOL_TLSLE32:
6663 case SYMBOL_TLSLE48:
6664 aarch64_load_symref_appropriately (dest, imm, sty);
6665 return;
6667 default:
6668 gcc_unreachable ();
6672 if (!CONST_INT_P (imm))
6674 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6676 /* Only the low bit of each .H, .S and .D element is defined,
6677 so we can set the upper bits to whatever we like. If the
6678 predicate is all-true in MODE, prefer to set all the undefined
6679 bits as well, so that we can share a single .B predicate for
6680 all modes. */
6681 if (imm == CONSTM1_RTX (mode))
6682 imm = CONSTM1_RTX (VNx16BImode);
6684 /* All methods for constructing predicate modes wider than VNx16BI
6685 will set the upper bits of each element to zero. Expose this
6686 by moving such constants as a VNx16BI, so that all bits are
6687 significant and so that constants for different modes can be
6688 shared. The wider constant will still be available as a
6689 REG_EQUAL note. */
6690 rtx_vector_builder builder;
6691 if (aarch64_get_sve_pred_bits (builder, imm))
6693 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6694 if (dest != res)
6695 emit_move_insn (dest, gen_lowpart (mode, res));
6696 return;
6700 if (GET_CODE (imm) == HIGH
6701 || aarch64_simd_valid_immediate (imm, NULL))
6703 emit_insn (gen_rtx_SET (dest, imm));
6704 return;
6707 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6708 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6710 if (dest != res)
6711 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6712 return;
6715 rtx mem = force_const_mem (mode, imm);
6716 gcc_assert (mem);
6717 emit_move_insn (dest, mem);
6718 return;
6721 aarch64_internal_mov_immediate (dest, imm, true,
6722 as_a <scalar_int_mode> (mode));
6725 /* Return the MEM rtx that provides the canary value that should be used
6726 for stack-smashing protection. MODE is the mode of the memory.
6727 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6728 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6729 indicates whether the caller is performing a SET or a TEST operation. */
6732 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6733 aarch64_salt_type salt_type)
6735 rtx addr;
6736 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6738 gcc_assert (MEM_P (decl_rtl));
6739 addr = XEXP (decl_rtl, 0);
6740 poly_int64 offset;
6741 rtx base = strip_offset_and_salt (addr, &offset);
6742 if (!SYMBOL_REF_P (base))
6743 return decl_rtl;
6745 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6746 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6747 addr = gen_rtx_CONST (Pmode, addr);
6748 addr = plus_constant (Pmode, addr, offset);
6750 else
6752 /* Calculate the address from the system register. */
6753 rtx salt = GEN_INT (salt_type);
6754 addr = gen_reg_rtx (mode);
6755 if (mode == DImode)
6756 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6757 else
6759 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6760 addr = convert_memory_address (Pmode, addr);
6762 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6764 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6767 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6768 that is known to contain PTRUE. */
6770 void
6771 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6773 expand_operand ops[3];
6774 machine_mode mode = GET_MODE (dest);
6775 create_output_operand (&ops[0], dest, mode);
6776 create_input_operand (&ops[1], pred, GET_MODE(pred));
6777 create_input_operand (&ops[2], src, mode);
6778 temporary_volatile_ok v (true);
6779 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6782 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6783 operand is in memory. In this case we need to use the predicated LD1
6784 and ST1 instead of LDR and STR, both for correctness on big-endian
6785 targets and because LD1 and ST1 support a wider range of addressing modes.
6786 PRED_MODE is the mode of the predicate.
6788 See the comment at the head of aarch64-sve.md for details about the
6789 big-endian handling. */
6791 void
6792 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6794 machine_mode mode = GET_MODE (dest);
6795 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6796 if (!register_operand (src, mode)
6797 && !register_operand (dest, mode))
6799 rtx tmp = gen_reg_rtx (mode);
6800 if (MEM_P (src))
6801 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6802 else
6803 emit_move_insn (tmp, src);
6804 src = tmp;
6806 aarch64_emit_sve_pred_move (dest, ptrue, src);
6809 /* Called only on big-endian targets. See whether an SVE vector move
6810 from SRC to DEST is effectively a REV[BHW] instruction, because at
6811 least one operand is a subreg of an SVE vector that has wider or
6812 narrower elements. Return true and emit the instruction if so.
6814 For example:
6816 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6818 represents a VIEW_CONVERT between the following vectors, viewed
6819 in memory order:
6821 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6822 R1: { [0], [1], [2], [3], ... }
6824 The high part of lane X in R2 should therefore correspond to lane X*2
6825 of R1, but the register representations are:
6827 msb lsb
6828 R2: ...... [1].high [1].low [0].high [0].low
6829 R1: ...... [3] [2] [1] [0]
6831 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6832 We therefore need a reverse operation to swap the high and low values
6833 around.
6835 This is purely an optimization. Without it we would spill the
6836 subreg operand to the stack in one mode and reload it in the
6837 other mode, which has the same effect as the REV. */
6839 bool
6840 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6842 gcc_assert (BYTES_BIG_ENDIAN);
6844 /* Do not try to optimize subregs that LRA has created for matched
6845 reloads. These subregs only exist as a temporary measure to make
6846 the RTL well-formed, but they are exempt from the usual
6847 TARGET_CAN_CHANGE_MODE_CLASS rules.
6849 For example, if we have:
6851 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6853 and the constraints require R1 and R2 to be in the same register,
6854 LRA may need to create RTL such as:
6856 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6857 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6858 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6860 which forces both the input and output of the original instruction
6861 to use the same hard register. But for this to work, the normal
6862 rules have to be suppressed on the subreg input, otherwise LRA
6863 would need to reload that input too, meaning that the process
6864 would never terminate. To compensate for this, the normal rules
6865 are also suppressed for the subreg output of the first move.
6866 Ignoring the special case and handling the first move normally
6867 would therefore generate wrong code: we would reverse the elements
6868 for the first subreg but not reverse them back for the second subreg. */
6869 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6870 dest = SUBREG_REG (dest);
6871 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6872 src = SUBREG_REG (src);
6874 /* The optimization handles two single SVE REGs with different element
6875 sizes. */
6876 if (!REG_P (dest)
6877 || !REG_P (src)
6878 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6879 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6880 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6881 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6882 return false;
6884 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6885 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6886 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6887 UNSPEC_REV_SUBREG);
6888 emit_insn (gen_rtx_SET (dest, unspec));
6889 return true;
6892 /* Return a copy of X with mode MODE, without changing its other
6893 attributes. Unlike gen_lowpart, this doesn't care whether the
6894 mode change is valid. */
6897 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6899 if (GET_MODE (x) == mode)
6900 return x;
6902 x = shallow_copy_rtx (x);
6903 set_mode_and_regno (x, mode, REGNO (x));
6904 return x;
6907 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
6908 stored in wider integer containers. */
6910 static unsigned int
6911 aarch64_sve_rev_unspec (machine_mode mode)
6913 switch (GET_MODE_UNIT_SIZE (mode))
6915 case 1: return UNSPEC_REVB;
6916 case 2: return UNSPEC_REVH;
6917 case 4: return UNSPEC_REVW;
6919 gcc_unreachable ();
6922 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6923 operands. */
6925 void
6926 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6928 /* Decide which REV operation we need. The mode with wider elements
6929 determines the mode of the operands and the mode with the narrower
6930 elements determines the reverse width. */
6931 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6932 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6933 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6934 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6935 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6937 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6938 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6940 /* Get the operands in the appropriate modes and emit the instruction. */
6941 ptrue = gen_lowpart (pred_mode, ptrue);
6942 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6943 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6944 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6945 dest, ptrue, src));
6948 static bool
6949 aarch64_function_ok_for_sibcall (tree, tree exp)
6951 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6952 return false;
6954 return true;
6957 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6958 passed in SVE registers. */
6960 static bool
6961 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6962 const function_arg_info &arg)
6964 HOST_WIDE_INT size;
6965 machine_mode dummymode;
6966 int nregs;
6968 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6969 if (arg.mode == BLKmode && arg.type)
6970 size = int_size_in_bytes (arg.type);
6971 else
6972 /* No frontends can create types with variable-sized modes, so we
6973 shouldn't be asked to pass or return them. */
6974 size = GET_MODE_SIZE (arg.mode).to_constant ();
6976 /* Aggregates are passed by reference based on their size. */
6977 if (arg.aggregate_type_p ())
6978 size = int_size_in_bytes (arg.type);
6980 /* Variable sized arguments are always returned by reference. */
6981 if (size < 0)
6982 return true;
6984 /* Can this be a candidate to be passed in fp/simd register(s)? */
6985 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6986 &dummymode, &nregs, NULL,
6987 !pcum || pcum->silent_p))
6988 return false;
6990 /* Arguments which are variable sized or larger than 2 registers are
6991 passed by reference unless they are a homogenous floating point
6992 aggregate. */
6993 return size > 2 * UNITS_PER_WORD;
6996 /* Implement TARGET_PASS_BY_REFERENCE. */
6998 static bool
6999 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7000 const function_arg_info &arg)
7002 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7004 if (!arg.type)
7005 return aarch64_pass_by_reference_1 (pcum, arg);
7007 pure_scalable_type_info pst_info;
7008 switch (pst_info.analyze (arg.type))
7010 case pure_scalable_type_info::IS_PST:
7011 if (pcum && !pcum->silent_p && !TARGET_SVE)
7012 /* We can't gracefully recover at this point, so make this a
7013 fatal error. */
7014 fatal_error (input_location, "arguments of type %qT require"
7015 " the SVE ISA extension", arg.type);
7017 /* Variadic SVE types are passed by reference. Normal non-variadic
7018 arguments are too if we've run out of registers. */
7019 return (!arg.named
7020 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7021 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7023 case pure_scalable_type_info::DOESNT_MATTER:
7024 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7025 return true;
7027 case pure_scalable_type_info::NO_ABI_IDENTITY:
7028 case pure_scalable_type_info::ISNT_PST:
7029 return aarch64_pass_by_reference_1 (pcum, arg);
7031 gcc_unreachable ();
7034 /* Return TRUE if VALTYPE is padded to its least significant bits. */
7035 static bool
7036 aarch64_return_in_msb (const_tree valtype)
7038 machine_mode dummy_mode;
7039 int dummy_int;
7041 /* Never happens in little-endian mode. */
7042 if (!BYTES_BIG_ENDIAN)
7043 return false;
7045 /* Only composite types smaller than or equal to 16 bytes can
7046 be potentially returned in registers. */
7047 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7048 || int_size_in_bytes (valtype) <= 0
7049 || int_size_in_bytes (valtype) > 16)
7050 return false;
7052 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7053 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7054 is always passed/returned in the least significant bits of fp/simd
7055 register(s). */
7056 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7057 &dummy_mode, &dummy_int, NULL,
7058 false))
7059 return false;
7061 /* Likewise pure scalable types for SVE vector and predicate registers. */
7062 pure_scalable_type_info pst_info;
7063 if (pst_info.analyze_registers (valtype))
7064 return false;
7066 return true;
7069 /* Implement TARGET_FUNCTION_VALUE.
7070 Define how to find the value returned by a function. */
7072 static rtx
7073 aarch64_function_value (const_tree type, const_tree func,
7074 bool outgoing ATTRIBUTE_UNUSED)
7076 machine_mode mode;
7077 int unsignedp;
7079 mode = TYPE_MODE (type);
7080 if (INTEGRAL_TYPE_P (type))
7081 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7083 pure_scalable_type_info pst_info;
7084 if (type && pst_info.analyze_registers (type))
7085 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7087 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7088 are returned in memory, not by value. */
7089 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7090 bool sve_p = (vec_flags & VEC_ANY_SVE);
7092 if (aarch64_return_in_msb (type))
7094 HOST_WIDE_INT size = int_size_in_bytes (type);
7096 if (size % UNITS_PER_WORD != 0)
7098 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7099 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7103 int count;
7104 machine_mode ag_mode;
7105 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7106 NULL, false))
7108 gcc_assert (!sve_p);
7109 if (!aarch64_composite_type_p (type, mode))
7111 gcc_assert (count == 1 && mode == ag_mode);
7112 return gen_rtx_REG (mode, V0_REGNUM);
7114 else if (aarch64_advsimd_full_struct_mode_p (mode)
7115 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7116 return gen_rtx_REG (mode, V0_REGNUM);
7117 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7118 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7119 return gen_rtx_REG (mode, V0_REGNUM);
7120 else
7122 int i;
7123 rtx par;
7125 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7126 for (i = 0; i < count; i++)
7128 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7129 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7130 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7131 XVECEXP (par, 0, i) = tmp;
7133 return par;
7136 else
7138 if (sve_p)
7140 /* Vector types can acquire a partial SVE mode using things like
7141 __attribute__((vector_size(N))), and this is potentially useful.
7142 However, the choice of mode doesn't affect the type's ABI
7143 identity, so we should treat the types as though they had
7144 the associated integer mode, just like they did before SVE
7145 was introduced.
7147 We know that the vector must be 128 bits or smaller,
7148 otherwise we'd have returned it in memory instead. */
7149 gcc_assert (type
7150 && (aarch64_some_values_include_pst_objects_p (type)
7151 || (vec_flags & VEC_PARTIAL)));
7153 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7154 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7155 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7156 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7158 return gen_rtx_REG (mode, R0_REGNUM);
7162 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7163 Return true if REGNO is the number of a hard register in which the values
7164 of called function may come back. */
7166 static bool
7167 aarch64_function_value_regno_p (const unsigned int regno)
7169 /* Maximum of 16 bytes can be returned in the general registers. Examples
7170 of 16-byte return values are: 128-bit integers and 16-byte small
7171 structures (excluding homogeneous floating-point aggregates). */
7172 if (regno == R0_REGNUM || regno == R1_REGNUM)
7173 return true;
7175 /* Up to four fp/simd registers can return a function value, e.g. a
7176 homogeneous floating-point aggregate having four members. */
7177 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7178 return TARGET_FLOAT;
7180 return false;
7183 /* Subroutine for aarch64_return_in_memory for types that are not returned
7184 in SVE registers. */
7186 static bool
7187 aarch64_return_in_memory_1 (const_tree type)
7189 HOST_WIDE_INT size;
7190 machine_mode ag_mode;
7191 int count;
7193 if (!AGGREGATE_TYPE_P (type)
7194 && TREE_CODE (type) != COMPLEX_TYPE
7195 && TREE_CODE (type) != VECTOR_TYPE)
7196 /* Simple scalar types always returned in registers. */
7197 return false;
7199 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7200 &ag_mode, &count, NULL, false))
7201 return false;
7203 /* Types larger than 2 registers returned in memory. */
7204 size = int_size_in_bytes (type);
7205 return (size < 0 || size > 2 * UNITS_PER_WORD);
7208 /* Implement TARGET_RETURN_IN_MEMORY.
7210 If the type T of the result of a function is such that
7211 void func (T arg)
7212 would require that arg be passed as a value in a register (or set of
7213 registers) according to the parameter passing rules, then the result
7214 is returned in the same registers as would be used for such an
7215 argument. */
7217 static bool
7218 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7220 pure_scalable_type_info pst_info;
7221 switch (pst_info.analyze (type))
7223 case pure_scalable_type_info::IS_PST:
7224 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7225 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7227 case pure_scalable_type_info::DOESNT_MATTER:
7228 gcc_assert (aarch64_return_in_memory_1 (type));
7229 return true;
7231 case pure_scalable_type_info::NO_ABI_IDENTITY:
7232 case pure_scalable_type_info::ISNT_PST:
7233 return aarch64_return_in_memory_1 (type);
7235 gcc_unreachable ();
7238 static bool
7239 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7240 const_tree type, int *nregs)
7242 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7243 return aarch64_vfp_is_call_or_return_candidate (mode, type,
7244 &pcum->aapcs_vfp_rmode,
7245 nregs, NULL, pcum->silent_p);
7248 /* Given MODE and TYPE of a function argument, return the alignment in
7249 bits. The idea is to suppress any stronger alignment requested by
7250 the user and opt for the natural alignment (specified in AAPCS64 \S
7251 4.1). ABI_BREAK is set to true if the alignment was incorrectly
7252 calculated in versions of GCC prior to GCC-9. This is a helper
7253 function for local use only. */
7255 static unsigned int
7256 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7257 unsigned int *abi_break)
7259 *abi_break = 0;
7260 if (!type)
7261 return GET_MODE_ALIGNMENT (mode);
7263 if (integer_zerop (TYPE_SIZE (type)))
7264 return 0;
7266 gcc_assert (TYPE_MODE (type) == mode);
7268 if (!AGGREGATE_TYPE_P (type))
7269 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
7271 if (TREE_CODE (type) == ARRAY_TYPE)
7272 return TYPE_ALIGN (TREE_TYPE (type));
7274 unsigned int alignment = 0;
7275 unsigned int bitfield_alignment = 0;
7276 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7277 if (TREE_CODE (field) == FIELD_DECL)
7279 /* Note that we explicitly consider zero-sized fields here,
7280 even though they don't map to AAPCS64 machine types.
7281 For example, in:
7283 struct __attribute__((aligned(8))) empty {};
7285 struct s {
7286 [[no_unique_address]] empty e;
7287 int x;
7290 "s" contains only one Fundamental Data Type (the int field)
7291 but gains 8-byte alignment and size thanks to "e". */
7292 alignment = std::max (alignment, DECL_ALIGN (field));
7293 if (DECL_BIT_FIELD_TYPE (field))
7294 bitfield_alignment
7295 = std::max (bitfield_alignment,
7296 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7299 if (bitfield_alignment > alignment)
7301 *abi_break = alignment;
7302 return bitfield_alignment;
7305 return alignment;
7308 /* Layout a function argument according to the AAPCS64 rules. The rule
7309 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7310 mode that was originally given to us by the target hook, whereas the
7311 mode in ARG might be the result of replacing partial SVE modes with
7312 the equivalent integer mode. */
7314 static void
7315 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7317 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7318 tree type = arg.type;
7319 machine_mode mode = arg.mode;
7320 int ncrn, nvrn, nregs;
7321 bool allocate_ncrn, allocate_nvrn;
7322 HOST_WIDE_INT size;
7323 unsigned int abi_break;
7325 /* We need to do this once per argument. */
7326 if (pcum->aapcs_arg_processed)
7327 return;
7329 pcum->aapcs_arg_processed = true;
7331 pure_scalable_type_info pst_info;
7332 if (type && pst_info.analyze_registers (type))
7334 /* The PCS says that it is invalid to pass an SVE value to an
7335 unprototyped function. There is no ABI-defined location we
7336 can return in this case, so we have no real choice but to raise
7337 an error immediately, even though this is only a query function. */
7338 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7340 gcc_assert (!pcum->silent_p);
7341 error ("SVE type %qT cannot be passed to an unprototyped function",
7342 arg.type);
7343 /* Avoid repeating the message, and avoid tripping the assert
7344 below. */
7345 pcum->pcs_variant = ARM_PCS_SVE;
7348 /* We would have converted the argument into pass-by-reference
7349 form if it didn't fit in registers. */
7350 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7351 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7352 gcc_assert (arg.named
7353 && pcum->pcs_variant == ARM_PCS_SVE
7354 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7355 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7356 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7357 P0_REGNUM + pcum->aapcs_nprn);
7358 return;
7361 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7362 are passed by reference, not by value. */
7363 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7364 bool sve_p = (vec_flags & VEC_ANY_SVE);
7365 if (sve_p)
7366 /* Vector types can acquire a partial SVE mode using things like
7367 __attribute__((vector_size(N))), and this is potentially useful.
7368 However, the choice of mode doesn't affect the type's ABI
7369 identity, so we should treat the types as though they had
7370 the associated integer mode, just like they did before SVE
7371 was introduced.
7373 We know that the vector must be 128 bits or smaller,
7374 otherwise we'd have passed it in memory instead. */
7375 gcc_assert (type
7376 && (aarch64_some_values_include_pst_objects_p (type)
7377 || (vec_flags & VEC_PARTIAL)));
7379 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
7380 if (type)
7381 size = int_size_in_bytes (type);
7382 else
7383 /* No frontends can create types with variable-sized modes, so we
7384 shouldn't be asked to pass or return them. */
7385 size = GET_MODE_SIZE (mode).to_constant ();
7386 size = ROUND_UP (size, UNITS_PER_WORD);
7388 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7389 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7390 mode,
7391 type,
7392 &nregs);
7393 gcc_assert (!sve_p || !allocate_nvrn);
7395 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
7396 The following code thus handles passing by SIMD/FP registers first. */
7398 nvrn = pcum->aapcs_nvrn;
7400 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
7401 and homogenous short-vector aggregates (HVA). */
7402 if (allocate_nvrn)
7404 if (!pcum->silent_p && !TARGET_FLOAT)
7405 aarch64_err_no_fpadvsimd (mode);
7407 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7409 pcum->aapcs_nextnvrn = nvrn + nregs;
7410 if (!aarch64_composite_type_p (type, mode))
7412 gcc_assert (nregs == 1);
7413 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7415 else if (aarch64_advsimd_full_struct_mode_p (mode)
7416 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7417 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7418 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7419 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7420 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7421 else
7423 rtx par;
7424 int i;
7425 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7426 for (i = 0; i < nregs; i++)
7428 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7429 V0_REGNUM + nvrn + i);
7430 rtx offset = gen_int_mode
7431 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7432 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7433 XVECEXP (par, 0, i) = tmp;
7435 pcum->aapcs_reg = par;
7437 return;
7439 else
7441 /* C.3 NSRN is set to 8. */
7442 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7443 goto on_stack;
7447 ncrn = pcum->aapcs_ncrn;
7448 nregs = size / UNITS_PER_WORD;
7450 /* C6 - C9. though the sign and zero extension semantics are
7451 handled elsewhere. This is the case where the argument fits
7452 entirely general registers. */
7453 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7455 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7457 /* C.8 if the argument has an alignment of 16 then the NGRN is
7458 rounded up to the next even number. */
7459 if (nregs == 2
7460 && ncrn % 2
7461 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7462 comparison is there because for > 16 * BITS_PER_UNIT
7463 alignment nregs should be > 2 and therefore it should be
7464 passed by reference rather than value. */
7465 && (aarch64_function_arg_alignment (mode, type, &abi_break)
7466 == 16 * BITS_PER_UNIT))
7468 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
7469 inform (input_location, "parameter passing for argument of type "
7470 "%qT changed in GCC 9.1", type);
7471 ++ncrn;
7472 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7475 /* If an argument with an SVE mode needs to be shifted up to the
7476 high part of the register, treat it as though it had an integer mode.
7477 Using the normal (parallel [...]) would suppress the shifting. */
7478 if (sve_p
7479 && BYTES_BIG_ENDIAN
7480 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7481 && aarch64_pad_reg_upward (mode, type, false))
7483 mode = int_mode_for_mode (mode).require ();
7484 sve_p = false;
7487 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7488 A reg is still generated for it, but the caller should be smart
7489 enough not to use it. */
7490 if (nregs == 0
7491 || (nregs == 1 && !sve_p)
7492 || GET_MODE_CLASS (mode) == MODE_INT)
7493 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7494 else
7496 rtx par;
7497 int i;
7499 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7500 for (i = 0; i < nregs; i++)
7502 scalar_int_mode reg_mode = word_mode;
7503 if (nregs == 1)
7504 reg_mode = int_mode_for_mode (mode).require ();
7505 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7506 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7507 GEN_INT (i * UNITS_PER_WORD));
7508 XVECEXP (par, 0, i) = tmp;
7510 pcum->aapcs_reg = par;
7513 pcum->aapcs_nextncrn = ncrn + nregs;
7514 return;
7517 /* C.11 */
7518 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7520 /* The argument is passed on stack; record the needed number of words for
7521 this argument and align the total size if necessary. */
7522 on_stack:
7523 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7525 if (aarch64_function_arg_alignment (mode, type, &abi_break)
7526 == 16 * BITS_PER_UNIT)
7528 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7529 if (pcum->aapcs_stack_size != new_size)
7531 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
7532 inform (input_location, "parameter passing for argument of type "
7533 "%qT changed in GCC 9.1", type);
7534 pcum->aapcs_stack_size = new_size;
7537 return;
7540 /* Implement TARGET_FUNCTION_ARG. */
7542 static rtx
7543 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7545 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7546 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7547 || pcum->pcs_variant == ARM_PCS_SIMD
7548 || pcum->pcs_variant == ARM_PCS_SVE);
7550 if (arg.end_marker_p ())
7551 return gen_int_mode (pcum->pcs_variant, DImode);
7553 aarch64_layout_arg (pcum_v, arg);
7554 return pcum->aapcs_reg;
7557 void
7558 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7559 const_tree fntype,
7560 rtx libname ATTRIBUTE_UNUSED,
7561 const_tree fndecl ATTRIBUTE_UNUSED,
7562 unsigned n_named ATTRIBUTE_UNUSED,
7563 bool silent_p)
7565 pcum->aapcs_ncrn = 0;
7566 pcum->aapcs_nvrn = 0;
7567 pcum->aapcs_nprn = 0;
7568 pcum->aapcs_nextncrn = 0;
7569 pcum->aapcs_nextnvrn = 0;
7570 pcum->aapcs_nextnprn = 0;
7571 if (fntype)
7572 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7573 else
7574 pcum->pcs_variant = ARM_PCS_AAPCS64;
7575 pcum->aapcs_reg = NULL_RTX;
7576 pcum->aapcs_arg_processed = false;
7577 pcum->aapcs_stack_words = 0;
7578 pcum->aapcs_stack_size = 0;
7579 pcum->silent_p = silent_p;
7581 if (!silent_p
7582 && !TARGET_FLOAT
7583 && fntype && fntype != error_mark_node)
7585 const_tree type = TREE_TYPE (fntype);
7586 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7587 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7588 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7589 &mode, &nregs, NULL, false))
7590 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7593 if (!silent_p
7594 && !TARGET_SVE
7595 && pcum->pcs_variant == ARM_PCS_SVE)
7597 /* We can't gracefully recover at this point, so make this a
7598 fatal error. */
7599 if (fndecl)
7600 fatal_error (input_location, "%qE requires the SVE ISA extension",
7601 fndecl);
7602 else
7603 fatal_error (input_location, "calls to functions of type %qT require"
7604 " the SVE ISA extension", fntype);
7608 static void
7609 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7610 const function_arg_info &arg)
7612 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7613 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7614 || pcum->pcs_variant == ARM_PCS_SIMD
7615 || pcum->pcs_variant == ARM_PCS_SVE)
7617 aarch64_layout_arg (pcum_v, arg);
7618 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7619 != (pcum->aapcs_stack_words != 0));
7620 pcum->aapcs_arg_processed = false;
7621 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7622 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7623 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7624 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7625 pcum->aapcs_stack_words = 0;
7626 pcum->aapcs_reg = NULL_RTX;
7630 bool
7631 aarch64_function_arg_regno_p (unsigned regno)
7633 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7634 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
7637 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7638 PARM_BOUNDARY bits of alignment, but will be given anything up
7639 to STACK_BOUNDARY bits if the type requires it. This makes sure
7640 that both before and after the layout of each argument, the Next
7641 Stacked Argument Address (NSAA) will have a minimum alignment of
7642 8 bytes. */
7644 static unsigned int
7645 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7647 unsigned int abi_break;
7648 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7649 &abi_break);
7650 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7651 if (abi_break & warn_psabi)
7653 abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
7654 if (alignment != abi_break)
7655 inform (input_location, "parameter passing for argument of type "
7656 "%qT changed in GCC 9.1", type);
7659 return alignment;
7662 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7664 static fixed_size_mode
7665 aarch64_get_reg_raw_mode (int regno)
7667 if (TARGET_SVE && FP_REGNUM_P (regno))
7668 /* Don't use the SVE part of the register for __builtin_apply and
7669 __builtin_return. The SVE registers aren't used by the normal PCS,
7670 so using them there would be a waste of time. The PCS extensions
7671 for SVE types are fundamentally incompatible with the
7672 __builtin_return/__builtin_apply interface. */
7673 return as_a <fixed_size_mode> (V16QImode);
7674 return default_get_reg_raw_mode (regno);
7677 /* Implement TARGET_FUNCTION_ARG_PADDING.
7679 Small aggregate types are placed in the lowest memory address.
7681 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7683 static pad_direction
7684 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7686 /* On little-endian targets, the least significant byte of every stack
7687 argument is passed at the lowest byte address of the stack slot. */
7688 if (!BYTES_BIG_ENDIAN)
7689 return PAD_UPWARD;
7691 /* Otherwise, integral, floating-point and pointer types are padded downward:
7692 the least significant byte of a stack argument is passed at the highest
7693 byte address of the stack slot. */
7694 if (type
7695 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7696 || POINTER_TYPE_P (type))
7697 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7698 return PAD_DOWNWARD;
7700 /* Everything else padded upward, i.e. data in first byte of stack slot. */
7701 return PAD_UPWARD;
7704 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7706 It specifies padding for the last (may also be the only)
7707 element of a block move between registers and memory. If
7708 assuming the block is in the memory, padding upward means that
7709 the last element is padded after its highest significant byte,
7710 while in downward padding, the last element is padded at the
7711 its least significant byte side.
7713 Small aggregates and small complex types are always padded
7714 upwards.
7716 We don't need to worry about homogeneous floating-point or
7717 short-vector aggregates; their move is not affected by the
7718 padding direction determined here. Regardless of endianness,
7719 each element of such an aggregate is put in the least
7720 significant bits of a fp/simd register.
7722 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7723 register has useful data, and return the opposite if the most
7724 significant byte does. */
7726 bool
7727 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7728 bool first ATTRIBUTE_UNUSED)
7731 /* Aside from pure scalable types, small composite types are always
7732 padded upward. */
7733 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7735 HOST_WIDE_INT size;
7736 if (type)
7737 size = int_size_in_bytes (type);
7738 else
7739 /* No frontends can create types with variable-sized modes, so we
7740 shouldn't be asked to pass or return them. */
7741 size = GET_MODE_SIZE (mode).to_constant ();
7742 if (size < 2 * UNITS_PER_WORD)
7744 pure_scalable_type_info pst_info;
7745 if (pst_info.analyze_registers (type))
7746 return false;
7747 return true;
7751 /* Otherwise, use the default padding. */
7752 return !BYTES_BIG_ENDIAN;
7755 static scalar_int_mode
7756 aarch64_libgcc_cmp_return_mode (void)
7758 return SImode;
7761 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7763 /* We use the 12-bit shifted immediate arithmetic instructions so values
7764 must be multiple of (1 << 12), i.e. 4096. */
7765 #define ARITH_FACTOR 4096
7767 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7768 #error Cannot use simple address calculation for stack probing
7769 #endif
7771 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7772 inclusive. These are offsets from the current stack pointer. */
7774 static void
7775 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7777 HOST_WIDE_INT size;
7778 if (!poly_size.is_constant (&size))
7780 sorry ("stack probes for SVE frames");
7781 return;
7784 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7786 /* See the same assertion on PROBE_INTERVAL above. */
7787 gcc_assert ((first % ARITH_FACTOR) == 0);
7789 /* See if we have a constant small number of probes to generate. If so,
7790 that's the easy case. */
7791 if (size <= PROBE_INTERVAL)
7793 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7795 emit_set_insn (reg1,
7796 plus_constant (Pmode,
7797 stack_pointer_rtx, -(first + base)));
7798 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7801 /* The run-time loop is made up of 8 insns in the generic case while the
7802 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
7803 else if (size <= 4 * PROBE_INTERVAL)
7805 HOST_WIDE_INT i, rem;
7807 emit_set_insn (reg1,
7808 plus_constant (Pmode,
7809 stack_pointer_rtx,
7810 -(first + PROBE_INTERVAL)));
7811 emit_stack_probe (reg1);
7813 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7814 it exceeds SIZE. If only two probes are needed, this will not
7815 generate any code. Then probe at FIRST + SIZE. */
7816 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7818 emit_set_insn (reg1,
7819 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7820 emit_stack_probe (reg1);
7823 rem = size - (i - PROBE_INTERVAL);
7824 if (rem > 256)
7826 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7828 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7829 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7831 else
7832 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7835 /* Otherwise, do the same as above, but in a loop. Note that we must be
7836 extra careful with variables wrapping around because we might be at
7837 the very top (or the very bottom) of the address space and we have
7838 to be able to handle this case properly; in particular, we use an
7839 equality test for the loop condition. */
7840 else
7842 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7844 /* Step 1: round SIZE to the previous multiple of the interval. */
7846 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7849 /* Step 2: compute initial and final value of the loop counter. */
7851 /* TEST_ADDR = SP + FIRST. */
7852 emit_set_insn (reg1,
7853 plus_constant (Pmode, stack_pointer_rtx, -first));
7855 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7856 HOST_WIDE_INT adjustment = - (first + rounded_size);
7857 if (! aarch64_uimm12_shift (adjustment))
7859 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7860 true, Pmode);
7861 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7863 else
7864 emit_set_insn (reg2,
7865 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7867 /* Step 3: the loop
7871 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7872 probe at TEST_ADDR
7874 while (TEST_ADDR != LAST_ADDR)
7876 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7877 until it is equal to ROUNDED_SIZE. */
7879 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7882 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7883 that SIZE is equal to ROUNDED_SIZE. */
7885 if (size != rounded_size)
7887 HOST_WIDE_INT rem = size - rounded_size;
7889 if (rem > 256)
7891 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7893 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7894 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7896 else
7897 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7901 /* Make sure nothing is scheduled before we are done. */
7902 emit_insn (gen_blockage ());
7905 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7906 absolute addresses. */
7908 const char *
7909 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7911 static int labelno = 0;
7912 char loop_lab[32];
7913 rtx xops[2];
7915 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7917 /* Loop. */
7918 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7920 HOST_WIDE_INT stack_clash_probe_interval
7921 = 1 << param_stack_clash_protection_guard_size;
7923 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7924 xops[0] = reg1;
7925 HOST_WIDE_INT interval;
7926 if (flag_stack_clash_protection)
7927 interval = stack_clash_probe_interval;
7928 else
7929 interval = PROBE_INTERVAL;
7931 gcc_assert (aarch64_uimm12_shift (interval));
7932 xops[1] = GEN_INT (interval);
7934 output_asm_insn ("sub\t%0, %0, %1", xops);
7936 /* If doing stack clash protection then we probe up by the ABI specified
7937 amount. We do this because we're dropping full pages at a time in the
7938 loop. But if we're doing non-stack clash probing, probe at SP 0. */
7939 if (flag_stack_clash_protection)
7940 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7941 else
7942 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7944 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7945 by this amount for each iteration. */
7946 output_asm_insn ("str\txzr, [%0, %1]", xops);
7948 /* Test if TEST_ADDR == LAST_ADDR. */
7949 xops[1] = reg2;
7950 output_asm_insn ("cmp\t%0, %1", xops);
7952 /* Branch. */
7953 fputs ("\tb.ne\t", asm_out_file);
7954 assemble_name_raw (asm_out_file, loop_lab);
7955 fputc ('\n', asm_out_file);
7957 return "";
7960 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7961 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7962 of GUARD_SIZE. When a probe is emitted it is done at most
7963 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7964 at most MIN_PROBE_THRESHOLD. By the end of this function
7965 BASE = BASE - ADJUSTMENT. */
7967 const char *
7968 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7969 rtx min_probe_threshold, rtx guard_size)
7971 /* This function is not allowed to use any instruction generation function
7972 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7973 so instead emit the code you want using output_asm_insn. */
7974 gcc_assert (flag_stack_clash_protection);
7975 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7976 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7978 /* The minimum required allocation before the residual requires probing. */
7979 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7981 /* Clamp the value down to the nearest value that can be used with a cmp. */
7982 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7983 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7985 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7986 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7988 static int labelno = 0;
7989 char loop_start_lab[32];
7990 char loop_end_lab[32];
7991 rtx xops[2];
7993 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7994 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7996 /* Emit loop start label. */
7997 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7999 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8000 xops[0] = adjustment;
8001 xops[1] = probe_offset_value_rtx;
8002 output_asm_insn ("cmp\t%0, %1", xops);
8004 /* Branch to end if not enough adjustment to probe. */
8005 fputs ("\tb.lt\t", asm_out_file);
8006 assemble_name_raw (asm_out_file, loop_end_lab);
8007 fputc ('\n', asm_out_file);
8009 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8010 xops[0] = base;
8011 xops[1] = probe_offset_value_rtx;
8012 output_asm_insn ("sub\t%0, %0, %1", xops);
8014 /* Probe at BASE. */
8015 xops[1] = const0_rtx;
8016 output_asm_insn ("str\txzr, [%0, %1]", xops);
8018 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8019 xops[0] = adjustment;
8020 xops[1] = probe_offset_value_rtx;
8021 output_asm_insn ("sub\t%0, %0, %1", xops);
8023 /* Branch to start if still more bytes to allocate. */
8024 fputs ("\tb\t", asm_out_file);
8025 assemble_name_raw (asm_out_file, loop_start_lab);
8026 fputc ('\n', asm_out_file);
8028 /* No probe leave. */
8029 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8031 /* BASE = BASE - ADJUSTMENT. */
8032 xops[0] = base;
8033 xops[1] = adjustment;
8034 output_asm_insn ("sub\t%0, %0, %1", xops);
8035 return "";
8038 /* Determine whether a frame chain needs to be generated. */
8039 static bool
8040 aarch64_needs_frame_chain (void)
8042 /* Force a frame chain for EH returns so the return address is at FP+8. */
8043 if (frame_pointer_needed || crtl->calls_eh_return)
8044 return true;
8046 /* A leaf function cannot have calls or write LR. */
8047 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8049 /* Don't use a frame chain in leaf functions if leaf frame pointers
8050 are disabled. */
8051 if (flag_omit_leaf_frame_pointer && is_leaf)
8052 return false;
8054 return aarch64_use_frame_pointer;
8057 /* Mark the registers that need to be saved by the callee and calculate
8058 the size of the callee-saved registers area and frame record (both FP
8059 and LR may be omitted). */
8060 static void
8061 aarch64_layout_frame (void)
8063 poly_int64 offset = 0;
8064 int regno, last_fp_reg = INVALID_REGNUM;
8065 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8066 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8067 bool frame_related_fp_reg_p = false;
8068 aarch64_frame &frame = cfun->machine->frame;
8070 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8072 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8073 the mid-end is doing. */
8074 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8076 #define SLOT_NOT_REQUIRED (-2)
8077 #define SLOT_REQUIRED (-1)
8079 frame.wb_push_candidate1 = INVALID_REGNUM;
8080 frame.wb_push_candidate2 = INVALID_REGNUM;
8081 frame.spare_pred_reg = INVALID_REGNUM;
8083 /* First mark all the registers that really need to be saved... */
8084 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8085 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8087 /* ... that includes the eh data registers (if needed)... */
8088 if (crtl->calls_eh_return)
8089 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8090 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8092 /* ... and any callee saved register that dataflow says is live. */
8093 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8094 if (df_regs_ever_live_p (regno)
8095 && !fixed_regs[regno]
8096 && (regno == R30_REGNUM
8097 || !crtl->abi->clobbers_full_reg_p (regno)))
8098 frame.reg_offset[regno] = SLOT_REQUIRED;
8100 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8101 if (df_regs_ever_live_p (regno)
8102 && !fixed_regs[regno]
8103 && !crtl->abi->clobbers_full_reg_p (regno))
8105 frame.reg_offset[regno] = SLOT_REQUIRED;
8106 last_fp_reg = regno;
8107 if (aarch64_emit_cfi_for_reg_p (regno))
8108 frame_related_fp_reg_p = true;
8111 /* Big-endian SVE frames need a spare predicate register in order
8112 to save Z8-Z15. Decide which register they should use. Prefer
8113 an unused argument register if possible, so that we don't force P4
8114 to be saved unnecessarily. */
8115 if (frame_related_fp_reg_p
8116 && crtl->abi->id () == ARM_PCS_SVE
8117 && BYTES_BIG_ENDIAN)
8119 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8120 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8121 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8122 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8123 break;
8124 gcc_assert (regno <= P7_REGNUM);
8125 frame.spare_pred_reg = regno;
8126 df_set_regs_ever_live (regno, true);
8129 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8130 if (df_regs_ever_live_p (regno)
8131 && !fixed_regs[regno]
8132 && !crtl->abi->clobbers_full_reg_p (regno))
8133 frame.reg_offset[regno] = SLOT_REQUIRED;
8135 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8136 LR counts as an implicit probe which allows us to maintain the invariant
8137 described in the comment at expand_prologue. */
8138 gcc_assert (crtl->is_leaf
8139 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8141 /* Now assign stack slots for the registers. Start with the predicate
8142 registers, since predicate LDR and STR have a relatively small
8143 offset range. These saves happen below the hard frame pointer. */
8144 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8145 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8147 frame.reg_offset[regno] = offset;
8148 offset += BYTES_PER_SVE_PRED;
8151 if (maybe_ne (offset, 0))
8153 /* If we have any vector registers to save above the predicate registers,
8154 the offset of the vector register save slots need to be a multiple
8155 of the vector size. This lets us use the immediate forms of LDR/STR
8156 (or LD1/ST1 for big-endian).
8158 A vector register is 8 times the size of a predicate register,
8159 and we need to save a maximum of 12 predicate registers, so the
8160 first vector register will be at either #1, MUL VL or #2, MUL VL.
8162 If we don't have any vector registers to save, and we know how
8163 big the predicate save area is, we can just round it up to the
8164 next 16-byte boundary. */
8165 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8166 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8167 else
8169 if (known_le (offset, vector_save_size))
8170 offset = vector_save_size;
8171 else if (known_le (offset, vector_save_size * 2))
8172 offset = vector_save_size * 2;
8173 else
8174 gcc_unreachable ();
8178 /* If we need to save any SVE vector registers, add them next. */
8179 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8180 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8181 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8183 frame.reg_offset[regno] = offset;
8184 offset += vector_save_size;
8187 /* OFFSET is now the offset of the hard frame pointer from the bottom
8188 of the callee save area. */
8189 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8190 frame.below_hard_fp_saved_regs_size = offset;
8191 if (frame.emit_frame_chain)
8193 /* FP and LR are placed in the linkage record. */
8194 frame.reg_offset[R29_REGNUM] = offset;
8195 frame.wb_push_candidate1 = R29_REGNUM;
8196 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
8197 frame.wb_push_candidate2 = R30_REGNUM;
8198 offset += 2 * UNITS_PER_WORD;
8201 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8202 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8204 frame.reg_offset[regno] = offset;
8205 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8206 frame.wb_push_candidate1 = regno;
8207 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8208 frame.wb_push_candidate2 = regno;
8209 offset += UNITS_PER_WORD;
8212 poly_int64 max_int_offset = offset;
8213 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8214 bool has_align_gap = maybe_ne (offset, max_int_offset);
8216 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8217 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8219 /* If there is an alignment gap between integer and fp callee-saves,
8220 allocate the last fp register to it if possible. */
8221 if (regno == last_fp_reg
8222 && has_align_gap
8223 && known_eq (vector_save_size, 8)
8224 && multiple_p (offset, 16))
8226 frame.reg_offset[regno] = max_int_offset;
8227 break;
8230 frame.reg_offset[regno] = offset;
8231 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8232 frame.wb_push_candidate1 = regno;
8233 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8234 && frame.wb_push_candidate1 >= V0_REGNUM)
8235 frame.wb_push_candidate2 = regno;
8236 offset += vector_save_size;
8239 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8241 frame.saved_regs_size = offset;
8243 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
8245 poly_int64 above_outgoing_args
8246 = aligned_upper_bound (varargs_and_saved_regs_size
8247 + get_frame_size (),
8248 STACK_BOUNDARY / BITS_PER_UNIT);
8250 frame.hard_fp_offset
8251 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8253 /* Both these values are already aligned. */
8254 gcc_assert (multiple_p (crtl->outgoing_args_size,
8255 STACK_BOUNDARY / BITS_PER_UNIT));
8256 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
8258 frame.locals_offset = frame.saved_varargs_size;
8260 frame.initial_adjust = 0;
8261 frame.final_adjust = 0;
8262 frame.callee_adjust = 0;
8263 frame.sve_callee_adjust = 0;
8264 frame.callee_offset = 0;
8266 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8267 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8269 /* Shadow call stack only deals with functions where the LR is pushed
8270 onto the stack and without specifying the "no_sanitize" attribute
8271 with the argument "shadow-call-stack". */
8272 frame.is_scs_enabled
8273 = (!crtl->calls_eh_return
8274 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8275 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8277 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8278 restore x30, and we don't need to pop x30 again in the traditional
8279 way. Pop candidates record the registers that need to be popped
8280 eventually. */
8281 if (frame.is_scs_enabled)
8283 if (frame.wb_pop_candidate2 == R30_REGNUM)
8284 frame.wb_pop_candidate2 = INVALID_REGNUM;
8285 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8286 frame.wb_pop_candidate1 = INVALID_REGNUM;
8289 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8290 256 to ensure that the offset meets the requirements of emit_move_insn.
8291 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8292 max_push_offset to 0, because no registers are popped at this time,
8293 so callee_adjust cannot be adjusted. */
8294 HOST_WIDE_INT max_push_offset = 0;
8295 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8296 max_push_offset = 512;
8297 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8298 max_push_offset = 256;
8300 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
8301 HOST_WIDE_INT const_saved_regs_size;
8302 if (frame.frame_size.is_constant (&const_size)
8303 && const_size < max_push_offset
8304 && known_eq (frame.hard_fp_offset, const_size))
8306 /* Simple, small frame with no outgoing arguments:
8308 stp reg1, reg2, [sp, -frame_size]!
8309 stp reg3, reg4, [sp, 16] */
8310 frame.callee_adjust = const_size;
8312 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
8313 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8314 && const_outgoing_args_size + const_saved_regs_size < 512
8315 /* We could handle this case even with outgoing args, provided
8316 that the number of args left us with valid offsets for all
8317 predicate and vector save slots. It's such a rare case that
8318 it hardly seems worth the effort though. */
8319 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
8320 && !(cfun->calls_alloca
8321 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8322 && const_fp_offset < max_push_offset))
8324 /* Frame with small outgoing arguments:
8326 sub sp, sp, frame_size
8327 stp reg1, reg2, [sp, outgoing_args_size]
8328 stp reg3, reg4, [sp, outgoing_args_size + 16] */
8329 frame.initial_adjust = frame.frame_size;
8330 frame.callee_offset = const_outgoing_args_size;
8332 else if (saves_below_hard_fp_p
8333 && known_eq (frame.saved_regs_size,
8334 frame.below_hard_fp_saved_regs_size))
8336 /* Frame in which all saves are SVE saves:
8338 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8339 save SVE registers relative to SP
8340 sub sp, sp, outgoing_args_size */
8341 frame.initial_adjust = (frame.hard_fp_offset
8342 + frame.below_hard_fp_saved_regs_size);
8343 frame.final_adjust = crtl->outgoing_args_size;
8345 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
8346 && const_fp_offset < max_push_offset)
8348 /* Frame with large outgoing arguments or SVE saves, but with
8349 a small local area:
8351 stp reg1, reg2, [sp, -hard_fp_offset]!
8352 stp reg3, reg4, [sp, 16]
8353 [sub sp, sp, below_hard_fp_saved_regs_size]
8354 [save SVE registers relative to SP]
8355 sub sp, sp, outgoing_args_size */
8356 frame.callee_adjust = const_fp_offset;
8357 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8358 frame.final_adjust = crtl->outgoing_args_size;
8360 else
8362 /* Frame with large local area and outgoing arguments or SVE saves,
8363 using frame pointer:
8365 sub sp, sp, hard_fp_offset
8366 stp x29, x30, [sp, 0]
8367 add x29, sp, 0
8368 stp reg3, reg4, [sp, 16]
8369 [sub sp, sp, below_hard_fp_saved_regs_size]
8370 [save SVE registers relative to SP]
8371 sub sp, sp, outgoing_args_size */
8372 frame.initial_adjust = frame.hard_fp_offset;
8373 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8374 frame.final_adjust = crtl->outgoing_args_size;
8377 /* Make sure the individual adjustments add up to the full frame size. */
8378 gcc_assert (known_eq (frame.initial_adjust
8379 + frame.callee_adjust
8380 + frame.sve_callee_adjust
8381 + frame.final_adjust, frame.frame_size));
8383 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8385 /* We've decided not to associate any register saves with the initial
8386 stack allocation. */
8387 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8388 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8391 frame.laid_out = true;
8394 /* Return true if the register REGNO is saved on entry to
8395 the current function. */
8397 static bool
8398 aarch64_register_saved_on_entry (int regno)
8400 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8403 /* Return the next register up from REGNO up to LIMIT for the callee
8404 to save. */
8406 static unsigned
8407 aarch64_next_callee_save (unsigned regno, unsigned limit)
8409 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8410 regno ++;
8411 return regno;
8414 /* Push the register number REGNO of mode MODE to the stack with write-back
8415 adjusting the stack by ADJUSTMENT. */
8417 static void
8418 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8419 HOST_WIDE_INT adjustment)
8421 rtx base_rtx = stack_pointer_rtx;
8422 rtx insn, reg, mem;
8424 reg = gen_rtx_REG (mode, regno);
8425 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8426 plus_constant (Pmode, base_rtx, -adjustment));
8427 mem = gen_frame_mem (mode, mem);
8429 insn = emit_move_insn (mem, reg);
8430 RTX_FRAME_RELATED_P (insn) = 1;
8433 /* Generate and return an instruction to store the pair of registers
8434 REG and REG2 of mode MODE to location BASE with write-back adjusting
8435 the stack location BASE by ADJUSTMENT. */
8437 static rtx
8438 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8439 HOST_WIDE_INT adjustment)
8441 switch (mode)
8443 case E_DImode:
8444 return gen_storewb_pairdi_di (base, base, reg, reg2,
8445 GEN_INT (-adjustment),
8446 GEN_INT (UNITS_PER_WORD - adjustment));
8447 case E_DFmode:
8448 return gen_storewb_pairdf_di (base, base, reg, reg2,
8449 GEN_INT (-adjustment),
8450 GEN_INT (UNITS_PER_WORD - adjustment));
8451 case E_TFmode:
8452 return gen_storewb_pairtf_di (base, base, reg, reg2,
8453 GEN_INT (-adjustment),
8454 GEN_INT (UNITS_PER_VREG - adjustment));
8455 default:
8456 gcc_unreachable ();
8460 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8461 stack pointer by ADJUSTMENT. */
8463 static void
8464 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8466 rtx_insn *insn;
8467 machine_mode mode = aarch64_reg_save_mode (regno1);
8469 if (regno2 == INVALID_REGNUM)
8470 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8472 rtx reg1 = gen_rtx_REG (mode, regno1);
8473 rtx reg2 = gen_rtx_REG (mode, regno2);
8475 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8476 reg2, adjustment));
8477 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8478 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8479 RTX_FRAME_RELATED_P (insn) = 1;
8482 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
8483 adjusting it by ADJUSTMENT afterwards. */
8485 static rtx
8486 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8487 HOST_WIDE_INT adjustment)
8489 switch (mode)
8491 case E_DImode:
8492 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8493 GEN_INT (UNITS_PER_WORD));
8494 case E_DFmode:
8495 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8496 GEN_INT (UNITS_PER_WORD));
8497 case E_TFmode:
8498 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8499 GEN_INT (UNITS_PER_VREG));
8500 default:
8501 gcc_unreachable ();
8505 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8506 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8507 into CFI_OPS. */
8509 static void
8510 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8511 rtx *cfi_ops)
8513 machine_mode mode = aarch64_reg_save_mode (regno1);
8514 rtx reg1 = gen_rtx_REG (mode, regno1);
8516 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8518 if (regno2 == INVALID_REGNUM)
8520 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8521 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8522 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8524 else
8526 rtx reg2 = gen_rtx_REG (mode, regno2);
8527 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8528 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8529 reg2, adjustment));
8533 /* Generate and return a store pair instruction of mode MODE to store
8534 register REG1 to MEM1 and register REG2 to MEM2. */
8536 static rtx
8537 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8538 rtx reg2)
8540 switch (mode)
8542 case E_DImode:
8543 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8545 case E_DFmode:
8546 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8548 case E_TFmode:
8549 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8551 case E_V4SImode:
8552 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8554 case E_V16QImode:
8555 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8557 default:
8558 gcc_unreachable ();
8562 /* Generate and regurn a load pair isntruction of mode MODE to load register
8563 REG1 from MEM1 and register REG2 from MEM2. */
8565 static rtx
8566 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8567 rtx mem2)
8569 switch (mode)
8571 case E_DImode:
8572 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8574 case E_DFmode:
8575 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8577 case E_TFmode:
8578 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8580 case E_V4SImode:
8581 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8583 default:
8584 gcc_unreachable ();
8588 /* Return TRUE if return address signing should be enabled for the current
8589 function, otherwise return FALSE. */
8591 bool
8592 aarch64_return_address_signing_enabled (void)
8594 /* This function should only be called after frame laid out. */
8595 gcc_assert (cfun->machine->frame.laid_out);
8597 /* Turn return address signing off in any function that uses
8598 __builtin_eh_return. The address passed to __builtin_eh_return
8599 is not signed so either it has to be signed (with original sp)
8600 or the code path that uses it has to avoid authenticating it.
8601 Currently eh return introduces a return to anywhere gadget, no
8602 matter what we do here since it uses ret with user provided
8603 address. An ideal fix for that is to use indirect branch which
8604 can be protected with BTI j (to some extent). */
8605 if (crtl->calls_eh_return)
8606 return false;
8608 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8609 if its LR is pushed onto stack. */
8610 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
8611 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
8612 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8615 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8616 bool
8617 aarch64_bti_enabled (void)
8619 return (aarch64_enable_bti == 1);
8622 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8623 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8624 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8626 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8627 or LD1D address
8629 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
8630 if the variable isn't already nonnull
8632 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8633 Handle this case using a temporary base register that is suitable for
8634 all offsets in that range. Use ANCHOR_REG as this base register if it
8635 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8637 static inline void
8638 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8639 rtx &anchor_reg, poly_int64 &offset,
8640 rtx &ptrue)
8642 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8644 /* This is the maximum valid offset of the anchor from the base.
8645 Lower values would be valid too. */
8646 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8647 if (!anchor_reg)
8649 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8650 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8651 gen_int_mode (anchor_offset, Pmode)));
8653 base_rtx = anchor_reg;
8654 offset -= anchor_offset;
8656 if (!ptrue)
8658 int pred_reg = cfun->machine->frame.spare_pred_reg;
8659 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8660 CONSTM1_RTX (VNx16BImode));
8661 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
8665 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8666 is saved at BASE + OFFSET. */
8668 static void
8669 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8670 rtx base, poly_int64 offset)
8672 rtx mem = gen_frame_mem (GET_MODE (reg),
8673 plus_constant (Pmode, base, offset));
8674 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8677 /* Emit code to save the callee-saved registers from register number START
8678 to LIMIT to the stack at the location starting at offset START_OFFSET,
8679 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
8680 is true if the hard frame pointer has been set up. */
8682 static void
8683 aarch64_save_callee_saves (poly_int64 start_offset,
8684 unsigned start, unsigned limit, bool skip_wb,
8685 bool hard_fp_valid_p)
8687 rtx_insn *insn;
8688 unsigned regno;
8689 unsigned regno2;
8690 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8692 for (regno = aarch64_next_callee_save (start, limit);
8693 regno <= limit;
8694 regno = aarch64_next_callee_save (regno + 1, limit))
8696 rtx reg, mem;
8697 poly_int64 offset;
8698 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8700 if (skip_wb
8701 && (regno == cfun->machine->frame.wb_push_candidate1
8702 || regno == cfun->machine->frame.wb_push_candidate2))
8703 continue;
8705 if (cfun->machine->reg_is_wrapped_separately[regno])
8706 continue;
8708 machine_mode mode = aarch64_reg_save_mode (regno);
8709 reg = gen_rtx_REG (mode, regno);
8710 offset = start_offset + cfun->machine->frame.reg_offset[regno];
8711 rtx base_rtx = stack_pointer_rtx;
8712 poly_int64 sp_offset = offset;
8714 HOST_WIDE_INT const_offset;
8715 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8716 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8717 offset, ptrue);
8718 else if (GP_REGNUM_P (regno)
8719 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8721 gcc_assert (known_eq (start_offset, 0));
8722 poly_int64 fp_offset
8723 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8724 if (hard_fp_valid_p)
8725 base_rtx = hard_frame_pointer_rtx;
8726 else
8728 if (!anchor_reg)
8730 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8731 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8732 gen_int_mode (fp_offset, Pmode)));
8734 base_rtx = anchor_reg;
8736 offset -= fp_offset;
8738 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8739 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
8741 if (!aarch64_sve_mode_p (mode)
8742 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
8743 && !cfun->machine->reg_is_wrapped_separately[regno2]
8744 && known_eq (GET_MODE_SIZE (mode),
8745 cfun->machine->frame.reg_offset[regno2]
8746 - cfun->machine->frame.reg_offset[regno]))
8748 rtx reg2 = gen_rtx_REG (mode, regno2);
8749 rtx mem2;
8751 offset += GET_MODE_SIZE (mode);
8752 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8753 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
8754 reg2));
8756 /* The first part of a frame-related parallel insn is
8757 always assumed to be relevant to the frame
8758 calculations; subsequent parts, are only
8759 frame-related if explicitly marked. */
8760 if (aarch64_emit_cfi_for_reg_p (regno2))
8762 if (need_cfa_note_p)
8763 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
8764 sp_offset + GET_MODE_SIZE (mode));
8765 else
8766 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8769 regno = regno2;
8771 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8773 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
8774 need_cfa_note_p = true;
8776 else if (aarch64_sve_mode_p (mode))
8777 insn = emit_insn (gen_rtx_SET (mem, reg));
8778 else
8779 insn = emit_move_insn (mem, reg);
8781 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8782 if (frame_related_p && need_cfa_note_p)
8783 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
8787 /* Emit code to restore the callee registers from register number START
8788 up to and including LIMIT. Restore from the stack offset START_OFFSET,
8789 skipping any write-back candidates if SKIP_WB is true. Write the
8790 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
8792 static void
8793 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
8794 unsigned limit, bool skip_wb, rtx *cfi_ops)
8796 unsigned regno;
8797 unsigned regno2;
8798 poly_int64 offset;
8799 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8801 for (regno = aarch64_next_callee_save (start, limit);
8802 regno <= limit;
8803 regno = aarch64_next_callee_save (regno + 1, limit))
8805 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8806 if (cfun->machine->reg_is_wrapped_separately[regno])
8807 continue;
8809 rtx reg, mem;
8811 if (skip_wb
8812 && (regno == cfun->machine->frame.wb_pop_candidate1
8813 || regno == cfun->machine->frame.wb_pop_candidate2))
8814 continue;
8816 machine_mode mode = aarch64_reg_save_mode (regno);
8817 reg = gen_rtx_REG (mode, regno);
8818 offset = start_offset + cfun->machine->frame.reg_offset[regno];
8819 rtx base_rtx = stack_pointer_rtx;
8820 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8821 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8822 offset, ptrue);
8823 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8825 if (!aarch64_sve_mode_p (mode)
8826 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
8827 && !cfun->machine->reg_is_wrapped_separately[regno2]
8828 && known_eq (GET_MODE_SIZE (mode),
8829 cfun->machine->frame.reg_offset[regno2]
8830 - cfun->machine->frame.reg_offset[regno]))
8832 rtx reg2 = gen_rtx_REG (mode, regno2);
8833 rtx mem2;
8835 offset += GET_MODE_SIZE (mode);
8836 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8837 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8839 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8840 regno = regno2;
8842 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8843 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8844 else if (aarch64_sve_mode_p (mode))
8845 emit_insn (gen_rtx_SET (reg, mem));
8846 else
8847 emit_move_insn (reg, mem);
8848 if (frame_related_p)
8849 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8853 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8854 of MODE. */
8856 static inline bool
8857 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8859 HOST_WIDE_INT multiple;
8860 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8861 && IN_RANGE (multiple, -8, 7));
8864 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8865 of MODE. */
8867 static inline bool
8868 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8870 HOST_WIDE_INT multiple;
8871 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8872 && IN_RANGE (multiple, -32, 31));
8875 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8876 of MODE. */
8878 static inline bool
8879 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8881 HOST_WIDE_INT multiple;
8882 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8883 && IN_RANGE (multiple, 0, 63));
8886 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8887 of MODE. */
8889 bool
8890 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8892 HOST_WIDE_INT multiple;
8893 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8894 && IN_RANGE (multiple, -64, 63));
8897 /* Return true if OFFSET is a signed 9-bit value. */
8899 bool
8900 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8901 poly_int64 offset)
8903 HOST_WIDE_INT const_offset;
8904 return (offset.is_constant (&const_offset)
8905 && IN_RANGE (const_offset, -256, 255));
8908 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8909 of MODE. */
8911 static inline bool
8912 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8914 HOST_WIDE_INT multiple;
8915 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8916 && IN_RANGE (multiple, -256, 255));
8919 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8920 of MODE. */
8922 static inline bool
8923 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8925 HOST_WIDE_INT multiple;
8926 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8927 && IN_RANGE (multiple, 0, 4095));
8930 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8932 static sbitmap
8933 aarch64_get_separate_components (void)
8935 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8936 bitmap_clear (components);
8938 /* The registers we need saved to the frame. */
8939 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8940 if (aarch64_register_saved_on_entry (regno))
8942 /* Punt on saves and restores that use ST1D and LD1D. We could
8943 try to be smarter, but it would involve making sure that the
8944 spare predicate register itself is safe to use at the save
8945 and restore points. Also, when a frame pointer is being used,
8946 the slots are often out of reach of ST1D and LD1D anyway. */
8947 machine_mode mode = aarch64_reg_save_mode (regno);
8948 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8949 continue;
8951 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8953 /* If the register is saved in the first SVE save slot, we use
8954 it as a stack probe for -fstack-clash-protection. */
8955 if (flag_stack_clash_protection
8956 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
8957 && known_eq (offset, 0))
8958 continue;
8960 /* Get the offset relative to the register we'll use. */
8961 if (frame_pointer_needed)
8962 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8963 else
8964 offset += crtl->outgoing_args_size;
8966 /* Check that we can access the stack slot of the register with one
8967 direct load with no adjustments needed. */
8968 if (aarch64_sve_mode_p (mode)
8969 ? offset_9bit_signed_scaled_p (mode, offset)
8970 : offset_12bit_unsigned_scaled_p (mode, offset))
8971 bitmap_set_bit (components, regno);
8974 /* Don't mess with the hard frame pointer. */
8975 if (frame_pointer_needed)
8976 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8978 /* If the spare predicate register used by big-endian SVE code
8979 is call-preserved, it must be saved in the main prologue
8980 before any saves that use it. */
8981 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
8982 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
8984 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
8985 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
8986 /* If registers have been chosen to be stored/restored with
8987 writeback don't interfere with them to avoid having to output explicit
8988 stack adjustment instructions. */
8989 if (reg2 != INVALID_REGNUM)
8990 bitmap_clear_bit (components, reg2);
8991 if (reg1 != INVALID_REGNUM)
8992 bitmap_clear_bit (components, reg1);
8994 bitmap_clear_bit (components, LR_REGNUM);
8995 bitmap_clear_bit (components, SP_REGNUM);
8997 return components;
9000 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9002 static sbitmap
9003 aarch64_components_for_bb (basic_block bb)
9005 bitmap in = DF_LIVE_IN (bb);
9006 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9007 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9009 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9010 bitmap_clear (components);
9012 /* Clobbered registers don't generate values in any meaningful sense,
9013 since nothing after the clobber can rely on their value. And we can't
9014 say that partially-clobbered registers are unconditionally killed,
9015 because whether they're killed or not depends on the mode of the
9016 value they're holding. Thus partially call-clobbered registers
9017 appear in neither the kill set nor the gen set.
9019 Check manually for any calls that clobber more of a register than the
9020 current function can. */
9021 function_abi_aggregator callee_abis;
9022 rtx_insn *insn;
9023 FOR_BB_INSNS (bb, insn)
9024 if (CALL_P (insn))
9025 callee_abis.note_callee_abi (insn_callee_abi (insn));
9026 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9028 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9029 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9030 if (!fixed_regs[regno]
9031 && !crtl->abi->clobbers_full_reg_p (regno)
9032 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9033 || bitmap_bit_p (in, regno)
9034 || bitmap_bit_p (gen, regno)
9035 || bitmap_bit_p (kill, regno)))
9037 bitmap_set_bit (components, regno);
9039 /* If there is a callee-save at an adjacent offset, add it too
9040 to increase the use of LDP/STP. */
9041 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9042 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9044 if (regno2 <= LAST_SAVED_REGNUM)
9046 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9047 if (regno < regno2
9048 ? known_eq (offset + 8, offset2)
9049 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9050 bitmap_set_bit (components, regno2);
9054 return components;
9057 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9058 Nothing to do for aarch64. */
9060 static void
9061 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9065 /* Return the next set bit in BMP from START onwards. Return the total number
9066 of bits in BMP if no set bit is found at or after START. */
9068 static unsigned int
9069 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9071 unsigned int nbits = SBITMAP_SIZE (bmp);
9072 if (start == nbits)
9073 return start;
9075 gcc_assert (start < nbits);
9076 for (unsigned int i = start; i < nbits; i++)
9077 if (bitmap_bit_p (bmp, i))
9078 return i;
9080 return nbits;
9083 /* Do the work for aarch64_emit_prologue_components and
9084 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9085 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9086 for these components or the epilogue sequence. That is, it determines
9087 whether we should emit stores or loads and what kind of CFA notes to attach
9088 to the insns. Otherwise the logic for the two sequences is very
9089 similar. */
9091 static void
9092 aarch64_process_components (sbitmap components, bool prologue_p)
9094 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9095 ? HARD_FRAME_POINTER_REGNUM
9096 : STACK_POINTER_REGNUM);
9098 unsigned last_regno = SBITMAP_SIZE (components);
9099 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9100 rtx_insn *insn = NULL;
9102 while (regno != last_regno)
9104 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9105 machine_mode mode = aarch64_reg_save_mode (regno);
9107 rtx reg = gen_rtx_REG (mode, regno);
9108 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9109 if (frame_pointer_needed)
9110 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9111 else
9112 offset += crtl->outgoing_args_size;
9114 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9115 rtx mem = gen_frame_mem (mode, addr);
9117 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9118 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9119 /* No more registers to handle after REGNO.
9120 Emit a single save/restore and exit. */
9121 if (regno2 == last_regno)
9123 insn = emit_insn (set);
9124 if (frame_related_p)
9126 RTX_FRAME_RELATED_P (insn) = 1;
9127 if (prologue_p)
9128 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9129 else
9130 add_reg_note (insn, REG_CFA_RESTORE, reg);
9132 break;
9135 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9136 /* The next register is not of the same class or its offset is not
9137 mergeable with the current one into a pair. */
9138 if (aarch64_sve_mode_p (mode)
9139 || !satisfies_constraint_Ump (mem)
9140 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9141 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9142 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9143 GET_MODE_SIZE (mode)))
9145 insn = emit_insn (set);
9146 if (frame_related_p)
9148 RTX_FRAME_RELATED_P (insn) = 1;
9149 if (prologue_p)
9150 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9151 else
9152 add_reg_note (insn, REG_CFA_RESTORE, reg);
9155 regno = regno2;
9156 continue;
9159 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9161 /* REGNO2 can be saved/restored in a pair with REGNO. */
9162 rtx reg2 = gen_rtx_REG (mode, regno2);
9163 if (frame_pointer_needed)
9164 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9165 else
9166 offset2 += crtl->outgoing_args_size;
9167 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9168 rtx mem2 = gen_frame_mem (mode, addr2);
9169 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9170 : gen_rtx_SET (reg2, mem2);
9172 if (prologue_p)
9173 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9174 else
9175 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9177 if (frame_related_p || frame_related2_p)
9179 RTX_FRAME_RELATED_P (insn) = 1;
9180 if (prologue_p)
9182 if (frame_related_p)
9183 add_reg_note (insn, REG_CFA_OFFSET, set);
9184 if (frame_related2_p)
9185 add_reg_note (insn, REG_CFA_OFFSET, set2);
9187 else
9189 if (frame_related_p)
9190 add_reg_note (insn, REG_CFA_RESTORE, reg);
9191 if (frame_related2_p)
9192 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9196 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9200 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9202 static void
9203 aarch64_emit_prologue_components (sbitmap components)
9205 aarch64_process_components (components, true);
9208 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9210 static void
9211 aarch64_emit_epilogue_components (sbitmap components)
9213 aarch64_process_components (components, false);
9216 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9218 static void
9219 aarch64_set_handled_components (sbitmap components)
9221 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9222 if (bitmap_bit_p (components, regno))
9223 cfun->machine->reg_is_wrapped_separately[regno] = true;
9226 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9227 determining the probe offset for alloca. */
9229 static HOST_WIDE_INT
9230 aarch64_stack_clash_protection_alloca_probe_range (void)
9232 return STACK_CLASH_CALLER_GUARD;
9236 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9237 registers. If POLY_SIZE is not large enough to require a probe this function
9238 will only adjust the stack. When allocating the stack space
9239 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9240 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9241 arguments. If we are then we ensure that any allocation larger than the ABI
9242 defined buffer needs a probe so that the invariant of having a 1KB buffer is
9243 maintained.
9245 We emit barriers after each stack adjustment to prevent optimizations from
9246 breaking the invariant that we never drop the stack more than a page. This
9247 invariant is needed to make it easier to correctly handle asynchronous
9248 events, e.g. if we were to allow the stack to be dropped by more than a page
9249 and then have multiple probes up and we take a signal somewhere in between
9250 then the signal handler doesn't know the state of the stack and can make no
9251 assumptions about which pages have been probed. */
9253 static void
9254 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9255 poly_int64 poly_size,
9256 bool frame_related_p,
9257 bool final_adjustment_p)
9259 HOST_WIDE_INT guard_size
9260 = 1 << param_stack_clash_protection_guard_size;
9261 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9262 HOST_WIDE_INT min_probe_threshold
9263 = (final_adjustment_p
9264 ? guard_used_by_caller
9265 : guard_size - guard_used_by_caller);
9266 /* When doing the final adjustment for the outgoing arguments, take into
9267 account any unprobed space there is above the current SP. There are
9268 two cases:
9270 - When saving SVE registers below the hard frame pointer, we force
9271 the lowest save to take place in the prologue before doing the final
9272 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9273 This acts as a probe at SP, so there is no unprobed space.
9275 - When there are no SVE register saves, we use the store of the link
9276 register as a probe. We can't assume that LR was saved at position 0
9277 though, so treat any space below it as unprobed. */
9278 if (final_adjustment_p
9279 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9281 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9282 if (known_ge (lr_offset, 0))
9283 min_probe_threshold -= lr_offset.to_constant ();
9284 else
9285 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9288 poly_int64 frame_size = cfun->machine->frame.frame_size;
9290 /* We should always have a positive probe threshold. */
9291 gcc_assert (min_probe_threshold > 0);
9293 if (flag_stack_clash_protection && !final_adjustment_p)
9295 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9296 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9297 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9299 if (known_eq (frame_size, 0))
9301 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9303 else if (known_lt (initial_adjust + sve_callee_adjust,
9304 guard_size - guard_used_by_caller)
9305 && known_lt (final_adjust, guard_used_by_caller))
9307 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9311 /* If SIZE is not large enough to require probing, just adjust the stack and
9312 exit. */
9313 if (known_lt (poly_size, min_probe_threshold)
9314 || !flag_stack_clash_protection)
9316 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9317 return;
9320 HOST_WIDE_INT size;
9321 /* Handle the SVE non-constant case first. */
9322 if (!poly_size.is_constant (&size))
9324 if (dump_file)
9326 fprintf (dump_file, "Stack clash SVE prologue: ");
9327 print_dec (poly_size, dump_file);
9328 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9331 /* First calculate the amount of bytes we're actually spilling. */
9332 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9333 poly_size, temp1, temp2, false, true);
9335 rtx_insn *insn = get_last_insn ();
9337 if (frame_related_p)
9339 /* This is done to provide unwinding information for the stack
9340 adjustments we're about to do, however to prevent the optimizers
9341 from removing the R11 move and leaving the CFA note (which would be
9342 very wrong) we tie the old and new stack pointer together.
9343 The tie will expand to nothing but the optimizers will not touch
9344 the instruction. */
9345 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9346 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9347 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9349 /* We want the CFA independent of the stack pointer for the
9350 duration of the loop. */
9351 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9352 RTX_FRAME_RELATED_P (insn) = 1;
9355 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9356 rtx guard_const = gen_int_mode (guard_size, Pmode);
9358 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9359 stack_pointer_rtx, temp1,
9360 probe_const, guard_const));
9362 /* Now reset the CFA register if needed. */
9363 if (frame_related_p)
9365 add_reg_note (insn, REG_CFA_DEF_CFA,
9366 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9367 gen_int_mode (poly_size, Pmode)));
9368 RTX_FRAME_RELATED_P (insn) = 1;
9371 return;
9374 if (dump_file)
9375 fprintf (dump_file,
9376 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9377 " bytes, probing will be required.\n", size);
9379 /* Round size to the nearest multiple of guard_size, and calculate the
9380 residual as the difference between the original size and the rounded
9381 size. */
9382 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9383 HOST_WIDE_INT residual = size - rounded_size;
9385 /* We can handle a small number of allocations/probes inline. Otherwise
9386 punt to a loop. */
9387 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9389 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9391 aarch64_sub_sp (NULL, temp2, guard_size, true);
9392 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9393 guard_used_by_caller));
9394 emit_insn (gen_blockage ());
9396 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9398 else
9400 /* Compute the ending address. */
9401 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9402 temp1, NULL, false, true);
9403 rtx_insn *insn = get_last_insn ();
9405 /* For the initial allocation, we don't have a frame pointer
9406 set up, so we always need CFI notes. If we're doing the
9407 final allocation, then we may have a frame pointer, in which
9408 case it is the CFA, otherwise we need CFI notes.
9410 We can determine which allocation we are doing by looking at
9411 the value of FRAME_RELATED_P since the final allocations are not
9412 frame related. */
9413 if (frame_related_p)
9415 /* We want the CFA independent of the stack pointer for the
9416 duration of the loop. */
9417 add_reg_note (insn, REG_CFA_DEF_CFA,
9418 plus_constant (Pmode, temp1, rounded_size));
9419 RTX_FRAME_RELATED_P (insn) = 1;
9422 /* This allocates and probes the stack. Note that this re-uses some of
9423 the existing Ada stack protection code. However we are guaranteed not
9424 to enter the non loop or residual branches of that code.
9426 The non-loop part won't be entered because if our allocation amount
9427 doesn't require a loop, the case above would handle it.
9429 The residual amount won't be entered because TEMP1 is a mutliple of
9430 the allocation size. The residual will always be 0. As such, the only
9431 part we are actually using from that code is the loop setup. The
9432 actual probing is done in aarch64_output_probe_stack_range. */
9433 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9434 stack_pointer_rtx, temp1));
9436 /* Now reset the CFA register if needed. */
9437 if (frame_related_p)
9439 add_reg_note (insn, REG_CFA_DEF_CFA,
9440 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9441 RTX_FRAME_RELATED_P (insn) = 1;
9444 emit_insn (gen_blockage ());
9445 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9448 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9449 be probed. This maintains the requirement that each page is probed at
9450 least once. For initial probing we probe only if the allocation is
9451 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9452 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9453 GUARD_SIZE. This works that for any allocation that is large enough to
9454 trigger a probe here, we'll have at least one, and if they're not large
9455 enough for this code to emit anything for them, The page would have been
9456 probed by the saving of FP/LR either by this function or any callees. If
9457 we don't have any callees then we won't have more stack adjustments and so
9458 are still safe. */
9459 if (residual)
9461 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9462 /* If we're doing final adjustments, and we've done any full page
9463 allocations then any residual needs to be probed. */
9464 if (final_adjustment_p && rounded_size != 0)
9465 min_probe_threshold = 0;
9466 /* If doing a small final adjustment, we always probe at offset 0.
9467 This is done to avoid issues when LR is not at position 0 or when
9468 the final adjustment is smaller than the probing offset. */
9469 else if (final_adjustment_p && rounded_size == 0)
9470 residual_probe_offset = 0;
9472 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9473 if (residual >= min_probe_threshold)
9475 if (dump_file)
9476 fprintf (dump_file,
9477 "Stack clash AArch64 prologue residuals: "
9478 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9479 "\n", residual);
9481 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9482 residual_probe_offset));
9483 emit_insn (gen_blockage ());
9488 /* Return 1 if the register is used by the epilogue. We need to say the
9489 return register is used, but only after epilogue generation is complete.
9490 Note that in the case of sibcalls, the values "used by the epilogue" are
9491 considered live at the start of the called function.
9493 For SIMD functions we need to return 1 for FP registers that are saved and
9494 restored by a function but are not zero in call_used_regs. If we do not do
9495 this optimizations may remove the restore of the register. */
9498 aarch64_epilogue_uses (int regno)
9500 if (epilogue_completed)
9502 if (regno == LR_REGNUM)
9503 return 1;
9505 return 0;
9508 /* AArch64 stack frames generated by this compiler look like:
9510 +-------------------------------+
9512 | incoming stack arguments |
9514 +-------------------------------+
9515 | | <-- incoming stack pointer (aligned)
9516 | callee-allocated save area |
9517 | for register varargs |
9519 +-------------------------------+
9520 | local variables | <-- frame_pointer_rtx
9522 +-------------------------------+
9523 | padding | \
9524 +-------------------------------+ |
9525 | callee-saved registers | | frame.saved_regs_size
9526 +-------------------------------+ |
9527 | LR' | |
9528 +-------------------------------+ |
9529 | FP' | |
9530 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9531 | SVE vector registers | | \
9532 +-------------------------------+ | | below_hard_fp_saved_regs_size
9533 | SVE predicate registers | / /
9534 +-------------------------------+
9535 | dynamic allocation |
9536 +-------------------------------+
9537 | padding |
9538 +-------------------------------+
9539 | outgoing stack arguments | <-- arg_pointer
9541 +-------------------------------+
9542 | | <-- stack_pointer_rtx (aligned)
9544 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9545 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9546 unchanged.
9548 By default for stack-clash we assume the guard is at least 64KB, but this
9549 value is configurable to either 4KB or 64KB. We also force the guard size to
9550 be the same as the probing interval and both values are kept in sync.
9552 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9553 on the guard size) of stack space without probing.
9555 When probing is needed, we emit a probe at the start of the prologue
9556 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9558 We have to track how much space has been allocated and the only stores
9559 to the stack we track as implicit probes are the FP/LR stores.
9561 For outgoing arguments we probe if the size is larger than 1KB, such that
9562 the ABI specified buffer is maintained for the next callee.
9564 The following registers are reserved during frame layout and should not be
9565 used for any other purpose:
9567 - r11: Used by stack clash protection when SVE is enabled, and also
9568 as an anchor register when saving and restoring registers
9569 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9570 - r14 and r15: Used for speculation tracking.
9571 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9572 - r30(LR), r29(FP): Used by standard frame layout.
9574 These registers must be avoided in frame layout related code unless the
9575 explicit intention is to interact with one of the features listed above. */
9577 /* Generate the prologue instructions for entry into a function.
9578 Establish the stack frame by decreasing the stack pointer with a
9579 properly calculated size and, if necessary, create a frame record
9580 filled with the values of LR and previous frame pointer. The
9581 current FP is also set up if it is in use. */
9583 void
9584 aarch64_expand_prologue (void)
9586 poly_int64 frame_size = cfun->machine->frame.frame_size;
9587 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9588 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9589 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9590 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9591 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9592 poly_int64 below_hard_fp_saved_regs_size
9593 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9594 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9595 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9596 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
9597 rtx_insn *insn;
9599 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
9601 /* Fold the SVE allocation into the initial allocation.
9602 We don't do this in aarch64_layout_arg to avoid pessimizing
9603 the epilogue code. */
9604 initial_adjust += sve_callee_adjust;
9605 sve_callee_adjust = 0;
9608 /* Sign return address for functions. */
9609 if (aarch64_return_address_signing_enabled ())
9611 switch (aarch64_ra_sign_key)
9613 case AARCH64_KEY_A:
9614 insn = emit_insn (gen_paciasp ());
9615 break;
9616 case AARCH64_KEY_B:
9617 insn = emit_insn (gen_pacibsp ());
9618 break;
9619 default:
9620 gcc_unreachable ();
9622 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9623 RTX_FRAME_RELATED_P (insn) = 1;
9626 /* Push return address to shadow call stack. */
9627 if (cfun->machine->frame.is_scs_enabled)
9628 emit_insn (gen_scs_push ());
9630 if (flag_stack_usage_info)
9631 current_function_static_stack_size = constant_lower_bound (frame_size);
9633 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9635 if (crtl->is_leaf && !cfun->calls_alloca)
9637 if (maybe_gt (frame_size, PROBE_INTERVAL)
9638 && maybe_gt (frame_size, get_stack_check_protect ()))
9639 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9640 (frame_size
9641 - get_stack_check_protect ()));
9643 else if (maybe_gt (frame_size, 0))
9644 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9647 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9648 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9650 /* In theory we should never have both an initial adjustment
9651 and a callee save adjustment. Verify that is the case since the
9652 code below does not handle it for -fstack-clash-protection. */
9653 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9655 /* Will only probe if the initial adjustment is larger than the guard
9656 less the amount of the guard reserved for use by the caller's
9657 outgoing args. */
9658 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9659 true, false);
9661 if (callee_adjust != 0)
9662 aarch64_push_regs (reg1, reg2, callee_adjust);
9664 /* The offset of the frame chain record (if any) from the current SP. */
9665 poly_int64 chain_offset = (initial_adjust + callee_adjust
9666 - cfun->machine->frame.hard_fp_offset);
9667 gcc_assert (known_ge (chain_offset, 0));
9669 /* The offset of the bottom of the save area from the current SP. */
9670 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
9672 if (emit_frame_chain)
9674 if (callee_adjust == 0)
9676 reg1 = R29_REGNUM;
9677 reg2 = R30_REGNUM;
9678 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
9679 false, false);
9681 else
9682 gcc_assert (known_eq (chain_offset, 0));
9683 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9684 stack_pointer_rtx, chain_offset,
9685 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
9686 if (frame_pointer_needed && !frame_size.is_constant ())
9688 /* Variable-sized frames need to describe the save slot
9689 address using DW_CFA_expression rather than DW_CFA_offset.
9690 This means that, without taking further action, the
9691 locations of the registers that we've already saved would
9692 remain based on the stack pointer even after we redefine
9693 the CFA based on the frame pointer. We therefore need new
9694 DW_CFA_expressions to re-express the save slots with addresses
9695 based on the frame pointer. */
9696 rtx_insn *insn = get_last_insn ();
9697 gcc_assert (RTX_FRAME_RELATED_P (insn));
9699 /* Add an explicit CFA definition if this was previously
9700 implicit. */
9701 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9703 rtx src = plus_constant (Pmode, stack_pointer_rtx,
9704 callee_offset);
9705 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9706 gen_rtx_SET (hard_frame_pointer_rtx, src));
9709 /* Change the save slot expressions for the registers that
9710 we've already saved. */
9711 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9712 hard_frame_pointer_rtx, UNITS_PER_WORD);
9713 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9714 hard_frame_pointer_rtx, 0);
9716 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
9719 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
9720 callee_adjust != 0 || emit_frame_chain,
9721 emit_frame_chain);
9722 if (maybe_ne (sve_callee_adjust, 0))
9724 gcc_assert (!flag_stack_clash_protection
9725 || known_eq (initial_adjust, 0));
9726 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9727 sve_callee_adjust,
9728 !frame_pointer_needed, false);
9729 saved_regs_offset += sve_callee_adjust;
9731 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
9732 false, emit_frame_chain);
9733 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
9734 callee_adjust != 0 || emit_frame_chain,
9735 emit_frame_chain);
9737 /* We may need to probe the final adjustment if it is larger than the guard
9738 that is assumed by the called. */
9739 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9740 !frame_pointer_needed, true);
9743 /* Return TRUE if we can use a simple_return insn.
9745 This function checks whether the callee saved stack is empty, which
9746 means no restore actions are need. The pro_and_epilogue will use
9747 this to check whether shrink-wrapping opt is feasible. */
9749 bool
9750 aarch64_use_return_insn_p (void)
9752 if (!reload_completed)
9753 return false;
9755 if (crtl->profile)
9756 return false;
9758 return known_eq (cfun->machine->frame.frame_size, 0);
9761 /* Generate the epilogue instructions for returning from a function.
9762 This is almost exactly the reverse of the prolog sequence, except
9763 that we need to insert barriers to avoid scheduling loads that read
9764 from a deallocated stack, and we optimize the unwind records by
9765 emitting them all together if possible. */
9766 void
9767 aarch64_expand_epilogue (bool for_sibcall)
9769 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9770 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9771 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9772 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9773 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9774 poly_int64 below_hard_fp_saved_regs_size
9775 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9776 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
9777 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
9778 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
9779 ? R29_REGNUM : R30_REGNUM);
9780 rtx cfi_ops = NULL;
9781 rtx_insn *insn;
9782 /* A stack clash protection prologue may not have left EP0_REGNUM or
9783 EP1_REGNUM in a usable state. The same is true for allocations
9784 with an SVE component, since we then need both temporary registers
9785 for each allocation. For stack clash we are in a usable state if
9786 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9787 HOST_WIDE_INT guard_size
9788 = 1 << param_stack_clash_protection_guard_size;
9789 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9791 /* We can re-use the registers when:
9793 (a) the deallocation amount is the same as the corresponding
9794 allocation amount (which is false if we combine the initial
9795 and SVE callee save allocations in the prologue); and
9797 (b) the allocation amount doesn't need a probe (which is false
9798 if the amount is guard_size - guard_used_by_caller or greater).
9800 In such situations the register should remain live with the correct
9801 value. */
9802 bool can_inherit_p = (initial_adjust.is_constant ()
9803 && final_adjust.is_constant ()
9804 && (!flag_stack_clash_protection
9805 || (known_lt (initial_adjust,
9806 guard_size - guard_used_by_caller)
9807 && known_eq (sve_callee_adjust, 0))));
9809 /* We need to add memory barrier to prevent read from deallocated stack. */
9810 bool need_barrier_p
9811 = maybe_ne (get_frame_size ()
9812 + cfun->machine->frame.saved_varargs_size, 0);
9814 /* Emit a barrier to prevent loads from a deallocated stack. */
9815 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9816 || cfun->calls_alloca
9817 || crtl->calls_eh_return)
9819 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9820 need_barrier_p = false;
9823 /* Restore the stack pointer from the frame pointer if it may not
9824 be the same as the stack pointer. */
9825 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9826 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9827 if (frame_pointer_needed
9828 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9829 /* If writeback is used when restoring callee-saves, the CFA
9830 is restored on the instruction doing the writeback. */
9831 aarch64_add_offset (Pmode, stack_pointer_rtx,
9832 hard_frame_pointer_rtx,
9833 -callee_offset - below_hard_fp_saved_regs_size,
9834 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
9835 else
9836 /* The case where we need to re-use the register here is very rare, so
9837 avoid the complicated condition and just always emit a move if the
9838 immediate doesn't fit. */
9839 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
9841 /* Restore the vector registers before the predicate registers,
9842 so that we can use P4 as a temporary for big-endian SVE frames. */
9843 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
9844 callee_adjust != 0, &cfi_ops);
9845 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
9846 false, &cfi_ops);
9847 if (maybe_ne (sve_callee_adjust, 0))
9848 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
9850 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9851 restore x30, we don't need to restore x30 again in the traditional
9852 way. */
9853 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
9854 R0_REGNUM, last_gpr,
9855 callee_adjust != 0, &cfi_ops);
9857 if (need_barrier_p)
9858 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9860 if (callee_adjust != 0)
9861 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9863 /* If we have no register restore information, the CFA must have been
9864 defined in terms of the stack pointer since the end of the prologue. */
9865 gcc_assert (cfi_ops || !frame_pointer_needed);
9867 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9869 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9870 insn = get_last_insn ();
9871 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9872 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9873 RTX_FRAME_RELATED_P (insn) = 1;
9874 cfi_ops = NULL;
9877 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
9878 add restriction on emit_move optimization to leaf functions. */
9879 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
9880 (!can_inherit_p || !crtl->is_leaf
9881 || df_regs_ever_live_p (EP0_REGNUM)));
9883 if (cfi_ops)
9885 /* Emit delayed restores and reset the CFA to be SP. */
9886 insn = get_last_insn ();
9887 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9888 REG_NOTES (insn) = cfi_ops;
9889 RTX_FRAME_RELATED_P (insn) = 1;
9892 /* Pop return address from shadow call stack. */
9893 if (cfun->machine->frame.is_scs_enabled)
9895 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
9896 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
9898 insn = emit_insn (gen_scs_pop ());
9899 add_reg_note (insn, REG_CFA_RESTORE, reg);
9900 RTX_FRAME_RELATED_P (insn) = 1;
9903 /* We prefer to emit the combined return/authenticate instruction RETAA,
9904 however there are three cases in which we must instead emit an explicit
9905 authentication instruction.
9907 1) Sibcalls don't return in a normal way, so if we're about to call one
9908 we must authenticate.
9910 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
9911 generating code for !TARGET_ARMV8_3 we can't use it and must
9912 explicitly authenticate.
9914 if (aarch64_return_address_signing_enabled ()
9915 && (for_sibcall || !TARGET_ARMV8_3))
9917 switch (aarch64_ra_sign_key)
9919 case AARCH64_KEY_A:
9920 insn = emit_insn (gen_autiasp ());
9921 break;
9922 case AARCH64_KEY_B:
9923 insn = emit_insn (gen_autibsp ());
9924 break;
9925 default:
9926 gcc_unreachable ();
9928 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9929 RTX_FRAME_RELATED_P (insn) = 1;
9932 /* Stack adjustment for exception handler. */
9933 if (crtl->calls_eh_return && !for_sibcall)
9935 /* We need to unwind the stack by the offset computed by
9936 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
9937 to be SP; letting the CFA move during this adjustment
9938 is just as correct as retaining the CFA from the body
9939 of the function. Therefore, do nothing special. */
9940 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
9943 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
9944 if (!for_sibcall)
9945 emit_jump_insn (ret_rtx);
9948 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
9949 normally or return to a previous frame after unwinding.
9951 An EH return uses a single shared return sequence. The epilogue is
9952 exactly like a normal epilogue except that it has an extra input
9953 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
9954 that must be applied after the frame has been destroyed. An extra label
9955 is inserted before the epilogue which initializes this register to zero,
9956 and this is the entry point for a normal return.
9958 An actual EH return updates the return address, initializes the stack
9959 adjustment and jumps directly into the epilogue (bypassing the zeroing
9960 of the adjustment). Since the return address is typically saved on the
9961 stack when a function makes a call, the saved LR must be updated outside
9962 the epilogue.
9964 This poses problems as the store is generated well before the epilogue,
9965 so the offset of LR is not known yet. Also optimizations will remove the
9966 store as it appears dead, even after the epilogue is generated (as the
9967 base or offset for loading LR is different in many cases).
9969 To avoid these problems this implementation forces the frame pointer
9970 in eh_return functions so that the location of LR is fixed and known early.
9971 It also marks the store volatile, so no optimization is permitted to
9972 remove the store. */
9974 aarch64_eh_return_handler_rtx (void)
9976 rtx tmp = gen_frame_mem (Pmode,
9977 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
9979 /* Mark the store volatile, so no optimization is permitted to remove it. */
9980 MEM_VOLATILE_P (tmp) = true;
9981 return tmp;
9984 /* Output code to add DELTA to the first argument, and then jump
9985 to FUNCTION. Used for C++ multiple inheritance. */
9986 static void
9987 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
9988 HOST_WIDE_INT delta,
9989 HOST_WIDE_INT vcall_offset,
9990 tree function)
9992 /* The this pointer is always in x0. Note that this differs from
9993 Arm where the this pointer maybe bumped to r1 if r0 is required
9994 to return a pointer to an aggregate. On AArch64 a result value
9995 pointer will be in x8. */
9996 int this_regno = R0_REGNUM;
9997 rtx this_rtx, temp0, temp1, addr, funexp;
9998 rtx_insn *insn;
9999 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10001 if (aarch64_bti_enabled ())
10002 emit_insn (gen_bti_c());
10004 reload_completed = 1;
10005 emit_note (NOTE_INSN_PROLOGUE_END);
10007 this_rtx = gen_rtx_REG (Pmode, this_regno);
10008 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10009 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10011 if (vcall_offset == 0)
10012 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10013 else
10015 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10017 addr = this_rtx;
10018 if (delta != 0)
10020 if (delta >= -256 && delta < 256)
10021 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10022 plus_constant (Pmode, this_rtx, delta));
10023 else
10024 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10025 temp1, temp0, false);
10028 if (Pmode == ptr_mode)
10029 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10030 else
10031 aarch64_emit_move (temp0,
10032 gen_rtx_ZERO_EXTEND (Pmode,
10033 gen_rtx_MEM (ptr_mode, addr)));
10035 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10036 addr = plus_constant (Pmode, temp0, vcall_offset);
10037 else
10039 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10040 Pmode);
10041 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10044 if (Pmode == ptr_mode)
10045 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10046 else
10047 aarch64_emit_move (temp1,
10048 gen_rtx_SIGN_EXTEND (Pmode,
10049 gen_rtx_MEM (ptr_mode, addr)));
10051 emit_insn (gen_add2_insn (this_rtx, temp1));
10054 /* Generate a tail call to the target function. */
10055 if (!TREE_USED (function))
10057 assemble_external (function);
10058 TREE_USED (function) = 1;
10060 funexp = XEXP (DECL_RTL (function), 0);
10061 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10062 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10063 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10064 SIBLING_CALL_P (insn) = 1;
10066 insn = get_insns ();
10067 shorten_branches (insn);
10069 assemble_start_function (thunk, fnname);
10070 final_start_function (insn, file, 1);
10071 final (insn, file, 1);
10072 final_end_function ();
10073 assemble_end_function (thunk, fnname);
10075 /* Stop pretending to be a post-reload pass. */
10076 reload_completed = 0;
10079 static bool
10080 aarch64_tls_referenced_p (rtx x)
10082 if (!TARGET_HAVE_TLS)
10083 return false;
10084 subrtx_iterator::array_type array;
10085 FOR_EACH_SUBRTX (iter, array, x, ALL)
10087 const_rtx x = *iter;
10088 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10089 return true;
10090 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10091 TLS offsets, not real symbol references. */
10092 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10093 iter.skip_subrtxes ();
10095 return false;
10099 /* Return true if val can be encoded as a 12-bit unsigned immediate with
10100 a left shift of 0 or 12 bits. */
10101 bool
10102 aarch64_uimm12_shift (HOST_WIDE_INT val)
10104 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
10105 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
10109 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
10110 that can be created with a left shift of 0 or 12. */
10111 static HOST_WIDE_INT
10112 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
10114 /* Check to see if the value fits in 24 bits, as that is the maximum we can
10115 handle correctly. */
10116 gcc_assert ((val & 0xffffff) == val);
10118 if (((val & 0xfff) << 0) == val)
10119 return val;
10121 return val & (0xfff << 12);
10124 /* Return true if val is an immediate that can be loaded into a
10125 register by a MOVZ instruction. */
10126 static bool
10127 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
10129 if (GET_MODE_SIZE (mode) > 4)
10131 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
10132 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
10133 return 1;
10135 else
10137 /* Ignore sign extension. */
10138 val &= (HOST_WIDE_INT) 0xffffffff;
10140 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
10141 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
10144 /* Test whether:
10146 X = (X & AND_VAL) | IOR_VAL;
10148 can be implemented using:
10150 MOVK X, #(IOR_VAL >> shift), LSL #shift
10152 Return the shift if so, otherwise return -1. */
10154 aarch64_movk_shift (const wide_int_ref &and_val,
10155 const wide_int_ref &ior_val)
10157 unsigned int precision = and_val.get_precision ();
10158 unsigned HOST_WIDE_INT mask = 0xffff;
10159 for (unsigned int shift = 0; shift < precision; shift += 16)
10161 if (and_val == ~mask && (ior_val & mask) == ior_val)
10162 return shift;
10163 mask <<= 16;
10165 return -1;
10168 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
10169 64-bit (DImode) integer. */
10171 static unsigned HOST_WIDE_INT
10172 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
10174 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
10175 while (size < 64)
10177 val &= (HOST_WIDE_INT_1U << size) - 1;
10178 val |= val << size;
10179 size *= 2;
10181 return val;
10184 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
10186 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
10188 0x0000000100000001ull,
10189 0x0001000100010001ull,
10190 0x0101010101010101ull,
10191 0x1111111111111111ull,
10192 0x5555555555555555ull,
10196 /* Return true if val is a valid bitmask immediate. */
10198 bool
10199 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
10201 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
10202 int bits;
10204 /* Check for a single sequence of one bits and return quickly if so.
10205 The special cases of all ones and all zeroes returns false. */
10206 val = aarch64_replicate_bitmask_imm (val_in, mode);
10207 tmp = val + (val & -val);
10209 if (tmp == (tmp & -tmp))
10210 return (val + 1) > 1;
10212 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
10213 if (mode == SImode)
10214 val = (val << 32) | (val & 0xffffffff);
10216 /* Invert if the immediate doesn't start with a zero bit - this means we
10217 only need to search for sequences of one bits. */
10218 if (val & 1)
10219 val = ~val;
10221 /* Find the first set bit and set tmp to val with the first sequence of one
10222 bits removed. Return success if there is a single sequence of ones. */
10223 first_one = val & -val;
10224 tmp = val & (val + first_one);
10226 if (tmp == 0)
10227 return true;
10229 /* Find the next set bit and compute the difference in bit position. */
10230 next_one = tmp & -tmp;
10231 bits = clz_hwi (first_one) - clz_hwi (next_one);
10232 mask = val ^ tmp;
10234 /* Check the bit position difference is a power of 2, and that the first
10235 sequence of one bits fits within 'bits' bits. */
10236 if ((mask >> bits) != 0 || bits != (bits & -bits))
10237 return false;
10239 /* Check the sequence of one bits is repeated 64/bits times. */
10240 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
10243 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
10244 Assumed precondition: VAL_IN Is not zero. */
10246 unsigned HOST_WIDE_INT
10247 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
10249 int lowest_bit_set = ctz_hwi (val_in);
10250 int highest_bit_set = floor_log2 (val_in);
10251 gcc_assert (val_in != 0);
10253 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
10254 (HOST_WIDE_INT_1U << lowest_bit_set));
10257 /* Create constant where bits outside of lowest bit set to highest bit set
10258 are set to 1. */
10260 unsigned HOST_WIDE_INT
10261 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
10263 return val_in | ~aarch64_and_split_imm1 (val_in);
10266 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
10268 bool
10269 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
10271 scalar_int_mode int_mode;
10272 if (!is_a <scalar_int_mode> (mode, &int_mode))
10273 return false;
10275 if (aarch64_bitmask_imm (val_in, int_mode))
10276 return false;
10278 if (aarch64_move_imm (val_in, int_mode))
10279 return false;
10281 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
10283 return aarch64_bitmask_imm (imm2, int_mode);
10286 /* Return true if val is an immediate that can be loaded into a
10287 register in a single instruction. */
10288 bool
10289 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
10291 scalar_int_mode int_mode;
10292 if (!is_a <scalar_int_mode> (mode, &int_mode))
10293 return false;
10295 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
10296 return 1;
10297 return aarch64_bitmask_imm (val, int_mode);
10300 static bool
10301 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10303 if (GET_CODE (x) == HIGH)
10304 return true;
10306 /* There's no way to calculate VL-based values using relocations. */
10307 subrtx_iterator::array_type array;
10308 FOR_EACH_SUBRTX (iter, array, x, ALL)
10309 if (GET_CODE (*iter) == CONST_POLY_INT)
10310 return true;
10312 poly_int64 offset;
10313 rtx base = strip_offset_and_salt (x, &offset);
10314 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10316 /* We checked for POLY_INT_CST offsets above. */
10317 if (aarch64_classify_symbol (base, offset.to_constant ())
10318 != SYMBOL_FORCE_TO_MEM)
10319 return true;
10320 else
10321 /* Avoid generating a 64-bit relocation in ILP32; leave
10322 to aarch64_expand_mov_immediate to handle it properly. */
10323 return mode != ptr_mode;
10326 return aarch64_tls_referenced_p (x);
10329 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10330 The expansion for a table switch is quite expensive due to the number
10331 of instructions, the table lookup and hard to predict indirect jump.
10332 When optimizing for speed, and -O3 enabled, use the per-core tuning if
10333 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10334 performance. When optimizing for size, use 8 for smallest codesize. */
10336 static unsigned int
10337 aarch64_case_values_threshold (void)
10339 /* Use the specified limit for the number of cases before using jump
10340 tables at higher optimization levels. */
10341 if (optimize > 2
10342 && aarch64_tune_params.max_case_values != 0)
10343 return aarch64_tune_params.max_case_values;
10344 else
10345 return optimize_size ? 8 : 11;
10348 /* Return true if register REGNO is a valid index register.
10349 STRICT_P is true if REG_OK_STRICT is in effect. */
10351 bool
10352 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10354 if (!HARD_REGISTER_NUM_P (regno))
10356 if (!strict_p)
10357 return true;
10359 if (!reg_renumber)
10360 return false;
10362 regno = reg_renumber[regno];
10364 return GP_REGNUM_P (regno);
10367 /* Return true if register REGNO is a valid base register for mode MODE.
10368 STRICT_P is true if REG_OK_STRICT is in effect. */
10370 bool
10371 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10373 if (!HARD_REGISTER_NUM_P (regno))
10375 if (!strict_p)
10376 return true;
10378 if (!reg_renumber)
10379 return false;
10381 regno = reg_renumber[regno];
10384 /* The fake registers will be eliminated to either the stack or
10385 hard frame pointer, both of which are usually valid base registers.
10386 Reload deals with the cases where the eliminated form isn't valid. */
10387 return (GP_REGNUM_P (regno)
10388 || regno == SP_REGNUM
10389 || regno == FRAME_POINTER_REGNUM
10390 || regno == ARG_POINTER_REGNUM);
10393 /* Return true if X is a valid base register for mode MODE.
10394 STRICT_P is true if REG_OK_STRICT is in effect. */
10396 static bool
10397 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10399 if (!strict_p
10400 && SUBREG_P (x)
10401 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10402 x = SUBREG_REG (x);
10404 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10407 /* Return true if address offset is a valid index. If it is, fill in INFO
10408 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10410 static bool
10411 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10412 machine_mode mode, bool strict_p)
10414 enum aarch64_address_type type;
10415 rtx index;
10416 int shift;
10418 /* (reg:P) */
10419 if ((REG_P (x) || SUBREG_P (x))
10420 && GET_MODE (x) == Pmode)
10422 type = ADDRESS_REG_REG;
10423 index = x;
10424 shift = 0;
10426 /* (sign_extend:DI (reg:SI)) */
10427 else if ((GET_CODE (x) == SIGN_EXTEND
10428 || GET_CODE (x) == ZERO_EXTEND)
10429 && GET_MODE (x) == DImode
10430 && GET_MODE (XEXP (x, 0)) == SImode)
10432 type = (GET_CODE (x) == SIGN_EXTEND)
10433 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10434 index = XEXP (x, 0);
10435 shift = 0;
10437 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10438 else if (GET_CODE (x) == MULT
10439 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10440 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10441 && GET_MODE (XEXP (x, 0)) == DImode
10442 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10443 && CONST_INT_P (XEXP (x, 1)))
10445 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10446 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10447 index = XEXP (XEXP (x, 0), 0);
10448 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10450 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10451 else if (GET_CODE (x) == ASHIFT
10452 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10453 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10454 && GET_MODE (XEXP (x, 0)) == DImode
10455 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10456 && CONST_INT_P (XEXP (x, 1)))
10458 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10459 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10460 index = XEXP (XEXP (x, 0), 0);
10461 shift = INTVAL (XEXP (x, 1));
10463 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10464 (const_int 0xffffffff<<shift)) */
10465 else if (GET_CODE (x) == AND
10466 && GET_MODE (x) == DImode
10467 && GET_CODE (XEXP (x, 0)) == MULT
10468 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10469 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10470 && CONST_INT_P (XEXP (x, 1)))
10472 type = ADDRESS_REG_UXTW;
10473 index = XEXP (XEXP (x, 0), 0);
10474 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10475 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10476 shift = -1;
10478 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10479 (const_int 0xffffffff<<shift)) */
10480 else if (GET_CODE (x) == AND
10481 && GET_MODE (x) == DImode
10482 && GET_CODE (XEXP (x, 0)) == ASHIFT
10483 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10484 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10485 && CONST_INT_P (XEXP (x, 1)))
10487 type = ADDRESS_REG_UXTW;
10488 index = XEXP (XEXP (x, 0), 0);
10489 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10490 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10491 shift = -1;
10493 /* (mult:P (reg:P) (const_int scale)) */
10494 else if (GET_CODE (x) == MULT
10495 && GET_MODE (x) == Pmode
10496 && GET_MODE (XEXP (x, 0)) == Pmode
10497 && CONST_INT_P (XEXP (x, 1)))
10499 type = ADDRESS_REG_REG;
10500 index = XEXP (x, 0);
10501 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10503 /* (ashift:P (reg:P) (const_int shift)) */
10504 else if (GET_CODE (x) == ASHIFT
10505 && GET_MODE (x) == Pmode
10506 && GET_MODE (XEXP (x, 0)) == Pmode
10507 && CONST_INT_P (XEXP (x, 1)))
10509 type = ADDRESS_REG_REG;
10510 index = XEXP (x, 0);
10511 shift = INTVAL (XEXP (x, 1));
10513 else
10514 return false;
10516 if (!strict_p
10517 && SUBREG_P (index)
10518 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10519 index = SUBREG_REG (index);
10521 if (aarch64_sve_data_mode_p (mode))
10523 if (type != ADDRESS_REG_REG
10524 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10525 return false;
10527 else
10529 if (shift != 0
10530 && !(IN_RANGE (shift, 1, 3)
10531 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10532 return false;
10535 if (REG_P (index)
10536 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10538 info->type = type;
10539 info->offset = index;
10540 info->shift = shift;
10541 return true;
10544 return false;
10547 /* Return true if MODE is one of the modes for which we
10548 support LDP/STP operations. */
10550 static bool
10551 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10553 return mode == SImode || mode == DImode
10554 || mode == SFmode || mode == DFmode
10555 || mode == SDmode || mode == DDmode
10556 || (aarch64_vector_mode_supported_p (mode)
10557 && (known_eq (GET_MODE_SIZE (mode), 8)
10558 || (known_eq (GET_MODE_SIZE (mode), 16)
10559 && (aarch64_tune_params.extra_tuning_flags
10560 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10563 /* Return true if REGNO is a virtual pointer register, or an eliminable
10564 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10565 include stack_pointer or hard_frame_pointer. */
10566 static bool
10567 virt_or_elim_regno_p (unsigned regno)
10569 return ((regno >= FIRST_VIRTUAL_REGISTER
10570 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10571 || regno == FRAME_POINTER_REGNUM
10572 || regno == ARG_POINTER_REGNUM);
10575 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10576 If it is, fill in INFO appropriately. STRICT_P is true if
10577 REG_OK_STRICT is in effect. */
10579 bool
10580 aarch64_classify_address (struct aarch64_address_info *info,
10581 rtx x, machine_mode mode, bool strict_p,
10582 aarch64_addr_query_type type)
10584 enum rtx_code code = GET_CODE (x);
10585 rtx op0, op1;
10586 poly_int64 offset;
10588 HOST_WIDE_INT const_size;
10590 /* Whether a vector mode is partial doesn't affect address legitimacy.
10591 Partial vectors like VNx8QImode allow the same indexed addressing
10592 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10593 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10594 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10595 vec_flags &= ~VEC_PARTIAL;
10597 /* On BE, we use load/store pair for all large int mode load/stores.
10598 TI/TF/TDmode may also use a load/store pair. */
10599 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10600 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10601 || type == ADDR_QUERY_LDP_STP_N
10602 || mode == TImode
10603 || mode == TFmode
10604 || mode == TDmode
10605 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
10606 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
10607 corresponds to the actual size of the memory being loaded/stored and the
10608 mode of the corresponding addressing mode is half of that. */
10609 if (type == ADDR_QUERY_LDP_STP_N)
10611 if (known_eq (GET_MODE_SIZE (mode), 16))
10612 mode = DFmode;
10613 else if (known_eq (GET_MODE_SIZE (mode), 8))
10614 mode = SFmode;
10615 else
10616 return false;
10619 bool allow_reg_index_p = (!load_store_pair_p
10620 && ((vec_flags == 0
10621 && known_lt (GET_MODE_SIZE (mode), 16))
10622 || vec_flags == VEC_ADVSIMD
10623 || vec_flags & VEC_SVE_DATA));
10625 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10626 The latter is not valid for SVE predicates, and that's rejected through
10627 allow_reg_index_p above. */
10628 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10629 && (code != REG && code != PLUS))
10630 return false;
10632 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10633 REG addressing. */
10634 if (advsimd_struct_p
10635 && !BYTES_BIG_ENDIAN
10636 && (code != POST_INC && code != REG))
10637 return false;
10639 gcc_checking_assert (GET_MODE (x) == VOIDmode
10640 || SCALAR_INT_MODE_P (GET_MODE (x)));
10642 switch (code)
10644 case REG:
10645 case SUBREG:
10646 info->type = ADDRESS_REG_IMM;
10647 info->base = x;
10648 info->offset = const0_rtx;
10649 info->const_offset = 0;
10650 return aarch64_base_register_rtx_p (x, strict_p);
10652 case PLUS:
10653 op0 = XEXP (x, 0);
10654 op1 = XEXP (x, 1);
10656 if (! strict_p
10657 && REG_P (op0)
10658 && virt_or_elim_regno_p (REGNO (op0))
10659 && poly_int_rtx_p (op1, &offset))
10661 info->type = ADDRESS_REG_IMM;
10662 info->base = op0;
10663 info->offset = op1;
10664 info->const_offset = offset;
10666 return true;
10669 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10670 && aarch64_base_register_rtx_p (op0, strict_p)
10671 && poly_int_rtx_p (op1, &offset))
10673 info->type = ADDRESS_REG_IMM;
10674 info->base = op0;
10675 info->offset = op1;
10676 info->const_offset = offset;
10678 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10679 registers and individual Q registers. The available
10680 address modes are:
10681 X,X: 7-bit signed scaled offset
10682 Q: 9-bit signed offset
10683 We conservatively require an offset representable in either mode.
10684 When performing the check for pairs of X registers i.e. LDP/STP
10685 pass down DImode since that is the natural size of the LDP/STP
10686 instruction memory accesses. */
10687 if (mode == TImode || mode == TFmode || mode == TDmode)
10688 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10689 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10690 || offset_12bit_unsigned_scaled_p (mode, offset)));
10692 if (mode == V8DImode)
10693 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10694 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10696 /* A 7bit offset check because OImode will emit a ldp/stp
10697 instruction (only big endian will get here).
10698 For ldp/stp instructions, the offset is scaled for the size of a
10699 single element of the pair. */
10700 if (aarch64_advsimd_partial_struct_mode_p (mode)
10701 && known_eq (GET_MODE_SIZE (mode), 16))
10702 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10703 if (aarch64_advsimd_full_struct_mode_p (mode)
10704 && known_eq (GET_MODE_SIZE (mode), 32))
10705 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10707 /* Three 9/12 bit offsets checks because CImode will emit three
10708 ldr/str instructions (only big endian will get here). */
10709 if (aarch64_advsimd_partial_struct_mode_p (mode)
10710 && known_eq (GET_MODE_SIZE (mode), 24))
10711 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10712 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10713 offset + 16)
10714 || offset_12bit_unsigned_scaled_p (DImode,
10715 offset + 16)));
10716 if (aarch64_advsimd_full_struct_mode_p (mode)
10717 && known_eq (GET_MODE_SIZE (mode), 48))
10718 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10719 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10720 offset + 32)
10721 || offset_12bit_unsigned_scaled_p (TImode,
10722 offset + 32)));
10724 /* Two 7bit offsets checks because XImode will emit two ldp/stp
10725 instructions (only big endian will get here). */
10726 if (aarch64_advsimd_partial_struct_mode_p (mode)
10727 && known_eq (GET_MODE_SIZE (mode), 32))
10728 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10729 && aarch64_offset_7bit_signed_scaled_p (DImode,
10730 offset + 16));
10731 if (aarch64_advsimd_full_struct_mode_p (mode)
10732 && known_eq (GET_MODE_SIZE (mode), 64))
10733 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10734 && aarch64_offset_7bit_signed_scaled_p (TImode,
10735 offset + 32));
10737 /* Make "m" use the LD1 offset range for SVE data modes, so
10738 that pre-RTL optimizers like ivopts will work to that
10739 instead of the wider LDR/STR range. */
10740 if (vec_flags == VEC_SVE_DATA)
10741 return (type == ADDR_QUERY_M
10742 ? offset_4bit_signed_scaled_p (mode, offset)
10743 : offset_9bit_signed_scaled_p (mode, offset));
10745 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10747 poly_int64 end_offset = (offset
10748 + GET_MODE_SIZE (mode)
10749 - BYTES_PER_SVE_VECTOR);
10750 return (type == ADDR_QUERY_M
10751 ? offset_4bit_signed_scaled_p (mode, offset)
10752 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10753 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10754 end_offset)));
10757 if (vec_flags == VEC_SVE_PRED)
10758 return offset_9bit_signed_scaled_p (mode, offset);
10760 if (load_store_pair_p)
10761 return ((known_eq (GET_MODE_SIZE (mode), 4)
10762 || known_eq (GET_MODE_SIZE (mode), 8)
10763 || known_eq (GET_MODE_SIZE (mode), 16))
10764 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10765 else
10766 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10767 || offset_12bit_unsigned_scaled_p (mode, offset));
10770 if (allow_reg_index_p)
10772 /* Look for base + (scaled/extended) index register. */
10773 if (aarch64_base_register_rtx_p (op0, strict_p)
10774 && aarch64_classify_index (info, op1, mode, strict_p))
10776 info->base = op0;
10777 return true;
10779 if (aarch64_base_register_rtx_p (op1, strict_p)
10780 && aarch64_classify_index (info, op0, mode, strict_p))
10782 info->base = op1;
10783 return true;
10787 return false;
10789 case POST_INC:
10790 case POST_DEC:
10791 case PRE_INC:
10792 case PRE_DEC:
10793 info->type = ADDRESS_REG_WB;
10794 info->base = XEXP (x, 0);
10795 info->offset = NULL_RTX;
10796 return aarch64_base_register_rtx_p (info->base, strict_p);
10798 case POST_MODIFY:
10799 case PRE_MODIFY:
10800 info->type = ADDRESS_REG_WB;
10801 info->base = XEXP (x, 0);
10802 if (GET_CODE (XEXP (x, 1)) == PLUS
10803 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10804 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10805 && aarch64_base_register_rtx_p (info->base, strict_p))
10807 info->offset = XEXP (XEXP (x, 1), 1);
10808 info->const_offset = offset;
10810 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10811 registers and individual Q registers. The available
10812 address modes are:
10813 X,X: 7-bit signed scaled offset
10814 Q: 9-bit signed offset
10815 We conservatively require an offset representable in either mode.
10817 if (mode == TImode || mode == TFmode || mode == TDmode)
10818 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10819 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10821 if (load_store_pair_p)
10822 return ((known_eq (GET_MODE_SIZE (mode), 4)
10823 || known_eq (GET_MODE_SIZE (mode), 8)
10824 || known_eq (GET_MODE_SIZE (mode), 16))
10825 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10826 else
10827 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10829 return false;
10831 case CONST:
10832 case SYMBOL_REF:
10833 case LABEL_REF:
10834 /* load literal: pc-relative constant pool entry. Only supported
10835 for SI mode or larger. */
10836 info->type = ADDRESS_SYMBOLIC;
10838 if (!load_store_pair_p
10839 && GET_MODE_SIZE (mode).is_constant (&const_size)
10840 && const_size >= 4)
10842 poly_int64 offset;
10843 rtx sym = strip_offset_and_salt (x, &offset);
10844 return ((LABEL_REF_P (sym)
10845 || (SYMBOL_REF_P (sym)
10846 && CONSTANT_POOL_ADDRESS_P (sym)
10847 && aarch64_pcrelative_literal_loads)));
10849 return false;
10851 case LO_SUM:
10852 info->type = ADDRESS_LO_SUM;
10853 info->base = XEXP (x, 0);
10854 info->offset = XEXP (x, 1);
10855 if (allow_reg_index_p
10856 && aarch64_base_register_rtx_p (info->base, strict_p))
10858 poly_int64 offset;
10859 HOST_WIDE_INT const_offset;
10860 rtx sym = strip_offset_and_salt (info->offset, &offset);
10861 if (SYMBOL_REF_P (sym)
10862 && offset.is_constant (&const_offset)
10863 && (aarch64_classify_symbol (sym, const_offset)
10864 == SYMBOL_SMALL_ABSOLUTE))
10866 /* The symbol and offset must be aligned to the access size. */
10867 unsigned int align;
10869 if (CONSTANT_POOL_ADDRESS_P (sym))
10870 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10871 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10873 tree exp = SYMBOL_REF_DECL (sym);
10874 align = TYPE_ALIGN (TREE_TYPE (exp));
10875 align = aarch64_constant_alignment (exp, align);
10877 else if (SYMBOL_REF_DECL (sym))
10878 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10879 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10880 && SYMBOL_REF_BLOCK (sym) != NULL)
10881 align = SYMBOL_REF_BLOCK (sym)->alignment;
10882 else
10883 align = BITS_PER_UNIT;
10885 poly_int64 ref_size = GET_MODE_SIZE (mode);
10886 if (known_eq (ref_size, 0))
10887 ref_size = GET_MODE_SIZE (DImode);
10889 return (multiple_p (const_offset, ref_size)
10890 && multiple_p (align / BITS_PER_UNIT, ref_size));
10893 return false;
10895 default:
10896 return false;
10900 /* Return true if the address X is valid for a PRFM instruction.
10901 STRICT_P is true if we should do strict checking with
10902 aarch64_classify_address. */
10904 bool
10905 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10907 struct aarch64_address_info addr;
10909 /* PRFM accepts the same addresses as DImode... */
10910 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10911 if (!res)
10912 return false;
10914 /* ... except writeback forms. */
10915 return addr.type != ADDRESS_REG_WB;
10918 bool
10919 aarch64_symbolic_address_p (rtx x)
10921 poly_int64 offset;
10922 x = strip_offset_and_salt (x, &offset);
10923 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10926 /* Classify the base of symbolic expression X. */
10928 enum aarch64_symbol_type
10929 aarch64_classify_symbolic_expression (rtx x)
10931 rtx offset;
10933 split_const (x, &x, &offset);
10934 return aarch64_classify_symbol (x, INTVAL (offset));
10938 /* Return TRUE if X is a legitimate address for accessing memory in
10939 mode MODE. */
10940 static bool
10941 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
10943 struct aarch64_address_info addr;
10945 return aarch64_classify_address (&addr, x, mode, strict_p);
10948 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10949 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10950 bool
10951 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10952 aarch64_addr_query_type type)
10954 struct aarch64_address_info addr;
10956 return aarch64_classify_address (&addr, x, mode, strict_p, type);
10959 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10961 static bool
10962 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10963 poly_int64 orig_offset,
10964 machine_mode mode)
10966 HOST_WIDE_INT size;
10967 if (GET_MODE_SIZE (mode).is_constant (&size))
10969 HOST_WIDE_INT const_offset, second_offset;
10971 /* A general SVE offset is A * VQ + B. Remove the A component from
10972 coefficient 0 in order to get the constant B. */
10973 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10975 /* Split an out-of-range address displacement into a base and
10976 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10977 range otherwise to increase opportunities for sharing the base
10978 address of different sizes. Unaligned accesses use the signed
10979 9-bit range, TImode/TFmode/TDmode use the intersection of signed
10980 scaled 7-bit and signed 9-bit offset. */
10981 if (mode == TImode || mode == TFmode || mode == TDmode)
10982 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10983 else if ((const_offset & (size - 1)) != 0)
10984 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10985 else
10986 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10988 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10989 return false;
10991 /* Split the offset into second_offset and the rest. */
10992 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10993 *offset2 = gen_int_mode (second_offset, Pmode);
10994 return true;
10996 else
10998 /* Get the mode we should use as the basis of the range. For structure
10999 modes this is the mode of one vector. */
11000 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11001 machine_mode step_mode
11002 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11004 /* Get the "mul vl" multiplier we'd like to use. */
11005 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11006 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11007 if (vec_flags & VEC_SVE_DATA)
11008 /* LDR supports a 9-bit range, but the move patterns for
11009 structure modes require all vectors to be in range of the
11010 same base. The simplest way of accomodating that while still
11011 promoting reuse of anchor points between different modes is
11012 to use an 8-bit range unconditionally. */
11013 vnum = ((vnum + 128) & 255) - 128;
11014 else
11015 /* Predicates are only handled singly, so we might as well use
11016 the full range. */
11017 vnum = ((vnum + 256) & 511) - 256;
11018 if (vnum == 0)
11019 return false;
11021 /* Convert the "mul vl" multiplier into a byte offset. */
11022 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11023 if (known_eq (second_offset, orig_offset))
11024 return false;
11026 /* Split the offset into second_offset and the rest. */
11027 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11028 *offset2 = gen_int_mode (second_offset, Pmode);
11029 return true;
11033 /* Return the binary representation of floating point constant VALUE in INTVAL.
11034 If the value cannot be converted, return false without setting INTVAL.
11035 The conversion is done in the given MODE. */
11036 bool
11037 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11040 /* We make a general exception for 0. */
11041 if (aarch64_float_const_zero_rtx_p (value))
11043 *intval = 0;
11044 return true;
11047 scalar_float_mode mode;
11048 if (!CONST_DOUBLE_P (value)
11049 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11050 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11051 /* Only support up to DF mode. */
11052 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11053 return false;
11055 unsigned HOST_WIDE_INT ival = 0;
11057 long res[2];
11058 real_to_target (res,
11059 CONST_DOUBLE_REAL_VALUE (value),
11060 REAL_MODE_FORMAT (mode));
11062 if (mode == DFmode || mode == DDmode)
11064 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11065 ival = zext_hwi (res[order], 32);
11066 ival |= (zext_hwi (res[1 - order], 32) << 32);
11068 else
11069 ival = zext_hwi (res[0], 32);
11071 *intval = ival;
11072 return true;
11075 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11076 single MOV(+MOVK) followed by an FMOV. */
11077 bool
11078 aarch64_float_const_rtx_p (rtx x)
11080 machine_mode mode = GET_MODE (x);
11081 if (mode == VOIDmode)
11082 return false;
11084 /* Determine whether it's cheaper to write float constants as
11085 mov/movk pairs over ldr/adrp pairs. */
11086 unsigned HOST_WIDE_INT ival;
11088 if (CONST_DOUBLE_P (x)
11089 && SCALAR_FLOAT_MODE_P (mode)
11090 && aarch64_reinterpret_float_as_int (x, &ival))
11092 scalar_int_mode imode = (mode == HFmode
11093 ? SImode
11094 : int_mode_for_mode (mode).require ());
11095 int num_instr = aarch64_internal_mov_immediate
11096 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11097 return num_instr < 3;
11100 return false;
11103 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11104 Floating Point). */
11105 bool
11106 aarch64_float_const_zero_rtx_p (rtx x)
11108 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11109 zr as our callers expect, so no need to check the actual
11110 value if X is of Decimal Floating Point type. */
11111 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11112 return false;
11114 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11115 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11116 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11119 /* Return TRUE if rtx X is immediate constant that fits in a single
11120 MOVI immediate operation. */
11121 bool
11122 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11124 if (!TARGET_SIMD)
11125 return false;
11127 machine_mode vmode;
11128 scalar_int_mode imode;
11129 unsigned HOST_WIDE_INT ival;
11131 if (CONST_DOUBLE_P (x)
11132 && SCALAR_FLOAT_MODE_P (mode))
11134 if (!aarch64_reinterpret_float_as_int (x, &ival))
11135 return false;
11137 /* We make a general exception for 0. */
11138 if (aarch64_float_const_zero_rtx_p (x))
11139 return true;
11141 imode = int_mode_for_mode (mode).require ();
11143 else if (CONST_INT_P (x)
11144 && is_a <scalar_int_mode> (mode, &imode))
11145 ival = INTVAL (x);
11146 else
11147 return false;
11149 /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
11150 a 128 bit vector mode. */
11151 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11153 vmode = aarch64_simd_container_mode (imode, width);
11154 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11156 return aarch64_simd_valid_immediate (v_op, NULL);
11160 /* Return the fixed registers used for condition codes. */
11162 static bool
11163 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11165 *p1 = CC_REGNUM;
11166 *p2 = INVALID_REGNUM;
11167 return true;
11170 /* This function is used by the call expanders of the machine description.
11171 RESULT is the register in which the result is returned. It's NULL for
11172 "call" and "sibcall".
11173 MEM is the location of the function call.
11174 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11175 SIBCALL indicates whether this function call is normal call or sibling call.
11176 It will generate different pattern accordingly. */
11178 void
11179 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11181 rtx call, callee, tmp;
11182 rtvec vec;
11183 machine_mode mode;
11185 gcc_assert (MEM_P (mem));
11186 callee = XEXP (mem, 0);
11187 mode = GET_MODE (callee);
11188 gcc_assert (mode == Pmode);
11190 /* Decide if we should generate indirect calls by loading the
11191 address of the callee into a register before performing
11192 the branch-and-link. */
11193 if (SYMBOL_REF_P (callee)
11194 ? (aarch64_is_long_call_p (callee)
11195 || aarch64_is_noplt_call_p (callee))
11196 : !REG_P (callee))
11197 XEXP (mem, 0) = force_reg (mode, callee);
11199 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11201 if (result != NULL_RTX)
11202 call = gen_rtx_SET (result, call);
11204 if (sibcall)
11205 tmp = ret_rtx;
11206 else
11207 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11209 gcc_assert (CONST_INT_P (callee_abi));
11210 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11211 UNSPEC_CALLEE_ABI);
11213 vec = gen_rtvec (3, call, callee_abi, tmp);
11214 call = gen_rtx_PARALLEL (VOIDmode, vec);
11216 aarch64_emit_call_insn (call);
11219 /* Emit call insn with PAT and do aarch64-specific handling. */
11221 void
11222 aarch64_emit_call_insn (rtx pat)
11224 rtx insn = emit_call_insn (pat);
11226 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11227 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11228 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11231 machine_mode
11232 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11234 machine_mode mode_x = GET_MODE (x);
11235 rtx_code code_x = GET_CODE (x);
11237 /* All floating point compares return CCFP if it is an equality
11238 comparison, and CCFPE otherwise. */
11239 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11241 switch (code)
11243 case EQ:
11244 case NE:
11245 case UNORDERED:
11246 case ORDERED:
11247 case UNLT:
11248 case UNLE:
11249 case UNGT:
11250 case UNGE:
11251 case UNEQ:
11252 return CCFPmode;
11254 case LT:
11255 case LE:
11256 case GT:
11257 case GE:
11258 case LTGT:
11259 return CCFPEmode;
11261 default:
11262 gcc_unreachable ();
11266 /* Equality comparisons of short modes against zero can be performed
11267 using the TST instruction with the appropriate bitmask. */
11268 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11269 && (code == EQ || code == NE)
11270 && (mode_x == HImode || mode_x == QImode))
11271 return CC_NZmode;
11273 /* Similarly, comparisons of zero_extends from shorter modes can
11274 be performed using an ANDS with an immediate mask. */
11275 if (y == const0_rtx && code_x == ZERO_EXTEND
11276 && (mode_x == SImode || mode_x == DImode)
11277 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11278 && (code == EQ || code == NE))
11279 return CC_NZmode;
11281 if ((mode_x == SImode || mode_x == DImode)
11282 && y == const0_rtx
11283 && (code == EQ || code == NE || code == LT || code == GE)
11284 && (code_x == PLUS || code_x == MINUS || code_x == AND
11285 || code_x == NEG
11286 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11287 && CONST_INT_P (XEXP (x, 2)))))
11288 return CC_NZmode;
11290 /* A compare with a shifted operand. Because of canonicalization,
11291 the comparison will have to be swapped when we emit the assembly
11292 code. */
11293 if ((mode_x == SImode || mode_x == DImode)
11294 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11295 && (code_x == ASHIFT || code_x == ASHIFTRT
11296 || code_x == LSHIFTRT
11297 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11298 return CC_SWPmode;
11300 /* Similarly for a negated operand, but we can only do this for
11301 equalities. */
11302 if ((mode_x == SImode || mode_x == DImode)
11303 && (REG_P (y) || SUBREG_P (y))
11304 && (code == EQ || code == NE)
11305 && code_x == NEG)
11306 return CC_Zmode;
11308 /* A test for unsigned overflow from an addition. */
11309 if ((mode_x == DImode || mode_x == TImode)
11310 && (code == LTU || code == GEU)
11311 && code_x == PLUS
11312 && rtx_equal_p (XEXP (x, 0), y))
11313 return CC_Cmode;
11315 /* A test for unsigned overflow from an add with carry. */
11316 if ((mode_x == DImode || mode_x == TImode)
11317 && (code == LTU || code == GEU)
11318 && code_x == PLUS
11319 && CONST_SCALAR_INT_P (y)
11320 && (rtx_mode_t (y, mode_x)
11321 == (wi::shwi (1, mode_x)
11322 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11323 return CC_ADCmode;
11325 /* A test for signed overflow. */
11326 if ((mode_x == DImode || mode_x == TImode)
11327 && code == NE
11328 && code_x == PLUS
11329 && GET_CODE (y) == SIGN_EXTEND)
11330 return CC_Vmode;
11332 /* For everything else, return CCmode. */
11333 return CCmode;
11336 static int
11337 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11340 aarch64_get_condition_code (rtx x)
11342 machine_mode mode = GET_MODE (XEXP (x, 0));
11343 enum rtx_code comp_code = GET_CODE (x);
11345 if (GET_MODE_CLASS (mode) != MODE_CC)
11346 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11347 return aarch64_get_condition_code_1 (mode, comp_code);
11350 static int
11351 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11353 switch (mode)
11355 case E_CCFPmode:
11356 case E_CCFPEmode:
11357 switch (comp_code)
11359 case GE: return AARCH64_GE;
11360 case GT: return AARCH64_GT;
11361 case LE: return AARCH64_LS;
11362 case LT: return AARCH64_MI;
11363 case NE: return AARCH64_NE;
11364 case EQ: return AARCH64_EQ;
11365 case ORDERED: return AARCH64_VC;
11366 case UNORDERED: return AARCH64_VS;
11367 case UNLT: return AARCH64_LT;
11368 case UNLE: return AARCH64_LE;
11369 case UNGT: return AARCH64_HI;
11370 case UNGE: return AARCH64_PL;
11371 default: return -1;
11373 break;
11375 case E_CCmode:
11376 switch (comp_code)
11378 case NE: return AARCH64_NE;
11379 case EQ: return AARCH64_EQ;
11380 case GE: return AARCH64_GE;
11381 case GT: return AARCH64_GT;
11382 case LE: return AARCH64_LE;
11383 case LT: return AARCH64_LT;
11384 case GEU: return AARCH64_CS;
11385 case GTU: return AARCH64_HI;
11386 case LEU: return AARCH64_LS;
11387 case LTU: return AARCH64_CC;
11388 default: return -1;
11390 break;
11392 case E_CC_SWPmode:
11393 switch (comp_code)
11395 case NE: return AARCH64_NE;
11396 case EQ: return AARCH64_EQ;
11397 case GE: return AARCH64_LE;
11398 case GT: return AARCH64_LT;
11399 case LE: return AARCH64_GE;
11400 case LT: return AARCH64_GT;
11401 case GEU: return AARCH64_LS;
11402 case GTU: return AARCH64_CC;
11403 case LEU: return AARCH64_CS;
11404 case LTU: return AARCH64_HI;
11405 default: return -1;
11407 break;
11409 case E_CC_NZCmode:
11410 switch (comp_code)
11412 case NE: return AARCH64_NE; /* = any */
11413 case EQ: return AARCH64_EQ; /* = none */
11414 case GE: return AARCH64_PL; /* = nfrst */
11415 case LT: return AARCH64_MI; /* = first */
11416 case GEU: return AARCH64_CS; /* = nlast */
11417 case GTU: return AARCH64_HI; /* = pmore */
11418 case LEU: return AARCH64_LS; /* = plast */
11419 case LTU: return AARCH64_CC; /* = last */
11420 default: return -1;
11422 break;
11424 case E_CC_NZmode:
11425 switch (comp_code)
11427 case NE: return AARCH64_NE;
11428 case EQ: return AARCH64_EQ;
11429 case GE: return AARCH64_PL;
11430 case LT: return AARCH64_MI;
11431 default: return -1;
11433 break;
11435 case E_CC_Zmode:
11436 switch (comp_code)
11438 case NE: return AARCH64_NE;
11439 case EQ: return AARCH64_EQ;
11440 default: return -1;
11442 break;
11444 case E_CC_Cmode:
11445 switch (comp_code)
11447 case LTU: return AARCH64_CS;
11448 case GEU: return AARCH64_CC;
11449 default: return -1;
11451 break;
11453 case E_CC_ADCmode:
11454 switch (comp_code)
11456 case GEU: return AARCH64_CS;
11457 case LTU: return AARCH64_CC;
11458 default: return -1;
11460 break;
11462 case E_CC_Vmode:
11463 switch (comp_code)
11465 case NE: return AARCH64_VS;
11466 case EQ: return AARCH64_VC;
11467 default: return -1;
11469 break;
11471 default:
11472 return -1;
11475 return -1;
11478 bool
11479 aarch64_const_vec_all_same_in_range_p (rtx x,
11480 HOST_WIDE_INT minval,
11481 HOST_WIDE_INT maxval)
11483 rtx elt;
11484 return (const_vec_duplicate_p (x, &elt)
11485 && CONST_INT_P (elt)
11486 && IN_RANGE (INTVAL (elt), minval, maxval));
11489 bool
11490 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11492 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11495 /* Return true if VEC is a constant in which every element is in the range
11496 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11498 static bool
11499 aarch64_const_vec_all_in_range_p (rtx vec,
11500 HOST_WIDE_INT minval,
11501 HOST_WIDE_INT maxval)
11503 if (!CONST_VECTOR_P (vec)
11504 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11505 return false;
11507 int nunits;
11508 if (!CONST_VECTOR_STEPPED_P (vec))
11509 nunits = const_vector_encoded_nelts (vec);
11510 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11511 return false;
11513 for (int i = 0; i < nunits; i++)
11515 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11516 if (!CONST_INT_P (vec_elem)
11517 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11518 return false;
11520 return true;
11523 /* N Z C V. */
11524 #define AARCH64_CC_V 1
11525 #define AARCH64_CC_C (1 << 1)
11526 #define AARCH64_CC_Z (1 << 2)
11527 #define AARCH64_CC_N (1 << 3)
11529 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11530 static const int aarch64_nzcv_codes[] =
11532 0, /* EQ, Z == 1. */
11533 AARCH64_CC_Z, /* NE, Z == 0. */
11534 0, /* CS, C == 1. */
11535 AARCH64_CC_C, /* CC, C == 0. */
11536 0, /* MI, N == 1. */
11537 AARCH64_CC_N, /* PL, N == 0. */
11538 0, /* VS, V == 1. */
11539 AARCH64_CC_V, /* VC, V == 0. */
11540 0, /* HI, C ==1 && Z == 0. */
11541 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11542 AARCH64_CC_V, /* GE, N == V. */
11543 0, /* LT, N != V. */
11544 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11545 0, /* LE, !(Z == 0 && N == V). */
11546 0, /* AL, Any. */
11547 0 /* NV, Any. */
11550 /* Print floating-point vector immediate operand X to F, negating it
11551 first if NEGATE is true. Return true on success, false if it isn't
11552 a constant we can handle. */
11554 static bool
11555 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11557 rtx elt;
11559 if (!const_vec_duplicate_p (x, &elt))
11560 return false;
11562 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11563 if (negate)
11564 r = real_value_negate (&r);
11566 /* Handle the SVE single-bit immediates specially, since they have a
11567 fixed form in the assembly syntax. */
11568 if (real_equal (&r, &dconst0))
11569 asm_fprintf (f, "0.0");
11570 else if (real_equal (&r, &dconst2))
11571 asm_fprintf (f, "2.0");
11572 else if (real_equal (&r, &dconst1))
11573 asm_fprintf (f, "1.0");
11574 else if (real_equal (&r, &dconsthalf))
11575 asm_fprintf (f, "0.5");
11576 else
11578 const int buf_size = 20;
11579 char float_buf[buf_size] = {'\0'};
11580 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11581 1, GET_MODE (elt));
11582 asm_fprintf (f, "%s", float_buf);
11585 return true;
11588 /* Return the equivalent letter for size. */
11589 static char
11590 sizetochar (int size)
11592 switch (size)
11594 case 64: return 'd';
11595 case 32: return 's';
11596 case 16: return 'h';
11597 case 8 : return 'b';
11598 default: gcc_unreachable ();
11602 /* Print operand X to file F in a target specific manner according to CODE.
11603 The acceptable formatting commands given by CODE are:
11604 'c': An integer or symbol address without a preceding #
11605 sign.
11606 'C': Take the duplicated element in a vector constant
11607 and print it in hex.
11608 'D': Take the duplicated element in a vector constant
11609 and print it as an unsigned integer, in decimal.
11610 'e': Print the sign/zero-extend size as a character 8->b,
11611 16->h, 32->w. Can also be used for masks:
11612 0xff->b, 0xffff->h, 0xffffffff->w.
11613 'I': If the operand is a duplicated vector constant,
11614 replace it with the duplicated scalar. If the
11615 operand is then a floating-point constant, replace
11616 it with the integer bit representation. Print the
11617 transformed constant as a signed decimal number.
11618 'p': Prints N such that 2^N == X (X must be power of 2 and
11619 const int).
11620 'P': Print the number of non-zero bits in X (a const_int).
11621 'H': Print the higher numbered register of a pair (TImode)
11622 of regs.
11623 'm': Print a condition (eq, ne, etc).
11624 'M': Same as 'm', but invert condition.
11625 'N': Take the duplicated element in a vector constant
11626 and print the negative of it in decimal.
11627 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11628 'S/T/U/V': Print a FP/SIMD register name for a register list.
11629 The register printed is the FP/SIMD register name
11630 of X + 0/1/2/3 for S/T/U/V.
11631 'R': Print a scalar Integer/FP/SIMD register name + 1.
11632 'X': Print bottom 16 bits of integer constant in hex.
11633 'w/x': Print a general register name or the zero register
11634 (32-bit or 64-bit).
11635 '0': Print a normal operand, if it's a general register,
11636 then we assume DImode.
11637 'k': Print NZCV for conditional compare instructions.
11638 'A': Output address constant representing the first
11639 argument of X, specifying a relocation offset
11640 if appropriate.
11641 'L': Output constant address specified by X
11642 with a relocation offset if appropriate.
11643 'G': Prints address of X, specifying a PC relative
11644 relocation mode if appropriate.
11645 'y': Output address of LDP or STP - this is used for
11646 some LDP/STPs which don't use a PARALLEL in their
11647 pattern (so the mode needs to be adjusted).
11648 'z': Output address of a typical LDP or STP. */
11650 static void
11651 aarch64_print_operand (FILE *f, rtx x, int code)
11653 rtx elt;
11654 switch (code)
11656 case 'c':
11657 if (CONST_INT_P (x))
11658 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11659 else
11661 poly_int64 offset;
11662 rtx base = strip_offset_and_salt (x, &offset);
11663 if (SYMBOL_REF_P (base))
11664 output_addr_const (f, x);
11665 else
11666 output_operand_lossage ("unsupported operand for code '%c'", code);
11668 break;
11670 case 'e':
11672 x = unwrap_const_vec_duplicate (x);
11673 if (!CONST_INT_P (x))
11675 output_operand_lossage ("invalid operand for '%%%c'", code);
11676 return;
11679 HOST_WIDE_INT val = INTVAL (x);
11680 if ((val & ~7) == 8 || val == 0xff)
11681 fputc ('b', f);
11682 else if ((val & ~7) == 16 || val == 0xffff)
11683 fputc ('h', f);
11684 else if ((val & ~7) == 32 || val == 0xffffffff)
11685 fputc ('w', f);
11686 else
11688 output_operand_lossage ("invalid operand for '%%%c'", code);
11689 return;
11692 break;
11694 case 'p':
11696 int n;
11698 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11700 output_operand_lossage ("invalid operand for '%%%c'", code);
11701 return;
11704 asm_fprintf (f, "%d", n);
11706 break;
11708 case 'P':
11709 if (!CONST_INT_P (x))
11711 output_operand_lossage ("invalid operand for '%%%c'", code);
11712 return;
11715 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11716 break;
11718 case 'H':
11719 if (x == const0_rtx)
11721 asm_fprintf (f, "xzr");
11722 break;
11725 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11727 output_operand_lossage ("invalid operand for '%%%c'", code);
11728 return;
11731 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11732 break;
11734 case 'I':
11736 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11737 if (CONST_INT_P (x))
11738 asm_fprintf (f, "%wd", INTVAL (x));
11739 else
11741 output_operand_lossage ("invalid operand for '%%%c'", code);
11742 return;
11744 break;
11747 case 'M':
11748 case 'm':
11750 int cond_code;
11751 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11752 if (x == const_true_rtx)
11754 if (code == 'M')
11755 fputs ("nv", f);
11756 return;
11759 if (!COMPARISON_P (x))
11761 output_operand_lossage ("invalid operand for '%%%c'", code);
11762 return;
11765 cond_code = aarch64_get_condition_code (x);
11766 gcc_assert (cond_code >= 0);
11767 if (code == 'M')
11768 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11769 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11770 fputs (aarch64_sve_condition_codes[cond_code], f);
11771 else
11772 fputs (aarch64_condition_codes[cond_code], f);
11774 break;
11776 case 'N':
11777 if (!const_vec_duplicate_p (x, &elt))
11779 output_operand_lossage ("invalid vector constant");
11780 return;
11783 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11784 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
11785 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11786 && aarch64_print_vector_float_operand (f, x, true))
11788 else
11790 output_operand_lossage ("invalid vector constant");
11791 return;
11793 break;
11795 case 'b':
11796 case 'h':
11797 case 's':
11798 case 'd':
11799 case 'q':
11800 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11802 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11803 return;
11805 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
11806 break;
11808 case 'S':
11809 case 'T':
11810 case 'U':
11811 case 'V':
11812 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11814 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11815 return;
11817 asm_fprintf (f, "%c%d",
11818 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
11819 REGNO (x) - V0_REGNUM + (code - 'S'));
11820 break;
11822 case 'R':
11823 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
11824 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
11825 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
11826 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
11827 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
11828 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11829 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
11830 else
11831 output_operand_lossage ("incompatible register operand for '%%%c'",
11832 code);
11833 break;
11835 case 'X':
11836 if (!CONST_INT_P (x))
11838 output_operand_lossage ("invalid operand for '%%%c'", code);
11839 return;
11841 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
11842 break;
11844 case 'C':
11846 /* Print a replicated constant in hex. */
11847 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11849 output_operand_lossage ("invalid operand for '%%%c'", code);
11850 return;
11852 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11853 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11855 break;
11857 case 'D':
11859 /* Print a replicated constant in decimal, treating it as
11860 unsigned. */
11861 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11863 output_operand_lossage ("invalid operand for '%%%c'", code);
11864 return;
11866 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11867 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11869 break;
11871 case 'w':
11872 case 'x':
11873 if (x == const0_rtx
11874 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
11876 asm_fprintf (f, "%czr", code);
11877 break;
11880 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11882 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
11883 break;
11886 if (REG_P (x) && REGNO (x) == SP_REGNUM)
11888 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
11889 break;
11892 /* Fall through */
11894 case 0:
11895 if (x == NULL)
11897 output_operand_lossage ("missing operand");
11898 return;
11901 switch (GET_CODE (x))
11903 case REG:
11904 if (aarch64_sve_data_mode_p (GET_MODE (x)))
11906 if (REG_NREGS (x) == 1)
11907 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
11908 else
11910 char suffix
11911 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
11912 asm_fprintf (f, "{z%d.%c - z%d.%c}",
11913 REGNO (x) - V0_REGNUM, suffix,
11914 END_REGNO (x) - V0_REGNUM - 1, suffix);
11917 else
11918 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
11919 break;
11921 case MEM:
11922 output_address (GET_MODE (x), XEXP (x, 0));
11923 break;
11925 case LABEL_REF:
11926 case SYMBOL_REF:
11927 output_addr_const (asm_out_file, x);
11928 break;
11930 case CONST_INT:
11931 asm_fprintf (f, "%wd", INTVAL (x));
11932 break;
11934 case CONST:
11935 if (!VECTOR_MODE_P (GET_MODE (x)))
11937 output_addr_const (asm_out_file, x);
11938 break;
11940 /* fall through */
11942 case CONST_VECTOR:
11943 if (!const_vec_duplicate_p (x, &elt))
11945 output_operand_lossage ("invalid vector constant");
11946 return;
11949 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11950 asm_fprintf (f, "%wd", INTVAL (elt));
11951 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11952 && aarch64_print_vector_float_operand (f, x, false))
11954 else
11956 output_operand_lossage ("invalid vector constant");
11957 return;
11959 break;
11961 case CONST_DOUBLE:
11962 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
11963 be getting CONST_DOUBLEs holding integers. */
11964 gcc_assert (GET_MODE (x) != VOIDmode);
11965 if (aarch64_float_const_zero_rtx_p (x))
11967 fputc ('0', f);
11968 break;
11970 else if (aarch64_float_const_representable_p (x))
11972 #define buf_size 20
11973 char float_buf[buf_size] = {'\0'};
11974 real_to_decimal_for_mode (float_buf,
11975 CONST_DOUBLE_REAL_VALUE (x),
11976 buf_size, buf_size,
11977 1, GET_MODE (x));
11978 asm_fprintf (asm_out_file, "%s", float_buf);
11979 break;
11980 #undef buf_size
11982 output_operand_lossage ("invalid constant");
11983 return;
11984 default:
11985 output_operand_lossage ("invalid operand");
11986 return;
11988 break;
11990 case 'A':
11991 if (GET_CODE (x) == HIGH)
11992 x = XEXP (x, 0);
11994 switch (aarch64_classify_symbolic_expression (x))
11996 case SYMBOL_SMALL_GOT_4G:
11997 asm_fprintf (asm_out_file, ":got:");
11998 break;
12000 case SYMBOL_SMALL_TLSGD:
12001 asm_fprintf (asm_out_file, ":tlsgd:");
12002 break;
12004 case SYMBOL_SMALL_TLSDESC:
12005 asm_fprintf (asm_out_file, ":tlsdesc:");
12006 break;
12008 case SYMBOL_SMALL_TLSIE:
12009 asm_fprintf (asm_out_file, ":gottprel:");
12010 break;
12012 case SYMBOL_TLSLE24:
12013 asm_fprintf (asm_out_file, ":tprel:");
12014 break;
12016 case SYMBOL_TINY_GOT:
12017 gcc_unreachable ();
12018 break;
12020 default:
12021 break;
12023 output_addr_const (asm_out_file, x);
12024 break;
12026 case 'L':
12027 switch (aarch64_classify_symbolic_expression (x))
12029 case SYMBOL_SMALL_GOT_4G:
12030 asm_fprintf (asm_out_file, ":got_lo12:");
12031 break;
12033 case SYMBOL_SMALL_TLSGD:
12034 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12035 break;
12037 case SYMBOL_SMALL_TLSDESC:
12038 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12039 break;
12041 case SYMBOL_SMALL_TLSIE:
12042 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12043 break;
12045 case SYMBOL_TLSLE12:
12046 asm_fprintf (asm_out_file, ":tprel_lo12:");
12047 break;
12049 case SYMBOL_TLSLE24:
12050 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12051 break;
12053 case SYMBOL_TINY_GOT:
12054 asm_fprintf (asm_out_file, ":got:");
12055 break;
12057 case SYMBOL_TINY_TLSIE:
12058 asm_fprintf (asm_out_file, ":gottprel:");
12059 break;
12061 default:
12062 break;
12064 output_addr_const (asm_out_file, x);
12065 break;
12067 case 'G':
12068 switch (aarch64_classify_symbolic_expression (x))
12070 case SYMBOL_TLSLE24:
12071 asm_fprintf (asm_out_file, ":tprel_hi12:");
12072 break;
12073 default:
12074 break;
12076 output_addr_const (asm_out_file, x);
12077 break;
12079 case 'k':
12081 HOST_WIDE_INT cond_code;
12083 if (!CONST_INT_P (x))
12085 output_operand_lossage ("invalid operand for '%%%c'", code);
12086 return;
12089 cond_code = INTVAL (x);
12090 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12091 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12093 break;
12095 case 'y':
12096 case 'z':
12098 machine_mode mode = GET_MODE (x);
12100 if (!MEM_P (x)
12101 || (code == 'y'
12102 && maybe_ne (GET_MODE_SIZE (mode), 8)
12103 && maybe_ne (GET_MODE_SIZE (mode), 16)))
12105 output_operand_lossage ("invalid operand for '%%%c'", code);
12106 return;
12109 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12110 code == 'y'
12111 ? ADDR_QUERY_LDP_STP_N
12112 : ADDR_QUERY_LDP_STP))
12113 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12115 break;
12117 default:
12118 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12119 return;
12123 /* Print address 'x' of a memory access with mode 'mode'.
12124 'op' is the context required by aarch64_classify_address. It can either be
12125 MEM for a normal memory access or PARALLEL for LDP/STP. */
12126 static bool
12127 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12128 aarch64_addr_query_type type)
12130 struct aarch64_address_info addr;
12131 unsigned int size, vec_flags;
12133 /* Check all addresses are Pmode - including ILP32. */
12134 if (GET_MODE (x) != Pmode
12135 && (!CONST_INT_P (x)
12136 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12138 output_operand_lossage ("invalid address mode");
12139 return false;
12142 if (aarch64_classify_address (&addr, x, mode, true, type))
12143 switch (addr.type)
12145 case ADDRESS_REG_IMM:
12146 if (known_eq (addr.const_offset, 0))
12148 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12149 return true;
12152 vec_flags = aarch64_classify_vector_mode (mode);
12153 if (vec_flags & VEC_ANY_SVE)
12155 HOST_WIDE_INT vnum
12156 = exact_div (addr.const_offset,
12157 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12158 asm_fprintf (f, "[%s, #%wd, mul vl]",
12159 reg_names[REGNO (addr.base)], vnum);
12160 return true;
12163 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12164 INTVAL (addr.offset));
12165 return true;
12167 case ADDRESS_REG_REG:
12168 if (addr.shift == 0)
12169 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12170 reg_names [REGNO (addr.offset)]);
12171 else
12172 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12173 reg_names [REGNO (addr.offset)], addr.shift);
12174 return true;
12176 case ADDRESS_REG_UXTW:
12177 if (addr.shift == 0)
12178 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12179 REGNO (addr.offset) - R0_REGNUM);
12180 else
12181 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12182 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12183 return true;
12185 case ADDRESS_REG_SXTW:
12186 if (addr.shift == 0)
12187 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12188 REGNO (addr.offset) - R0_REGNUM);
12189 else
12190 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12191 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12192 return true;
12194 case ADDRESS_REG_WB:
12195 /* Writeback is only supported for fixed-width modes. */
12196 size = GET_MODE_SIZE (mode).to_constant ();
12197 switch (GET_CODE (x))
12199 case PRE_INC:
12200 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12201 return true;
12202 case POST_INC:
12203 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12204 return true;
12205 case PRE_DEC:
12206 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12207 return true;
12208 case POST_DEC:
12209 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12210 return true;
12211 case PRE_MODIFY:
12212 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12213 INTVAL (addr.offset));
12214 return true;
12215 case POST_MODIFY:
12216 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12217 INTVAL (addr.offset));
12218 return true;
12219 default:
12220 break;
12222 break;
12224 case ADDRESS_LO_SUM:
12225 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12226 output_addr_const (f, addr.offset);
12227 asm_fprintf (f, "]");
12228 return true;
12230 case ADDRESS_SYMBOLIC:
12231 output_addr_const (f, x);
12232 return true;
12235 return false;
12238 /* Print address 'x' of a memory access with mode 'mode'. */
12239 static void
12240 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12242 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12243 output_addr_const (f, x);
12246 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12248 static bool
12249 aarch64_output_addr_const_extra (FILE *file, rtx x)
12251 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12253 output_addr_const (file, XVECEXP (x, 0, 0));
12254 return true;
12256 return false;
12259 bool
12260 aarch64_label_mentioned_p (rtx x)
12262 const char *fmt;
12263 int i;
12265 if (LABEL_REF_P (x))
12266 return true;
12268 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12269 referencing instruction, but they are constant offsets, not
12270 symbols. */
12271 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12272 return false;
12274 fmt = GET_RTX_FORMAT (GET_CODE (x));
12275 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12277 if (fmt[i] == 'E')
12279 int j;
12281 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12282 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12283 return 1;
12285 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12286 return 1;
12289 return 0;
12292 /* Implement REGNO_REG_CLASS. */
12294 enum reg_class
12295 aarch64_regno_regclass (unsigned regno)
12297 if (STUB_REGNUM_P (regno))
12298 return STUB_REGS;
12300 if (GP_REGNUM_P (regno))
12301 return GENERAL_REGS;
12303 if (regno == SP_REGNUM)
12304 return STACK_REG;
12306 if (regno == FRAME_POINTER_REGNUM
12307 || regno == ARG_POINTER_REGNUM)
12308 return POINTER_REGS;
12310 if (FP_REGNUM_P (regno))
12311 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12312 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12314 if (PR_REGNUM_P (regno))
12315 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12317 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12318 return FFR_REGS;
12320 return NO_REGS;
12323 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12324 If OFFSET is out of range, return an offset of an anchor point
12325 that is in range. Return 0 otherwise. */
12327 static HOST_WIDE_INT
12328 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12329 machine_mode mode)
12331 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12332 if (size > 16)
12333 return (offset + 0x400) & ~0x7f0;
12335 /* For offsets that aren't a multiple of the access size, the limit is
12336 -256...255. */
12337 if (offset & (size - 1))
12339 /* BLKmode typically uses LDP of X-registers. */
12340 if (mode == BLKmode)
12341 return (offset + 512) & ~0x3ff;
12342 return (offset + 0x100) & ~0x1ff;
12345 /* Small negative offsets are supported. */
12346 if (IN_RANGE (offset, -256, 0))
12347 return 0;
12349 if (mode == TImode || mode == TFmode || mode == TDmode)
12350 return (offset + 0x100) & ~0x1ff;
12352 /* Use 12-bit offset by access size. */
12353 return offset & (~0xfff * size);
12356 static rtx
12357 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12359 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12360 where mask is selected by alignment and size of the offset.
12361 We try to pick as large a range for the offset as possible to
12362 maximize the chance of a CSE. However, for aligned addresses
12363 we limit the range to 4k so that structures with different sized
12364 elements are likely to use the same base. We need to be careful
12365 not to split a CONST for some forms of address expression, otherwise
12366 it will generate sub-optimal code. */
12368 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12370 rtx base = XEXP (x, 0);
12371 rtx offset_rtx = XEXP (x, 1);
12372 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12374 if (GET_CODE (base) == PLUS)
12376 rtx op0 = XEXP (base, 0);
12377 rtx op1 = XEXP (base, 1);
12379 /* Force any scaling into a temp for CSE. */
12380 op0 = force_reg (Pmode, op0);
12381 op1 = force_reg (Pmode, op1);
12383 /* Let the pointer register be in op0. */
12384 if (REG_POINTER (op1))
12385 std::swap (op0, op1);
12387 /* If the pointer is virtual or frame related, then we know that
12388 virtual register instantiation or register elimination is going
12389 to apply a second constant. We want the two constants folded
12390 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12391 if (virt_or_elim_regno_p (REGNO (op0)))
12393 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12394 NULL_RTX, true, OPTAB_DIRECT);
12395 return gen_rtx_PLUS (Pmode, base, op1);
12398 /* Otherwise, in order to encourage CSE (and thence loop strength
12399 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12400 base = expand_binop (Pmode, add_optab, op0, op1,
12401 NULL_RTX, true, OPTAB_DIRECT);
12402 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12405 HOST_WIDE_INT size;
12406 if (GET_MODE_SIZE (mode).is_constant (&size))
12408 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12409 mode);
12410 if (base_offset != 0)
12412 base = plus_constant (Pmode, base, base_offset);
12413 base = force_operand (base, NULL_RTX);
12414 return plus_constant (Pmode, base, offset - base_offset);
12419 return x;
12422 static reg_class_t
12423 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12424 reg_class_t rclass,
12425 machine_mode mode,
12426 secondary_reload_info *sri)
12428 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12429 LDR and STR. See the comment at the head of aarch64-sve.md for
12430 more details about the big-endian handling. */
12431 if (reg_class_subset_p (rclass, FP_REGS)
12432 && !((REG_P (x) && HARD_REGISTER_P (x))
12433 || aarch64_simd_valid_immediate (x, NULL))
12434 && mode != VNx16QImode)
12436 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12437 if ((vec_flags & VEC_SVE_DATA)
12438 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12440 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12441 return NO_REGS;
12445 /* If we have to disable direct literal pool loads and stores because the
12446 function is too big, then we need a scratch register. */
12447 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12448 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12449 || targetm.vector_mode_supported_p (GET_MODE (x)))
12450 && !aarch64_pcrelative_literal_loads)
12452 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12453 return NO_REGS;
12456 /* Without the TARGET_SIMD instructions we cannot move a Q register
12457 to a Q register directly. We need a scratch. */
12458 if (REG_P (x)
12459 && (mode == TFmode || mode == TImode || mode == TDmode)
12460 && mode == GET_MODE (x)
12461 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
12462 && reg_class_subset_p (rclass, FP_REGS))
12464 sri->icode = code_for_aarch64_reload_mov (mode);
12465 return NO_REGS;
12468 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12469 because AArch64 has richer addressing modes for LDR/STR instructions
12470 than LDP/STP instructions. */
12471 if (TARGET_FLOAT && rclass == GENERAL_REGS
12472 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12473 return FP_REGS;
12475 if (rclass == FP_REGS
12476 && (mode == TImode || mode == TFmode || mode == TDmode)
12477 && CONSTANT_P(x))
12478 return GENERAL_REGS;
12480 return NO_REGS;
12483 static bool
12484 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12486 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12488 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12489 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12490 if (frame_pointer_needed)
12491 return to == HARD_FRAME_POINTER_REGNUM;
12492 return true;
12495 poly_int64
12496 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12498 if (to == HARD_FRAME_POINTER_REGNUM)
12500 if (from == ARG_POINTER_REGNUM)
12501 return cfun->machine->frame.hard_fp_offset;
12503 if (from == FRAME_POINTER_REGNUM)
12504 return cfun->machine->frame.hard_fp_offset
12505 - cfun->machine->frame.locals_offset;
12508 if (to == STACK_POINTER_REGNUM)
12510 if (from == FRAME_POINTER_REGNUM)
12511 return cfun->machine->frame.frame_size
12512 - cfun->machine->frame.locals_offset;
12515 return cfun->machine->frame.frame_size;
12519 /* Get return address without mangling. */
12522 aarch64_return_addr_rtx (void)
12524 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12525 /* Note: aarch64_return_address_signing_enabled only
12526 works after cfun->machine->frame.laid_out is set,
12527 so here we don't know if the return address will
12528 be signed or not. */
12529 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12530 emit_move_insn (lr, val);
12531 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12532 return lr;
12536 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12537 previous frame. */
12540 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12542 if (count != 0)
12543 return const0_rtx;
12544 return aarch64_return_addr_rtx ();
12547 static void
12548 aarch64_asm_trampoline_template (FILE *f)
12550 /* Even if the current function doesn't have branch protection, some
12551 later function might, so since this template is only generated once
12552 we have to add a BTI just in case. */
12553 asm_fprintf (f, "\thint\t34 // bti c\n");
12555 if (TARGET_ILP32)
12557 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12558 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12560 else
12562 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12563 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12565 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12567 /* We always emit a speculation barrier.
12568 This is because the same trampoline template is used for every nested
12569 function. Since nested functions are not particularly common or
12570 performant we don't worry too much about the extra instructions to copy
12571 around.
12572 This is not yet a problem, since we have not yet implemented function
12573 specific attributes to choose between hardening against straight line
12574 speculation or not, but such function specific attributes are likely to
12575 happen in the future. */
12576 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12578 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12579 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12582 static void
12583 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12585 rtx fnaddr, mem, a_tramp;
12586 const int tramp_code_sz = 24;
12588 /* Don't need to copy the trailing D-words, we fill those in below. */
12589 /* We create our own memory address in Pmode so that `emit_block_move` can
12590 use parts of the backend which expect Pmode addresses. */
12591 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12592 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12593 assemble_trampoline_template (),
12594 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12595 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12596 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12597 if (GET_MODE (fnaddr) != ptr_mode)
12598 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12599 emit_move_insn (mem, fnaddr);
12601 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12602 emit_move_insn (mem, chain_value);
12604 /* XXX We should really define a "clear_cache" pattern and use
12605 gen_clear_cache(). */
12606 a_tramp = XEXP (m_tramp, 0);
12607 maybe_emit_call_builtin___clear_cache (a_tramp,
12608 plus_constant (ptr_mode,
12609 a_tramp,
12610 TRAMPOLINE_SIZE));
12613 static unsigned char
12614 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12616 /* ??? Logically we should only need to provide a value when
12617 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12618 can hold MODE, but at the moment we need to handle all modes.
12619 Just ignore any runtime parts for registers that can't store them. */
12620 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12621 unsigned int nregs, vec_flags;
12622 switch (regclass)
12624 case STUB_REGS:
12625 case TAILCALL_ADDR_REGS:
12626 case POINTER_REGS:
12627 case GENERAL_REGS:
12628 case ALL_REGS:
12629 case POINTER_AND_FP_REGS:
12630 case FP_REGS:
12631 case FP_LO_REGS:
12632 case FP_LO8_REGS:
12633 vec_flags = aarch64_classify_vector_mode (mode);
12634 if ((vec_flags & VEC_SVE_DATA)
12635 && constant_multiple_p (GET_MODE_SIZE (mode),
12636 aarch64_vl_bytes (mode, vec_flags), &nregs))
12637 return nregs;
12638 return (vec_flags & VEC_ADVSIMD
12639 ? CEIL (lowest_size, UNITS_PER_VREG)
12640 : CEIL (lowest_size, UNITS_PER_WORD));
12641 case STACK_REG:
12642 case PR_REGS:
12643 case PR_LO_REGS:
12644 case PR_HI_REGS:
12645 case FFR_REGS:
12646 case PR_AND_FFR_REGS:
12647 return 1;
12649 case NO_REGS:
12650 return 0;
12652 default:
12653 break;
12655 gcc_unreachable ();
12658 static reg_class_t
12659 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12661 if (regclass == POINTER_REGS)
12662 return GENERAL_REGS;
12664 if (regclass == STACK_REG)
12666 if (REG_P(x)
12667 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12668 return regclass;
12670 return NO_REGS;
12673 /* Register eliminiation can result in a request for
12674 SP+constant->FP_REGS. We cannot support such operations which
12675 use SP as source and an FP_REG as destination, so reject out
12676 right now. */
12677 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12679 rtx lhs = XEXP (x, 0);
12681 /* Look through a possible SUBREG introduced by ILP32. */
12682 if (SUBREG_P (lhs))
12683 lhs = SUBREG_REG (lhs);
12685 gcc_assert (REG_P (lhs));
12686 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12687 POINTER_REGS));
12688 return NO_REGS;
12691 return regclass;
12694 void
12695 aarch64_asm_output_labelref (FILE* f, const char *name)
12697 asm_fprintf (f, "%U%s", name);
12700 static void
12701 aarch64_elf_asm_constructor (rtx symbol, int priority)
12703 if (priority == DEFAULT_INIT_PRIORITY)
12704 default_ctor_section_asm_out_constructor (symbol, priority);
12705 else
12707 section *s;
12708 /* While priority is known to be in range [0, 65535], so 18 bytes
12709 would be enough, the compiler might not know that. To avoid
12710 -Wformat-truncation false positive, use a larger size. */
12711 char buf[23];
12712 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12713 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12714 switch_to_section (s);
12715 assemble_align (POINTER_SIZE);
12716 assemble_aligned_integer (POINTER_BYTES, symbol);
12720 static void
12721 aarch64_elf_asm_destructor (rtx symbol, int priority)
12723 if (priority == DEFAULT_INIT_PRIORITY)
12724 default_dtor_section_asm_out_destructor (symbol, priority);
12725 else
12727 section *s;
12728 /* While priority is known to be in range [0, 65535], so 18 bytes
12729 would be enough, the compiler might not know that. To avoid
12730 -Wformat-truncation false positive, use a larger size. */
12731 char buf[23];
12732 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12733 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12734 switch_to_section (s);
12735 assemble_align (POINTER_SIZE);
12736 assemble_aligned_integer (POINTER_BYTES, symbol);
12740 const char*
12741 aarch64_output_casesi (rtx *operands)
12743 char buf[100];
12744 char label[100];
12745 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12746 int index;
12747 static const char *const patterns[4][2] =
12750 "ldrb\t%w3, [%0,%w1,uxtw]",
12751 "add\t%3, %4, %w3, sxtb #2"
12754 "ldrh\t%w3, [%0,%w1,uxtw #1]",
12755 "add\t%3, %4, %w3, sxth #2"
12758 "ldr\t%w3, [%0,%w1,uxtw #2]",
12759 "add\t%3, %4, %w3, sxtw #2"
12761 /* We assume that DImode is only generated when not optimizing and
12762 that we don't really need 64-bit address offsets. That would
12763 imply an object file with 8GB of code in a single function! */
12765 "ldr\t%w3, [%0,%w1,uxtw #2]",
12766 "add\t%3, %4, %w3, sxtw #2"
12770 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
12772 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
12773 index = exact_log2 (GET_MODE_SIZE (mode));
12775 gcc_assert (index >= 0 && index <= 3);
12777 /* Need to implement table size reduction, by chaning the code below. */
12778 output_asm_insn (patterns[index][0], operands);
12779 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
12780 snprintf (buf, sizeof (buf),
12781 "adr\t%%4, %s", targetm.strip_name_encoding (label));
12782 output_asm_insn (buf, operands);
12783 output_asm_insn (patterns[index][1], operands);
12784 output_asm_insn ("br\t%3", operands);
12785 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
12786 operands);
12787 assemble_label (asm_out_file, label);
12788 return "";
12792 /* Return size in bits of an arithmetic operand which is shifted/scaled and
12793 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
12794 operator. */
12797 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
12799 if (shift >= 0 && shift <= 3)
12801 int size;
12802 for (size = 8; size <= 32; size *= 2)
12804 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
12805 if (mask == bits << shift)
12806 return size;
12809 return 0;
12812 /* Constant pools are per function only when PC relative
12813 literal loads are true or we are in the large memory
12814 model. */
12816 static inline bool
12817 aarch64_can_use_per_function_literal_pools_p (void)
12819 return (aarch64_pcrelative_literal_loads
12820 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
12823 static bool
12824 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
12826 /* We can't use blocks for constants when we're using a per-function
12827 constant pool. */
12828 return !aarch64_can_use_per_function_literal_pools_p ();
12831 /* Select appropriate section for constants depending
12832 on where we place literal pools. */
12834 static section *
12835 aarch64_select_rtx_section (machine_mode mode,
12836 rtx x,
12837 unsigned HOST_WIDE_INT align)
12839 if (aarch64_can_use_per_function_literal_pools_p ())
12840 return function_section (current_function_decl);
12842 return default_elf_select_rtx_section (mode, x, align);
12845 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
12846 void
12847 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
12848 HOST_WIDE_INT offset)
12850 /* When using per-function literal pools, we must ensure that any code
12851 section is aligned to the minimal instruction length, lest we get
12852 errors from the assembler re "unaligned instructions". */
12853 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
12854 ASM_OUTPUT_ALIGN (f, 2);
12857 /* Costs. */
12859 /* Helper function for rtx cost calculation. Strip a shift expression
12860 from X. Returns the inner operand if successful, or the original
12861 expression on failure. */
12862 static rtx
12863 aarch64_strip_shift (rtx x)
12865 rtx op = x;
12867 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
12868 we can convert both to ROR during final output. */
12869 if ((GET_CODE (op) == ASHIFT
12870 || GET_CODE (op) == ASHIFTRT
12871 || GET_CODE (op) == LSHIFTRT
12872 || GET_CODE (op) == ROTATERT
12873 || GET_CODE (op) == ROTATE)
12874 && CONST_INT_P (XEXP (op, 1)))
12875 return XEXP (op, 0);
12877 if (GET_CODE (op) == MULT
12878 && CONST_INT_P (XEXP (op, 1))
12879 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
12880 return XEXP (op, 0);
12882 return x;
12885 /* Helper function for rtx cost calculation. Strip an extend
12886 expression from X. Returns the inner operand if successful, or the
12887 original expression on failure. We deal with a number of possible
12888 canonicalization variations here. If STRIP_SHIFT is true, then
12889 we can strip off a shift also. */
12890 static rtx
12891 aarch64_strip_extend (rtx x, bool strip_shift)
12893 scalar_int_mode mode;
12894 rtx op = x;
12896 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
12897 return op;
12899 if (GET_CODE (op) == AND
12900 && GET_CODE (XEXP (op, 0)) == MULT
12901 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
12902 && CONST_INT_P (XEXP (op, 1))
12903 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
12904 INTVAL (XEXP (op, 1))) != 0)
12905 return XEXP (XEXP (op, 0), 0);
12907 /* Now handle extended register, as this may also have an optional
12908 left shift by 1..4. */
12909 if (strip_shift
12910 && GET_CODE (op) == ASHIFT
12911 && CONST_INT_P (XEXP (op, 1))
12912 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
12913 op = XEXP (op, 0);
12915 if (GET_CODE (op) == ZERO_EXTEND
12916 || GET_CODE (op) == SIGN_EXTEND)
12917 op = XEXP (op, 0);
12919 if (op != x)
12920 return op;
12922 return x;
12925 /* Helper function for rtx cost calculation. Strip extension as well as any
12926 inner VEC_SELECT high-half from X. Returns the inner vector operand if
12927 successful, or the original expression on failure. */
12928 static rtx
12929 aarch64_strip_extend_vec_half (rtx x)
12931 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
12933 x = XEXP (x, 0);
12934 if (GET_CODE (x) == VEC_SELECT
12935 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
12936 XEXP (x, 1)))
12937 x = XEXP (x, 0);
12939 return x;
12942 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
12943 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
12944 operand if successful, or the original expression on failure. */
12945 static rtx
12946 aarch64_strip_duplicate_vec_elt (rtx x)
12948 if (GET_CODE (x) == VEC_DUPLICATE
12949 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
12951 x = XEXP (x, 0);
12952 if (GET_CODE (x) == VEC_SELECT)
12953 x = XEXP (x, 0);
12954 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
12955 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
12956 x = XEXP (XEXP (x, 0), 0);
12958 return x;
12961 /* Return true iff CODE is a shift supported in combination
12962 with arithmetic instructions. */
12964 static bool
12965 aarch64_shift_p (enum rtx_code code)
12967 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
12971 /* Return true iff X is a cheap shift without a sign extend. */
12973 static bool
12974 aarch64_cheap_mult_shift_p (rtx x)
12976 rtx op0, op1;
12978 op0 = XEXP (x, 0);
12979 op1 = XEXP (x, 1);
12981 if (!(aarch64_tune_params.extra_tuning_flags
12982 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
12983 return false;
12985 if (GET_CODE (op0) == SIGN_EXTEND)
12986 return false;
12988 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
12989 && UINTVAL (op1) <= 4)
12990 return true;
12992 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
12993 return false;
12995 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
12997 if (l2 > 0 && l2 <= 4)
12998 return true;
13000 return false;
13003 /* Helper function for rtx cost calculation. Calculate the cost of
13004 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13005 Return the calculated cost of the expression, recursing manually in to
13006 operands where needed. */
13008 static int
13009 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13011 rtx op0, op1;
13012 const struct cpu_cost_table *extra_cost
13013 = aarch64_tune_params.insn_extra_cost;
13014 int cost = 0;
13015 bool compound_p = (outer == PLUS || outer == MINUS);
13016 machine_mode mode = GET_MODE (x);
13018 gcc_checking_assert (code == MULT);
13020 op0 = XEXP (x, 0);
13021 op1 = XEXP (x, 1);
13023 if (VECTOR_MODE_P (mode))
13025 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13026 if (vec_flags & VEC_ADVSIMD)
13028 /* The select-operand-high-half versions of the instruction have the
13029 same cost as the three vector version - don't add the costs of the
13030 extension or selection into the costs of the multiply. */
13031 op0 = aarch64_strip_extend_vec_half (op0);
13032 op1 = aarch64_strip_extend_vec_half (op1);
13033 /* The by-element versions of the instruction have the same costs as
13034 the normal 3-vector version. We make an assumption that the input
13035 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13036 costing of a MUL by element pre RA is a bit optimistic. */
13037 op0 = aarch64_strip_duplicate_vec_elt (op0);
13038 op1 = aarch64_strip_duplicate_vec_elt (op1);
13040 cost += rtx_cost (op0, mode, MULT, 0, speed);
13041 cost += rtx_cost (op1, mode, MULT, 1, speed);
13042 if (speed)
13044 if (GET_CODE (x) == MULT)
13045 cost += extra_cost->vect.mult;
13046 /* This is to catch the SSRA costing currently flowing here. */
13047 else
13048 cost += extra_cost->vect.alu;
13050 return cost;
13053 /* Integer multiply/fma. */
13054 if (GET_MODE_CLASS (mode) == MODE_INT)
13056 /* The multiply will be canonicalized as a shift, cost it as such. */
13057 if (aarch64_shift_p (GET_CODE (x))
13058 || (CONST_INT_P (op1)
13059 && exact_log2 (INTVAL (op1)) > 0))
13061 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13062 || GET_CODE (op0) == SIGN_EXTEND;
13063 if (speed)
13065 if (compound_p)
13067 /* If the shift is considered cheap,
13068 then don't add any cost. */
13069 if (aarch64_cheap_mult_shift_p (x))
13071 else if (REG_P (op1))
13072 /* ARITH + shift-by-register. */
13073 cost += extra_cost->alu.arith_shift_reg;
13074 else if (is_extend)
13075 /* ARITH + extended register. We don't have a cost field
13076 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13077 cost += extra_cost->alu.extend_arith;
13078 else
13079 /* ARITH + shift-by-immediate. */
13080 cost += extra_cost->alu.arith_shift;
13082 else
13083 /* LSL (immediate). */
13084 cost += extra_cost->alu.shift;
13087 /* Strip extends as we will have costed them in the case above. */
13088 if (is_extend)
13089 op0 = aarch64_strip_extend (op0, true);
13091 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13093 return cost;
13096 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13097 compound and let the below cases handle it. After all, MNEG is a
13098 special-case alias of MSUB. */
13099 if (GET_CODE (op0) == NEG)
13101 op0 = XEXP (op0, 0);
13102 compound_p = true;
13105 /* Integer multiplies or FMAs have zero/sign extending variants. */
13106 if ((GET_CODE (op0) == ZERO_EXTEND
13107 && GET_CODE (op1) == ZERO_EXTEND)
13108 || (GET_CODE (op0) == SIGN_EXTEND
13109 && GET_CODE (op1) == SIGN_EXTEND))
13111 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13112 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13114 if (speed)
13116 if (compound_p)
13117 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13118 cost += extra_cost->mult[0].extend_add;
13119 else
13120 /* MUL/SMULL/UMULL. */
13121 cost += extra_cost->mult[0].extend;
13124 return cost;
13127 /* This is either an integer multiply or a MADD. In both cases
13128 we want to recurse and cost the operands. */
13129 cost += rtx_cost (op0, mode, MULT, 0, speed);
13130 cost += rtx_cost (op1, mode, MULT, 1, speed);
13132 if (speed)
13134 if (compound_p)
13135 /* MADD/MSUB. */
13136 cost += extra_cost->mult[mode == DImode].add;
13137 else
13138 /* MUL. */
13139 cost += extra_cost->mult[mode == DImode].simple;
13142 return cost;
13144 else
13146 if (speed)
13148 /* Floating-point FMA/FMUL can also support negations of the
13149 operands, unless the rounding mode is upward or downward in
13150 which case FNMUL is different than FMUL with operand negation. */
13151 bool neg0 = GET_CODE (op0) == NEG;
13152 bool neg1 = GET_CODE (op1) == NEG;
13153 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13155 if (neg0)
13156 op0 = XEXP (op0, 0);
13157 if (neg1)
13158 op1 = XEXP (op1, 0);
13161 if (compound_p)
13162 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13163 cost += extra_cost->fp[mode == DFmode].fma;
13164 else
13165 /* FMUL/FNMUL. */
13166 cost += extra_cost->fp[mode == DFmode].mult;
13169 cost += rtx_cost (op0, mode, MULT, 0, speed);
13170 cost += rtx_cost (op1, mode, MULT, 1, speed);
13171 return cost;
13175 static int
13176 aarch64_address_cost (rtx x,
13177 machine_mode mode,
13178 addr_space_t as ATTRIBUTE_UNUSED,
13179 bool speed)
13181 enum rtx_code c = GET_CODE (x);
13182 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13183 struct aarch64_address_info info;
13184 int cost = 0;
13185 info.shift = 0;
13187 if (!aarch64_classify_address (&info, x, mode, false))
13189 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13191 /* This is a CONST or SYMBOL ref which will be split
13192 in a different way depending on the code model in use.
13193 Cost it through the generic infrastructure. */
13194 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13195 /* Divide through by the cost of one instruction to
13196 bring it to the same units as the address costs. */
13197 cost_symbol_ref /= COSTS_N_INSNS (1);
13198 /* The cost is then the cost of preparing the address,
13199 followed by an immediate (possibly 0) offset. */
13200 return cost_symbol_ref + addr_cost->imm_offset;
13202 else
13204 /* This is most likely a jump table from a case
13205 statement. */
13206 return addr_cost->register_offset;
13210 switch (info.type)
13212 case ADDRESS_LO_SUM:
13213 case ADDRESS_SYMBOLIC:
13214 case ADDRESS_REG_IMM:
13215 cost += addr_cost->imm_offset;
13216 break;
13218 case ADDRESS_REG_WB:
13219 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13220 cost += addr_cost->pre_modify;
13221 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13223 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13224 if (nvectors == 3)
13225 cost += addr_cost->post_modify_ld3_st3;
13226 else if (nvectors == 4)
13227 cost += addr_cost->post_modify_ld4_st4;
13228 else
13229 cost += addr_cost->post_modify;
13231 else
13232 gcc_unreachable ();
13234 break;
13236 case ADDRESS_REG_REG:
13237 cost += addr_cost->register_offset;
13238 break;
13240 case ADDRESS_REG_SXTW:
13241 cost += addr_cost->register_sextend;
13242 break;
13244 case ADDRESS_REG_UXTW:
13245 cost += addr_cost->register_zextend;
13246 break;
13248 default:
13249 gcc_unreachable ();
13253 if (info.shift > 0)
13255 /* For the sake of calculating the cost of the shifted register
13256 component, we can treat same sized modes in the same way. */
13257 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13258 cost += addr_cost->addr_scale_costs.hi;
13259 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13260 cost += addr_cost->addr_scale_costs.si;
13261 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13262 cost += addr_cost->addr_scale_costs.di;
13263 else
13264 /* We can't tell, or this is a 128-bit vector. */
13265 cost += addr_cost->addr_scale_costs.ti;
13268 return cost;
13271 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13272 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13273 to be taken. */
13276 aarch64_branch_cost (bool speed_p, bool predictable_p)
13278 /* When optimizing for speed, use the cost of unpredictable branches. */
13279 const struct cpu_branch_cost *branch_costs =
13280 aarch64_tune_params.branch_costs;
13282 if (!speed_p || predictable_p)
13283 return branch_costs->predictable;
13284 else
13285 return branch_costs->unpredictable;
13288 /* Return true if X is a zero or sign extract
13289 usable in an ADD or SUB (extended register) instruction. */
13290 static bool
13291 aarch64_rtx_arith_op_extract_p (rtx x)
13293 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13294 No shift. */
13295 if (GET_CODE (x) == SIGN_EXTEND
13296 || GET_CODE (x) == ZERO_EXTEND)
13297 return REG_P (XEXP (x, 0));
13299 return false;
13302 static bool
13303 aarch64_frint_unspec_p (unsigned int u)
13305 switch (u)
13307 case UNSPEC_FRINTZ:
13308 case UNSPEC_FRINTP:
13309 case UNSPEC_FRINTM:
13310 case UNSPEC_FRINTA:
13311 case UNSPEC_FRINTN:
13312 case UNSPEC_FRINTX:
13313 case UNSPEC_FRINTI:
13314 return true;
13316 default:
13317 return false;
13321 /* Return true iff X is an rtx that will match an extr instruction
13322 i.e. as described in the *extr<mode>5_insn family of patterns.
13323 OP0 and OP1 will be set to the operands of the shifts involved
13324 on success and will be NULL_RTX otherwise. */
13326 static bool
13327 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13329 rtx op0, op1;
13330 scalar_int_mode mode;
13331 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13332 return false;
13334 *res_op0 = NULL_RTX;
13335 *res_op1 = NULL_RTX;
13337 if (GET_CODE (x) != IOR)
13338 return false;
13340 op0 = XEXP (x, 0);
13341 op1 = XEXP (x, 1);
13343 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13344 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13346 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13347 if (GET_CODE (op1) == ASHIFT)
13348 std::swap (op0, op1);
13350 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13351 return false;
13353 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13354 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13356 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13357 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13359 *res_op0 = XEXP (op0, 0);
13360 *res_op1 = XEXP (op1, 0);
13361 return true;
13365 return false;
13368 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13369 storing it in *COST. Result is true if the total cost of the operation
13370 has now been calculated. */
13371 static bool
13372 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13374 rtx inner;
13375 rtx comparator;
13376 enum rtx_code cmpcode;
13377 const struct cpu_cost_table *extra_cost
13378 = aarch64_tune_params.insn_extra_cost;
13380 if (COMPARISON_P (op0))
13382 inner = XEXP (op0, 0);
13383 comparator = XEXP (op0, 1);
13384 cmpcode = GET_CODE (op0);
13386 else
13388 inner = op0;
13389 comparator = const0_rtx;
13390 cmpcode = NE;
13393 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13395 /* Conditional branch. */
13396 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13397 return true;
13398 else
13400 if (cmpcode == NE || cmpcode == EQ)
13402 if (comparator == const0_rtx)
13404 /* TBZ/TBNZ/CBZ/CBNZ. */
13405 if (GET_CODE (inner) == ZERO_EXTRACT)
13406 /* TBZ/TBNZ. */
13407 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13408 ZERO_EXTRACT, 0, speed);
13409 else
13410 /* CBZ/CBNZ. */
13411 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13413 return true;
13415 if (register_operand (inner, VOIDmode)
13416 && aarch64_imm24 (comparator, VOIDmode))
13418 /* SUB and SUBS. */
13419 *cost += COSTS_N_INSNS (2);
13420 if (speed)
13421 *cost += extra_cost->alu.arith * 2;
13422 return true;
13425 else if (cmpcode == LT || cmpcode == GE)
13427 /* TBZ/TBNZ. */
13428 if (comparator == const0_rtx)
13429 return true;
13433 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13435 /* CCMP. */
13436 if (GET_CODE (op1) == COMPARE)
13438 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13439 if (XEXP (op1, 1) == const0_rtx)
13440 *cost += 1;
13441 if (speed)
13443 machine_mode mode = GET_MODE (XEXP (op1, 0));
13445 if (GET_MODE_CLASS (mode) == MODE_INT)
13446 *cost += extra_cost->alu.arith;
13447 else
13448 *cost += extra_cost->fp[mode == DFmode].compare;
13450 return true;
13453 /* It's a conditional operation based on the status flags,
13454 so it must be some flavor of CSEL. */
13456 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13457 if (GET_CODE (op1) == NEG
13458 || GET_CODE (op1) == NOT
13459 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13460 op1 = XEXP (op1, 0);
13461 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13463 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13464 op1 = XEXP (op1, 0);
13465 op2 = XEXP (op2, 0);
13467 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13469 inner = XEXP (op1, 0);
13470 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13471 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13472 op1 = XEXP (inner, 0);
13475 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13476 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13477 return true;
13480 /* We don't know what this is, cost all operands. */
13481 return false;
13484 /* Check whether X is a bitfield operation of the form shift + extend that
13485 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13486 operand to which the bitfield operation is applied. Otherwise return
13487 NULL_RTX. */
13489 static rtx
13490 aarch64_extend_bitfield_pattern_p (rtx x)
13492 rtx_code outer_code = GET_CODE (x);
13493 machine_mode outer_mode = GET_MODE (x);
13495 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13496 && outer_mode != SImode && outer_mode != DImode)
13497 return NULL_RTX;
13499 rtx inner = XEXP (x, 0);
13500 rtx_code inner_code = GET_CODE (inner);
13501 machine_mode inner_mode = GET_MODE (inner);
13502 rtx op = NULL_RTX;
13504 switch (inner_code)
13506 case ASHIFT:
13507 if (CONST_INT_P (XEXP (inner, 1))
13508 && (inner_mode == QImode || inner_mode == HImode))
13509 op = XEXP (inner, 0);
13510 break;
13511 case LSHIFTRT:
13512 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13513 && (inner_mode == QImode || inner_mode == HImode))
13514 op = XEXP (inner, 0);
13515 break;
13516 case ASHIFTRT:
13517 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13518 && (inner_mode == QImode || inner_mode == HImode))
13519 op = XEXP (inner, 0);
13520 break;
13521 default:
13522 break;
13525 return op;
13528 /* Return true if the mask and a shift amount from an RTX of the form
13529 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13530 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13532 bool
13533 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13534 rtx shft_amnt)
13536 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13537 && INTVAL (mask) > 0
13538 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13539 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13540 && (UINTVAL (mask)
13541 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
13544 /* Return true if the masks and a shift amount from an RTX of the form
13545 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13546 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
13548 bool
13549 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13550 unsigned HOST_WIDE_INT mask1,
13551 unsigned HOST_WIDE_INT shft_amnt,
13552 unsigned HOST_WIDE_INT mask2)
13554 unsigned HOST_WIDE_INT t;
13556 /* Verify that there is no overlap in what bits are set in the two masks. */
13557 if (mask1 != ~mask2)
13558 return false;
13560 /* Verify that mask2 is not all zeros or ones. */
13561 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13562 return false;
13564 /* The shift amount should always be less than the mode size. */
13565 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13567 /* Verify that the mask being shifted is contiguous and would be in the
13568 least significant bits after shifting by shft_amnt. */
13569 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13570 return (t == (t & -t));
13573 /* Calculate the cost of calculating X, storing it in *COST. Result
13574 is true if the total cost of the operation has now been calculated. */
13575 static bool
13576 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13577 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13579 rtx op0, op1, op2;
13580 const struct cpu_cost_table *extra_cost
13581 = aarch64_tune_params.insn_extra_cost;
13582 rtx_code code = GET_CODE (x);
13583 scalar_int_mode int_mode;
13585 /* By default, assume that everything has equivalent cost to the
13586 cheapest instruction. Any additional costs are applied as a delta
13587 above this default. */
13588 *cost = COSTS_N_INSNS (1);
13590 switch (code)
13592 case SET:
13593 /* The cost depends entirely on the operands to SET. */
13594 *cost = 0;
13595 op0 = SET_DEST (x);
13596 op1 = SET_SRC (x);
13598 switch (GET_CODE (op0))
13600 case MEM:
13601 if (speed)
13603 rtx address = XEXP (op0, 0);
13604 if (VECTOR_MODE_P (mode))
13605 *cost += extra_cost->ldst.storev;
13606 else if (GET_MODE_CLASS (mode) == MODE_INT)
13607 *cost += extra_cost->ldst.store;
13608 else if (mode == SFmode || mode == SDmode)
13609 *cost += extra_cost->ldst.storef;
13610 else if (mode == DFmode || mode == DDmode)
13611 *cost += extra_cost->ldst.stored;
13613 *cost +=
13614 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13615 0, speed));
13618 *cost += rtx_cost (op1, mode, SET, 1, speed);
13619 return true;
13621 case SUBREG:
13622 if (! REG_P (SUBREG_REG (op0)))
13623 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13625 /* Fall through. */
13626 case REG:
13627 /* The cost is one per vector-register copied. */
13628 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13630 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13631 *cost = COSTS_N_INSNS (nregs);
13633 /* const0_rtx is in general free, but we will use an
13634 instruction to set a register to 0. */
13635 else if (REG_P (op1) || op1 == const0_rtx)
13637 /* The cost is 1 per register copied. */
13638 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13639 *cost = COSTS_N_INSNS (nregs);
13641 else
13642 /* Cost is just the cost of the RHS of the set. */
13643 *cost += rtx_cost (op1, mode, SET, 1, speed);
13644 return true;
13646 case ZERO_EXTRACT:
13647 case SIGN_EXTRACT:
13648 /* Bit-field insertion. Strip any redundant widening of
13649 the RHS to meet the width of the target. */
13650 if (SUBREG_P (op1))
13651 op1 = SUBREG_REG (op1);
13652 if ((GET_CODE (op1) == ZERO_EXTEND
13653 || GET_CODE (op1) == SIGN_EXTEND)
13654 && CONST_INT_P (XEXP (op0, 1))
13655 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13656 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13657 op1 = XEXP (op1, 0);
13659 if (CONST_INT_P (op1))
13661 /* MOV immediate is assumed to always be cheap. */
13662 *cost = COSTS_N_INSNS (1);
13664 else
13666 /* BFM. */
13667 if (speed)
13668 *cost += extra_cost->alu.bfi;
13669 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13672 return true;
13674 default:
13675 /* We can't make sense of this, assume default cost. */
13676 *cost = COSTS_N_INSNS (1);
13677 return false;
13679 return false;
13681 case CONST_INT:
13682 /* If an instruction can incorporate a constant within the
13683 instruction, the instruction's expression avoids calling
13684 rtx_cost() on the constant. If rtx_cost() is called on a
13685 constant, then it is usually because the constant must be
13686 moved into a register by one or more instructions.
13688 The exception is constant 0, which can be expressed
13689 as XZR/WZR and is therefore free. The exception to this is
13690 if we have (set (reg) (const0_rtx)) in which case we must cost
13691 the move. However, we can catch that when we cost the SET, so
13692 we don't need to consider that here. */
13693 if (x == const0_rtx)
13694 *cost = 0;
13695 else
13697 /* To an approximation, building any other constant is
13698 proportionally expensive to the number of instructions
13699 required to build that constant. This is true whether we
13700 are compiling for SPEED or otherwise. */
13701 if (!is_a <scalar_int_mode> (mode, &int_mode))
13702 int_mode = word_mode;
13703 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13704 (NULL_RTX, x, false, int_mode));
13706 return true;
13708 case CONST_DOUBLE:
13710 /* First determine number of instructions to do the move
13711 as an integer constant. */
13712 if (!aarch64_float_const_representable_p (x)
13713 && !aarch64_can_const_movi_rtx_p (x, mode)
13714 && aarch64_float_const_rtx_p (x))
13716 unsigned HOST_WIDE_INT ival;
13717 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13718 gcc_assert (succeed);
13720 scalar_int_mode imode = (mode == HFmode
13721 ? SImode
13722 : int_mode_for_mode (mode).require ());
13723 int ncost = aarch64_internal_mov_immediate
13724 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13725 *cost += COSTS_N_INSNS (ncost);
13726 return true;
13729 if (speed)
13731 /* mov[df,sf]_aarch64. */
13732 if (aarch64_float_const_representable_p (x))
13733 /* FMOV (scalar immediate). */
13734 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
13735 else if (!aarch64_float_const_zero_rtx_p (x))
13737 /* This will be a load from memory. */
13738 if (mode == DFmode || mode == DDmode)
13739 *cost += extra_cost->ldst.loadd;
13740 else
13741 *cost += extra_cost->ldst.loadf;
13743 else
13744 /* Otherwise this is +0.0. We get this using MOVI d0, #0
13745 or MOV v0.s[0], wzr - neither of which are modeled by the
13746 cost tables. Just use the default cost. */
13751 return true;
13753 case MEM:
13754 if (speed)
13756 /* For loads we want the base cost of a load, plus an
13757 approximation for the additional cost of the addressing
13758 mode. */
13759 rtx address = XEXP (x, 0);
13760 if (VECTOR_MODE_P (mode))
13761 *cost += extra_cost->ldst.loadv;
13762 else if (GET_MODE_CLASS (mode) == MODE_INT)
13763 *cost += extra_cost->ldst.load;
13764 else if (mode == SFmode || mode == SDmode)
13765 *cost += extra_cost->ldst.loadf;
13766 else if (mode == DFmode || mode == DDmode)
13767 *cost += extra_cost->ldst.loadd;
13769 *cost +=
13770 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13771 0, speed));
13774 return true;
13776 case NEG:
13777 op0 = XEXP (x, 0);
13779 if (VECTOR_MODE_P (mode))
13781 if (speed)
13783 /* FNEG. */
13784 *cost += extra_cost->vect.alu;
13786 return false;
13789 if (GET_MODE_CLASS (mode) == MODE_INT)
13791 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
13792 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
13794 /* CSETM. */
13795 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
13796 return true;
13799 /* Cost this as SUB wzr, X. */
13800 op0 = CONST0_RTX (mode);
13801 op1 = XEXP (x, 0);
13802 goto cost_minus;
13805 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13807 /* Support (neg(fma...)) as a single instruction only if
13808 sign of zeros is unimportant. This matches the decision
13809 making in aarch64.md. */
13810 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
13812 /* FNMADD. */
13813 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13814 return true;
13816 if (GET_CODE (op0) == MULT)
13818 /* FNMUL. */
13819 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13820 return true;
13822 if (speed)
13823 /* FNEG. */
13824 *cost += extra_cost->fp[mode == DFmode].neg;
13825 return false;
13828 return false;
13830 case CLRSB:
13831 case CLZ:
13832 if (speed)
13834 if (VECTOR_MODE_P (mode))
13835 *cost += extra_cost->vect.alu;
13836 else
13837 *cost += extra_cost->alu.clz;
13840 return false;
13842 case CTZ:
13843 *cost = COSTS_N_INSNS (2);
13845 if (speed)
13846 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
13847 return false;
13849 case COMPARE:
13850 op0 = XEXP (x, 0);
13851 op1 = XEXP (x, 1);
13853 if (op1 == const0_rtx
13854 && GET_CODE (op0) == AND)
13856 x = op0;
13857 mode = GET_MODE (op0);
13858 goto cost_logic;
13861 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
13863 /* TODO: A write to the CC flags possibly costs extra, this
13864 needs encoding in the cost tables. */
13866 mode = GET_MODE (op0);
13867 /* ANDS. */
13868 if (GET_CODE (op0) == AND)
13870 x = op0;
13871 goto cost_logic;
13874 if (GET_CODE (op0) == PLUS)
13876 /* ADDS (and CMN alias). */
13877 x = op0;
13878 goto cost_plus;
13881 if (GET_CODE (op0) == MINUS)
13883 /* SUBS. */
13884 x = op0;
13885 goto cost_minus;
13888 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
13889 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
13890 && CONST_INT_P (XEXP (op0, 2)))
13892 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
13893 Handle it here directly rather than going to cost_logic
13894 since we know the immediate generated for the TST is valid
13895 so we can avoid creating an intermediate rtx for it only
13896 for costing purposes. */
13897 if (speed)
13898 *cost += extra_cost->alu.logical;
13900 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
13901 ZERO_EXTRACT, 0, speed);
13902 return true;
13905 if (GET_CODE (op1) == NEG)
13907 /* CMN. */
13908 if (speed)
13909 *cost += extra_cost->alu.arith;
13911 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
13912 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
13913 return true;
13916 /* CMP.
13918 Compare can freely swap the order of operands, and
13919 canonicalization puts the more complex operation first.
13920 But the integer MINUS logic expects the shift/extend
13921 operation in op1. */
13922 if (! (REG_P (op0)
13923 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
13925 op0 = XEXP (x, 1);
13926 op1 = XEXP (x, 0);
13928 goto cost_minus;
13931 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
13933 /* FCMP. */
13934 if (speed)
13935 *cost += extra_cost->fp[mode == DFmode].compare;
13937 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
13939 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
13940 /* FCMP supports constant 0.0 for no extra cost. */
13941 return true;
13943 return false;
13946 if (VECTOR_MODE_P (mode))
13948 /* Vector compare. */
13949 if (speed)
13950 *cost += extra_cost->vect.alu;
13952 if (aarch64_float_const_zero_rtx_p (op1))
13954 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
13955 cost. */
13956 return true;
13958 return false;
13960 return false;
13962 case MINUS:
13964 op0 = XEXP (x, 0);
13965 op1 = XEXP (x, 1);
13967 cost_minus:
13968 if (VECTOR_MODE_P (mode))
13970 /* SUBL2 and SUBW2. */
13971 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13972 if (vec_flags & VEC_ADVSIMD)
13974 /* The select-operand-high-half versions of the sub instruction
13975 have the same cost as the regular three vector version -
13976 don't add the costs of the select into the costs of the sub.
13978 op0 = aarch64_strip_extend_vec_half (op0);
13979 op1 = aarch64_strip_extend_vec_half (op1);
13983 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
13985 /* Detect valid immediates. */
13986 if ((GET_MODE_CLASS (mode) == MODE_INT
13987 || (GET_MODE_CLASS (mode) == MODE_CC
13988 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
13989 && CONST_INT_P (op1)
13990 && aarch64_uimm12_shift (INTVAL (op1)))
13992 if (speed)
13993 /* SUB(S) (immediate). */
13994 *cost += extra_cost->alu.arith;
13995 return true;
13998 /* Look for SUB (extended register). */
13999 if (is_a <scalar_int_mode> (mode)
14000 && aarch64_rtx_arith_op_extract_p (op1))
14002 if (speed)
14003 *cost += extra_cost->alu.extend_arith;
14005 op1 = aarch64_strip_extend (op1, true);
14006 *cost += rtx_cost (op1, VOIDmode,
14007 (enum rtx_code) GET_CODE (op1), 0, speed);
14008 return true;
14011 rtx new_op1 = aarch64_strip_extend (op1, false);
14013 /* Cost this as an FMA-alike operation. */
14014 if ((GET_CODE (new_op1) == MULT
14015 || aarch64_shift_p (GET_CODE (new_op1)))
14016 && code != COMPARE)
14018 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14019 (enum rtx_code) code,
14020 speed);
14021 return true;
14024 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14026 if (speed)
14028 if (VECTOR_MODE_P (mode))
14030 /* Vector SUB. */
14031 *cost += extra_cost->vect.alu;
14033 else if (GET_MODE_CLASS (mode) == MODE_INT)
14035 /* SUB(S). */
14036 *cost += extra_cost->alu.arith;
14038 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14040 /* FSUB. */
14041 *cost += extra_cost->fp[mode == DFmode].addsub;
14044 return true;
14047 case PLUS:
14049 rtx new_op0;
14051 op0 = XEXP (x, 0);
14052 op1 = XEXP (x, 1);
14054 cost_plus:
14055 if (VECTOR_MODE_P (mode))
14057 /* ADDL2 and ADDW2. */
14058 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14059 if (vec_flags & VEC_ADVSIMD)
14061 /* The select-operand-high-half versions of the add instruction
14062 have the same cost as the regular three vector version -
14063 don't add the costs of the select into the costs of the add.
14065 op0 = aarch64_strip_extend_vec_half (op0);
14066 op1 = aarch64_strip_extend_vec_half (op1);
14070 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14071 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14073 /* CSINC. */
14074 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14075 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14076 return true;
14079 if (GET_MODE_CLASS (mode) == MODE_INT
14080 && (aarch64_plus_immediate (op1, mode)
14081 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14083 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14085 if (speed)
14087 /* ADD (immediate). */
14088 *cost += extra_cost->alu.arith;
14090 /* Some tunings prefer to not use the VL-based scalar ops.
14091 Increase the cost of the poly immediate to prevent their
14092 formation. */
14093 if (GET_CODE (op1) == CONST_POLY_INT
14094 && (aarch64_tune_params.extra_tuning_flags
14095 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14096 *cost += COSTS_N_INSNS (1);
14098 return true;
14101 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14103 /* Look for ADD (extended register). */
14104 if (is_a <scalar_int_mode> (mode)
14105 && aarch64_rtx_arith_op_extract_p (op0))
14107 if (speed)
14108 *cost += extra_cost->alu.extend_arith;
14110 op0 = aarch64_strip_extend (op0, true);
14111 *cost += rtx_cost (op0, VOIDmode,
14112 (enum rtx_code) GET_CODE (op0), 0, speed);
14113 return true;
14116 /* Strip any extend, leave shifts behind as we will
14117 cost them through mult_cost. */
14118 new_op0 = aarch64_strip_extend (op0, false);
14120 if (GET_CODE (new_op0) == MULT
14121 || aarch64_shift_p (GET_CODE (new_op0)))
14123 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14124 speed);
14125 return true;
14128 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14130 if (speed)
14132 if (VECTOR_MODE_P (mode))
14134 /* Vector ADD. */
14135 *cost += extra_cost->vect.alu;
14137 else if (GET_MODE_CLASS (mode) == MODE_INT)
14139 /* ADD. */
14140 *cost += extra_cost->alu.arith;
14142 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14144 /* FADD. */
14145 *cost += extra_cost->fp[mode == DFmode].addsub;
14148 return true;
14151 case BSWAP:
14152 *cost = COSTS_N_INSNS (1);
14154 if (speed)
14156 if (VECTOR_MODE_P (mode))
14157 *cost += extra_cost->vect.alu;
14158 else
14159 *cost += extra_cost->alu.rev;
14161 return false;
14163 case IOR:
14164 if (aarch_rev16_p (x))
14166 *cost = COSTS_N_INSNS (1);
14168 if (speed)
14170 if (VECTOR_MODE_P (mode))
14171 *cost += extra_cost->vect.alu;
14172 else
14173 *cost += extra_cost->alu.rev;
14175 return true;
14178 if (aarch64_extr_rtx_p (x, &op0, &op1))
14180 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14181 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14182 if (speed)
14183 *cost += extra_cost->alu.shift;
14185 return true;
14187 /* Fall through. */
14188 case XOR:
14189 case AND:
14190 cost_logic:
14191 op0 = XEXP (x, 0);
14192 op1 = XEXP (x, 1);
14194 if (VECTOR_MODE_P (mode))
14196 if (speed)
14197 *cost += extra_cost->vect.alu;
14198 return true;
14201 if (code == AND
14202 && GET_CODE (op0) == MULT
14203 && CONST_INT_P (XEXP (op0, 1))
14204 && CONST_INT_P (op1)
14205 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14206 INTVAL (op1)) != 0)
14208 /* This is a UBFM/SBFM. */
14209 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14210 if (speed)
14211 *cost += extra_cost->alu.bfx;
14212 return true;
14215 if (is_int_mode (mode, &int_mode))
14217 if (CONST_INT_P (op1))
14219 /* We have a mask + shift version of a UBFIZ
14220 i.e. the *andim_ashift<mode>_bfiz pattern. */
14221 if (GET_CODE (op0) == ASHIFT
14222 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14223 XEXP (op0, 1)))
14225 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14226 (enum rtx_code) code, 0, speed);
14227 if (speed)
14228 *cost += extra_cost->alu.bfx;
14230 return true;
14232 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14234 /* We possibly get the immediate for free, this is not
14235 modelled. */
14236 *cost += rtx_cost (op0, int_mode,
14237 (enum rtx_code) code, 0, speed);
14238 if (speed)
14239 *cost += extra_cost->alu.logical;
14241 return true;
14244 else
14246 rtx new_op0 = op0;
14248 /* Handle ORN, EON, or BIC. */
14249 if (GET_CODE (op0) == NOT)
14250 op0 = XEXP (op0, 0);
14252 new_op0 = aarch64_strip_shift (op0);
14254 /* If we had a shift on op0 then this is a logical-shift-
14255 by-register/immediate operation. Otherwise, this is just
14256 a logical operation. */
14257 if (speed)
14259 if (new_op0 != op0)
14261 /* Shift by immediate. */
14262 if (CONST_INT_P (XEXP (op0, 1)))
14263 *cost += extra_cost->alu.log_shift;
14264 else
14265 *cost += extra_cost->alu.log_shift_reg;
14267 else
14268 *cost += extra_cost->alu.logical;
14271 /* In both cases we want to cost both operands. */
14272 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14273 0, speed);
14274 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14275 1, speed);
14277 return true;
14280 return false;
14282 case NOT:
14283 x = XEXP (x, 0);
14284 op0 = aarch64_strip_shift (x);
14286 if (VECTOR_MODE_P (mode))
14288 /* Vector NOT. */
14289 *cost += extra_cost->vect.alu;
14290 return false;
14293 /* MVN-shifted-reg. */
14294 if (op0 != x)
14296 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14298 if (speed)
14299 *cost += extra_cost->alu.log_shift;
14301 return true;
14303 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14304 Handle the second form here taking care that 'a' in the above can
14305 be a shift. */
14306 else if (GET_CODE (op0) == XOR)
14308 rtx newop0 = XEXP (op0, 0);
14309 rtx newop1 = XEXP (op0, 1);
14310 rtx op0_stripped = aarch64_strip_shift (newop0);
14312 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14313 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14315 if (speed)
14317 if (op0_stripped != newop0)
14318 *cost += extra_cost->alu.log_shift;
14319 else
14320 *cost += extra_cost->alu.logical;
14323 return true;
14325 /* MVN. */
14326 if (speed)
14327 *cost += extra_cost->alu.logical;
14329 return false;
14331 case ZERO_EXTEND:
14333 op0 = XEXP (x, 0);
14334 /* If a value is written in SI mode, then zero extended to DI
14335 mode, the operation will in general be free as a write to
14336 a 'w' register implicitly zeroes the upper bits of an 'x'
14337 register. However, if this is
14339 (set (reg) (zero_extend (reg)))
14341 we must cost the explicit register move. */
14342 if (mode == DImode
14343 && GET_MODE (op0) == SImode)
14345 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14347 /* If OP_COST is non-zero, then the cost of the zero extend
14348 is effectively the cost of the inner operation. Otherwise
14349 we have a MOV instruction and we take the cost from the MOV
14350 itself. This is true independently of whether we are
14351 optimizing for space or time. */
14352 if (op_cost)
14353 *cost = op_cost;
14355 return true;
14357 else if (MEM_P (op0))
14359 /* All loads can zero extend to any size for free. */
14360 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14361 return true;
14364 op0 = aarch64_extend_bitfield_pattern_p (x);
14365 if (op0)
14367 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14368 if (speed)
14369 *cost += extra_cost->alu.bfx;
14370 return true;
14373 if (speed)
14375 if (VECTOR_MODE_P (mode))
14377 /* UMOV. */
14378 *cost += extra_cost->vect.alu;
14380 else
14382 /* We generate an AND instead of UXTB/UXTH. */
14383 *cost += extra_cost->alu.logical;
14386 return false;
14388 case SIGN_EXTEND:
14389 if (MEM_P (XEXP (x, 0)))
14391 /* LDRSH. */
14392 if (speed)
14394 rtx address = XEXP (XEXP (x, 0), 0);
14395 *cost += extra_cost->ldst.load_sign_extend;
14397 *cost +=
14398 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14399 0, speed));
14401 return true;
14404 op0 = aarch64_extend_bitfield_pattern_p (x);
14405 if (op0)
14407 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14408 if (speed)
14409 *cost += extra_cost->alu.bfx;
14410 return true;
14413 if (speed)
14415 if (VECTOR_MODE_P (mode))
14416 *cost += extra_cost->vect.alu;
14417 else
14418 *cost += extra_cost->alu.extend;
14420 return false;
14422 case ASHIFT:
14423 op0 = XEXP (x, 0);
14424 op1 = XEXP (x, 1);
14426 if (CONST_INT_P (op1))
14428 if (speed)
14430 if (VECTOR_MODE_P (mode))
14432 /* Vector shift (immediate). */
14433 *cost += extra_cost->vect.alu;
14435 else
14437 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
14438 aliases. */
14439 *cost += extra_cost->alu.shift;
14443 /* We can incorporate zero/sign extend for free. */
14444 if (GET_CODE (op0) == ZERO_EXTEND
14445 || GET_CODE (op0) == SIGN_EXTEND)
14446 op0 = XEXP (op0, 0);
14448 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14449 return true;
14451 else
14453 if (VECTOR_MODE_P (mode))
14455 if (speed)
14456 /* Vector shift (register). */
14457 *cost += extra_cost->vect.alu;
14459 else
14461 if (speed)
14462 /* LSLV. */
14463 *cost += extra_cost->alu.shift_reg;
14465 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14466 && CONST_INT_P (XEXP (op1, 1))
14467 && known_eq (INTVAL (XEXP (op1, 1)),
14468 GET_MODE_BITSIZE (mode) - 1))
14470 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14471 /* We already demanded XEXP (op1, 0) to be REG_P, so
14472 don't recurse into it. */
14473 return true;
14476 return false; /* All arguments need to be in registers. */
14479 case ROTATE:
14480 case ROTATERT:
14481 case LSHIFTRT:
14482 case ASHIFTRT:
14483 op0 = XEXP (x, 0);
14484 op1 = XEXP (x, 1);
14486 if (CONST_INT_P (op1))
14488 /* ASR (immediate) and friends. */
14489 if (speed)
14491 if (VECTOR_MODE_P (mode))
14492 *cost += extra_cost->vect.alu;
14493 else
14494 *cost += extra_cost->alu.shift;
14497 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14498 return true;
14500 else
14502 if (VECTOR_MODE_P (mode))
14504 if (speed)
14505 /* Vector shift (register). */
14506 *cost += extra_cost->vect.alu;
14508 else
14510 if (speed)
14511 /* ASR (register) and friends. */
14512 *cost += extra_cost->alu.shift_reg;
14514 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14515 && CONST_INT_P (XEXP (op1, 1))
14516 && known_eq (INTVAL (XEXP (op1, 1)),
14517 GET_MODE_BITSIZE (mode) - 1))
14519 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14520 /* We already demanded XEXP (op1, 0) to be REG_P, so
14521 don't recurse into it. */
14522 return true;
14525 return false; /* All arguments need to be in registers. */
14528 case SYMBOL_REF:
14530 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14531 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14533 /* LDR. */
14534 if (speed)
14535 *cost += extra_cost->ldst.load;
14537 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14538 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14540 /* ADRP, followed by ADD. */
14541 *cost += COSTS_N_INSNS (1);
14542 if (speed)
14543 *cost += 2 * extra_cost->alu.arith;
14545 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14546 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14548 /* ADR. */
14549 if (speed)
14550 *cost += extra_cost->alu.arith;
14553 if (flag_pic)
14555 /* One extra load instruction, after accessing the GOT. */
14556 *cost += COSTS_N_INSNS (1);
14557 if (speed)
14558 *cost += extra_cost->ldst.load;
14560 return true;
14562 case HIGH:
14563 case LO_SUM:
14564 /* ADRP/ADD (immediate). */
14565 if (speed)
14566 *cost += extra_cost->alu.arith;
14567 return true;
14569 case ZERO_EXTRACT:
14570 case SIGN_EXTRACT:
14571 /* UBFX/SBFX. */
14572 if (speed)
14574 if (VECTOR_MODE_P (mode))
14575 *cost += extra_cost->vect.alu;
14576 else
14577 *cost += extra_cost->alu.bfx;
14580 /* We can trust that the immediates used will be correct (there
14581 are no by-register forms), so we need only cost op0. */
14582 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14583 return true;
14585 case MULT:
14586 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14587 /* aarch64_rtx_mult_cost always handles recursion to its
14588 operands. */
14589 return true;
14591 case MOD:
14592 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14593 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
14594 an unconditional negate. This case should only ever be reached through
14595 the set_smod_pow2_cheap check in expmed.cc. */
14596 if (CONST_INT_P (XEXP (x, 1))
14597 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14598 && (mode == SImode || mode == DImode))
14600 /* We expand to 4 instructions. Reset the baseline. */
14601 *cost = COSTS_N_INSNS (4);
14603 if (speed)
14604 *cost += 2 * extra_cost->alu.logical
14605 + 2 * extra_cost->alu.arith;
14607 return true;
14610 /* Fall-through. */
14611 case UMOD:
14612 if (speed)
14614 /* Slighly prefer UMOD over SMOD. */
14615 if (VECTOR_MODE_P (mode))
14616 *cost += extra_cost->vect.alu;
14617 else if (GET_MODE_CLASS (mode) == MODE_INT)
14618 *cost += (extra_cost->mult[mode == DImode].add
14619 + extra_cost->mult[mode == DImode].idiv
14620 + (code == MOD ? 1 : 0));
14622 return false; /* All arguments need to be in registers. */
14624 case DIV:
14625 case UDIV:
14626 case SQRT:
14627 if (speed)
14629 if (VECTOR_MODE_P (mode))
14630 *cost += extra_cost->vect.alu;
14631 else if (GET_MODE_CLASS (mode) == MODE_INT)
14632 /* There is no integer SQRT, so only DIV and UDIV can get
14633 here. */
14634 *cost += (extra_cost->mult[mode == DImode].idiv
14635 /* Slighly prefer UDIV over SDIV. */
14636 + (code == DIV ? 1 : 0));
14637 else
14638 *cost += extra_cost->fp[mode == DFmode].div;
14640 return false; /* All arguments need to be in registers. */
14642 case IF_THEN_ELSE:
14643 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14644 XEXP (x, 2), cost, speed);
14646 case EQ:
14647 case NE:
14648 case GT:
14649 case GTU:
14650 case LT:
14651 case LTU:
14652 case GE:
14653 case GEU:
14654 case LE:
14655 case LEU:
14657 return false; /* All arguments must be in registers. */
14659 case FMA:
14660 op0 = XEXP (x, 0);
14661 op1 = XEXP (x, 1);
14662 op2 = XEXP (x, 2);
14664 if (speed)
14666 if (VECTOR_MODE_P (mode))
14667 *cost += extra_cost->vect.alu;
14668 else
14669 *cost += extra_cost->fp[mode == DFmode].fma;
14672 /* FMSUB, FNMADD, and FNMSUB are free. */
14673 if (GET_CODE (op0) == NEG)
14674 op0 = XEXP (op0, 0);
14676 if (GET_CODE (op2) == NEG)
14677 op2 = XEXP (op2, 0);
14679 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14680 and the by-element operand as operand 0. */
14681 if (GET_CODE (op1) == NEG)
14682 op1 = XEXP (op1, 0);
14684 /* Catch vector-by-element operations. The by-element operand can
14685 either be (vec_duplicate (vec_select (x))) or just
14686 (vec_select (x)), depending on whether we are multiplying by
14687 a vector or a scalar.
14689 Canonicalization is not very good in these cases, FMA4 will put the
14690 by-element operand as operand 0, FNMA4 will have it as operand 1. */
14691 if (GET_CODE (op0) == VEC_DUPLICATE)
14692 op0 = XEXP (op0, 0);
14693 else if (GET_CODE (op1) == VEC_DUPLICATE)
14694 op1 = XEXP (op1, 0);
14696 if (GET_CODE (op0) == VEC_SELECT)
14697 op0 = XEXP (op0, 0);
14698 else if (GET_CODE (op1) == VEC_SELECT)
14699 op1 = XEXP (op1, 0);
14701 /* If the remaining parameters are not registers,
14702 get the cost to put them into registers. */
14703 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14704 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14705 *cost += rtx_cost (op2, mode, FMA, 2, speed);
14706 return true;
14708 case FLOAT:
14709 case UNSIGNED_FLOAT:
14710 if (speed)
14711 *cost += extra_cost->fp[mode == DFmode].fromint;
14712 return false;
14714 case FLOAT_EXTEND:
14715 if (speed)
14717 if (VECTOR_MODE_P (mode))
14719 /*Vector truncate. */
14720 *cost += extra_cost->vect.alu;
14722 else
14723 *cost += extra_cost->fp[mode == DFmode].widen;
14725 return false;
14727 case FLOAT_TRUNCATE:
14728 if (speed)
14730 if (VECTOR_MODE_P (mode))
14732 /*Vector conversion. */
14733 *cost += extra_cost->vect.alu;
14735 else
14736 *cost += extra_cost->fp[mode == DFmode].narrow;
14738 return false;
14740 case FIX:
14741 case UNSIGNED_FIX:
14742 x = XEXP (x, 0);
14743 /* Strip the rounding part. They will all be implemented
14744 by the fcvt* family of instructions anyway. */
14745 if (GET_CODE (x) == UNSPEC)
14747 unsigned int uns_code = XINT (x, 1);
14749 if (uns_code == UNSPEC_FRINTA
14750 || uns_code == UNSPEC_FRINTM
14751 || uns_code == UNSPEC_FRINTN
14752 || uns_code == UNSPEC_FRINTP
14753 || uns_code == UNSPEC_FRINTZ)
14754 x = XVECEXP (x, 0, 0);
14757 if (speed)
14759 if (VECTOR_MODE_P (mode))
14760 *cost += extra_cost->vect.alu;
14761 else
14762 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
14765 /* We can combine fmul by a power of 2 followed by a fcvt into a single
14766 fixed-point fcvt. */
14767 if (GET_CODE (x) == MULT
14768 && ((VECTOR_MODE_P (mode)
14769 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
14770 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
14772 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
14773 0, speed);
14774 return true;
14777 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
14778 return true;
14780 case ABS:
14781 if (VECTOR_MODE_P (mode))
14783 /* ABS (vector). */
14784 if (speed)
14785 *cost += extra_cost->vect.alu;
14787 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14789 op0 = XEXP (x, 0);
14791 /* FABD, which is analogous to FADD. */
14792 if (GET_CODE (op0) == MINUS)
14794 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
14795 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
14796 if (speed)
14797 *cost += extra_cost->fp[mode == DFmode].addsub;
14799 return true;
14801 /* Simple FABS is analogous to FNEG. */
14802 if (speed)
14803 *cost += extra_cost->fp[mode == DFmode].neg;
14805 else
14807 /* Integer ABS will either be split to
14808 two arithmetic instructions, or will be an ABS
14809 (scalar), which we don't model. */
14810 *cost = COSTS_N_INSNS (2);
14811 if (speed)
14812 *cost += 2 * extra_cost->alu.arith;
14814 return false;
14816 case SMAX:
14817 case SMIN:
14818 if (speed)
14820 if (VECTOR_MODE_P (mode))
14821 *cost += extra_cost->vect.alu;
14822 else
14824 /* FMAXNM/FMINNM/FMAX/FMIN.
14825 TODO: This may not be accurate for all implementations, but
14826 we do not model this in the cost tables. */
14827 *cost += extra_cost->fp[mode == DFmode].addsub;
14830 return false;
14832 case UNSPEC:
14833 /* The floating point round to integer frint* instructions. */
14834 if (aarch64_frint_unspec_p (XINT (x, 1)))
14836 if (speed)
14837 *cost += extra_cost->fp[mode == DFmode].roundint;
14839 return false;
14842 if (XINT (x, 1) == UNSPEC_RBIT)
14844 if (speed)
14845 *cost += extra_cost->alu.rev;
14847 return false;
14849 break;
14851 case TRUNCATE:
14853 /* Decompose <su>muldi3_highpart. */
14854 if (/* (truncate:DI */
14855 mode == DImode
14856 /* (lshiftrt:TI */
14857 && GET_MODE (XEXP (x, 0)) == TImode
14858 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
14859 /* (mult:TI */
14860 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
14861 /* (ANY_EXTEND:TI (reg:DI))
14862 (ANY_EXTEND:TI (reg:DI))) */
14863 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
14864 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
14865 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
14866 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
14867 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
14868 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
14869 /* (const_int 64) */
14870 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
14871 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
14873 /* UMULH/SMULH. */
14874 if (speed)
14875 *cost += extra_cost->mult[mode == DImode].extend;
14876 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
14877 mode, MULT, 0, speed);
14878 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
14879 mode, MULT, 1, speed);
14880 return true;
14882 break;
14883 case CONST_VECTOR:
14885 /* Load using MOVI/MVNI. */
14886 if (aarch64_simd_valid_immediate (x, NULL))
14887 *cost = extra_cost->vect.movi;
14888 else /* Load using constant pool. */
14889 *cost = extra_cost->ldst.load;
14890 break;
14892 case VEC_CONCAT:
14893 /* depending on the operation, either DUP or INS.
14894 For now, keep default costing. */
14895 break;
14896 case VEC_DUPLICATE:
14897 /* Load using a DUP. */
14898 *cost = extra_cost->vect.dup;
14899 return false;
14900 case VEC_SELECT:
14902 rtx op0 = XEXP (x, 0);
14903 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
14905 /* cost subreg of 0 as free, otherwise as DUP */
14906 rtx op1 = XEXP (x, 1);
14907 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
14909 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
14910 *cost = extra_cost->vect.dup;
14911 else
14912 *cost = extra_cost->vect.extract;
14913 return true;
14915 default:
14916 break;
14919 if (dump_file
14920 && flag_aarch64_verbose_cost)
14921 fprintf (dump_file,
14922 "\nFailed to cost RTX. Assuming default cost.\n");
14924 return true;
14927 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
14928 calculated for X. This cost is stored in *COST. Returns true
14929 if the total cost of X was calculated. */
14930 static bool
14931 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
14932 int param, int *cost, bool speed)
14934 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
14936 if (dump_file
14937 && flag_aarch64_verbose_cost)
14939 print_rtl_single (dump_file, x);
14940 fprintf (dump_file, "\n%s cost: %d (%s)\n",
14941 speed ? "Hot" : "Cold",
14942 *cost, result ? "final" : "partial");
14945 return result;
14948 static int
14949 aarch64_register_move_cost (machine_mode mode,
14950 reg_class_t from_i, reg_class_t to_i)
14952 enum reg_class from = (enum reg_class) from_i;
14953 enum reg_class to = (enum reg_class) to_i;
14954 const struct cpu_regmove_cost *regmove_cost
14955 = aarch64_tune_params.regmove_cost;
14957 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
14958 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
14959 || to == STUB_REGS)
14960 to = GENERAL_REGS;
14962 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
14963 || from == STUB_REGS)
14964 from = GENERAL_REGS;
14966 /* Make RDFFR very expensive. In particular, if we know that the FFR
14967 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
14968 as a way of obtaining a PTRUE. */
14969 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
14970 && hard_reg_set_subset_p (reg_class_contents[from_i],
14971 reg_class_contents[FFR_REGS]))
14972 return 80;
14974 /* Moving between GPR and stack cost is the same as GP2GP. */
14975 if ((from == GENERAL_REGS && to == STACK_REG)
14976 || (to == GENERAL_REGS && from == STACK_REG))
14977 return regmove_cost->GP2GP;
14979 /* To/From the stack register, we move via the gprs. */
14980 if (to == STACK_REG || from == STACK_REG)
14981 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
14982 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
14984 if (known_eq (GET_MODE_SIZE (mode), 16))
14986 /* 128-bit operations on general registers require 2 instructions. */
14987 if (from == GENERAL_REGS && to == GENERAL_REGS)
14988 return regmove_cost->GP2GP * 2;
14989 else if (from == GENERAL_REGS)
14990 return regmove_cost->GP2FP * 2;
14991 else if (to == GENERAL_REGS)
14992 return regmove_cost->FP2GP * 2;
14994 /* When AdvSIMD instructions are disabled it is not possible to move
14995 a 128-bit value directly between Q registers. This is handled in
14996 secondary reload. A general register is used as a scratch to move
14997 the upper DI value and the lower DI value is moved directly,
14998 hence the cost is the sum of three moves. */
14999 if (! TARGET_SIMD)
15000 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15002 return regmove_cost->FP2FP;
15005 if (from == GENERAL_REGS && to == GENERAL_REGS)
15006 return regmove_cost->GP2GP;
15007 else if (from == GENERAL_REGS)
15008 return regmove_cost->GP2FP;
15009 else if (to == GENERAL_REGS)
15010 return regmove_cost->FP2GP;
15012 return regmove_cost->FP2FP;
15015 /* Implements TARGET_MEMORY_MOVE_COST. */
15016 static int
15017 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15019 enum reg_class rclass = (enum reg_class) rclass_i;
15020 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15021 ? reg_classes_intersect_p (rclass, PR_REGS)
15022 : reg_class_subset_p (rclass, PR_REGS))
15023 return (in
15024 ? aarch64_tune_params.memmov_cost.load_pred
15025 : aarch64_tune_params.memmov_cost.store_pred);
15027 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15028 ? reg_classes_intersect_p (rclass, FP_REGS)
15029 : reg_class_subset_p (rclass, FP_REGS))
15030 return (in
15031 ? aarch64_tune_params.memmov_cost.load_fp
15032 : aarch64_tune_params.memmov_cost.store_fp);
15034 return (in
15035 ? aarch64_tune_params.memmov_cost.load_int
15036 : aarch64_tune_params.memmov_cost.store_int);
15039 /* Implement TARGET_INIT_BUILTINS. */
15040 static void
15041 aarch64_init_builtins ()
15043 aarch64_general_init_builtins ();
15044 aarch64_sve::init_builtins ();
15045 #ifdef SUBTARGET_INIT_BUILTINS
15046 SUBTARGET_INIT_BUILTINS;
15047 #endif
15050 /* Implement TARGET_FOLD_BUILTIN. */
15051 static tree
15052 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15054 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15055 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15056 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15057 switch (code & AARCH64_BUILTIN_CLASS)
15059 case AARCH64_BUILTIN_GENERAL:
15060 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15062 case AARCH64_BUILTIN_SVE:
15063 return NULL_TREE;
15065 gcc_unreachable ();
15068 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15069 static bool
15070 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15072 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15073 tree fndecl = gimple_call_fndecl (stmt);
15074 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15075 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15076 gimple *new_stmt = NULL;
15077 switch (code & AARCH64_BUILTIN_CLASS)
15079 case AARCH64_BUILTIN_GENERAL:
15080 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15081 break;
15083 case AARCH64_BUILTIN_SVE:
15084 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15085 break;
15088 if (!new_stmt)
15089 return false;
15091 gsi_replace (gsi, new_stmt, true);
15092 return true;
15095 /* Implement TARGET_EXPAND_BUILTIN. */
15096 static rtx
15097 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15099 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15100 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15101 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15102 switch (code & AARCH64_BUILTIN_CLASS)
15104 case AARCH64_BUILTIN_GENERAL:
15105 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15107 case AARCH64_BUILTIN_SVE:
15108 return aarch64_sve::expand_builtin (subcode, exp, target);
15110 gcc_unreachable ();
15113 /* Implement TARGET_BUILTIN_DECL. */
15114 static tree
15115 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15117 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15118 switch (code & AARCH64_BUILTIN_CLASS)
15120 case AARCH64_BUILTIN_GENERAL:
15121 return aarch64_general_builtin_decl (subcode, initialize_p);
15123 case AARCH64_BUILTIN_SVE:
15124 return aarch64_sve::builtin_decl (subcode, initialize_p);
15126 gcc_unreachable ();
15129 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15130 to optimize 1.0/sqrt. */
15132 static bool
15133 use_rsqrt_p (machine_mode mode)
15135 return (!flag_trapping_math
15136 && flag_unsafe_math_optimizations
15137 && ((aarch64_tune_params.approx_modes->recip_sqrt
15138 & AARCH64_APPROX_MODE (mode))
15139 || flag_mrecip_low_precision_sqrt));
15142 /* Function to decide when to use the approximate reciprocal square root
15143 builtin. */
15145 static tree
15146 aarch64_builtin_reciprocal (tree fndecl)
15148 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15150 if (!use_rsqrt_p (mode))
15151 return NULL_TREE;
15152 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15153 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15154 switch (code & AARCH64_BUILTIN_CLASS)
15156 case AARCH64_BUILTIN_GENERAL:
15157 return aarch64_general_builtin_rsqrt (subcode);
15159 case AARCH64_BUILTIN_SVE:
15160 return NULL_TREE;
15162 gcc_unreachable ();
15165 /* Emit code to perform the floating-point operation:
15167 DST = SRC1 * SRC2
15169 where all three operands are already known to be registers.
15170 If the operation is an SVE one, PTRUE is a suitable all-true
15171 predicate. */
15173 static void
15174 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15176 if (ptrue)
15177 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15178 dst, ptrue, src1, src2,
15179 gen_int_mode (SVE_RELAXED_GP, SImode)));
15180 else
15181 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15184 /* Emit instruction sequence to compute either the approximate square root
15185 or its approximate reciprocal, depending on the flag RECP, and return
15186 whether the sequence was emitted or not. */
15188 bool
15189 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15191 machine_mode mode = GET_MODE (dst);
15193 if (GET_MODE_INNER (mode) == HFmode)
15195 gcc_assert (!recp);
15196 return false;
15199 if (!recp)
15201 if (!(flag_mlow_precision_sqrt
15202 || (aarch64_tune_params.approx_modes->sqrt
15203 & AARCH64_APPROX_MODE (mode))))
15204 return false;
15206 if (!flag_finite_math_only
15207 || flag_trapping_math
15208 || !flag_unsafe_math_optimizations
15209 || optimize_function_for_size_p (cfun))
15210 return false;
15212 else
15213 /* Caller assumes we cannot fail. */
15214 gcc_assert (use_rsqrt_p (mode));
15216 rtx pg = NULL_RTX;
15217 if (aarch64_sve_mode_p (mode))
15218 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15219 machine_mode mmsk = (VECTOR_MODE_P (mode)
15220 ? related_int_vector_mode (mode).require ()
15221 : int_mode_for_mode (mode).require ());
15222 rtx xmsk = NULL_RTX;
15223 if (!recp)
15225 /* When calculating the approximate square root, compare the
15226 argument with 0.0 and create a mask. */
15227 rtx zero = CONST0_RTX (mode);
15228 if (pg)
15230 xmsk = gen_reg_rtx (GET_MODE (pg));
15231 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15232 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15233 xmsk, pg, hint, src, zero));
15235 else
15237 xmsk = gen_reg_rtx (mmsk);
15238 emit_insn (gen_rtx_SET (xmsk,
15239 gen_rtx_NEG (mmsk,
15240 gen_rtx_EQ (mmsk, src, zero))));
15244 /* Estimate the approximate reciprocal square root. */
15245 rtx xdst = gen_reg_rtx (mode);
15246 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15248 /* Iterate over the series twice for SF and thrice for DF. */
15249 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15251 /* Optionally iterate over the series once less for faster performance
15252 while sacrificing the accuracy. */
15253 if ((recp && flag_mrecip_low_precision_sqrt)
15254 || (!recp && flag_mlow_precision_sqrt))
15255 iterations--;
15257 /* Iterate over the series to calculate the approximate reciprocal square
15258 root. */
15259 rtx x1 = gen_reg_rtx (mode);
15260 while (iterations--)
15262 rtx x2 = gen_reg_rtx (mode);
15263 aarch64_emit_mult (x2, pg, xdst, xdst);
15265 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15267 if (iterations > 0)
15268 aarch64_emit_mult (xdst, pg, xdst, x1);
15271 if (!recp)
15273 if (pg)
15274 /* Multiply nonzero source values by the corresponding intermediate
15275 result elements, so that the final calculation is the approximate
15276 square root rather than its reciprocal. Select a zero result for
15277 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15278 otherwise. */
15279 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15280 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15281 else
15283 /* Qualify the approximate reciprocal square root when the
15284 argument is 0.0 by squashing the intermediary result to 0.0. */
15285 rtx xtmp = gen_reg_rtx (mmsk);
15286 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15287 gen_rtx_SUBREG (mmsk, xdst, 0)));
15288 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15290 /* Calculate the approximate square root. */
15291 aarch64_emit_mult (xdst, pg, xdst, src);
15295 /* Finalize the approximation. */
15296 aarch64_emit_mult (dst, pg, xdst, x1);
15298 return true;
15301 /* Emit the instruction sequence to compute the approximation for the division
15302 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
15304 bool
15305 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15307 machine_mode mode = GET_MODE (quo);
15309 if (GET_MODE_INNER (mode) == HFmode)
15310 return false;
15312 bool use_approx_division_p = (flag_mlow_precision_div
15313 || (aarch64_tune_params.approx_modes->division
15314 & AARCH64_APPROX_MODE (mode)));
15316 if (!flag_finite_math_only
15317 || flag_trapping_math
15318 || !flag_unsafe_math_optimizations
15319 || optimize_function_for_size_p (cfun)
15320 || !use_approx_division_p)
15321 return false;
15323 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15324 return false;
15326 rtx pg = NULL_RTX;
15327 if (aarch64_sve_mode_p (mode))
15328 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15330 /* Estimate the approximate reciprocal. */
15331 rtx xrcp = gen_reg_rtx (mode);
15332 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15334 /* Iterate over the series twice for SF and thrice for DF. */
15335 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15337 /* Optionally iterate over the series less for faster performance,
15338 while sacrificing the accuracy. The default is 2 for DF and 1 for SF. */
15339 if (flag_mlow_precision_div)
15340 iterations = (GET_MODE_INNER (mode) == DFmode
15341 ? aarch64_double_recp_precision
15342 : aarch64_float_recp_precision);
15344 /* Iterate over the series to calculate the approximate reciprocal. */
15345 rtx xtmp = gen_reg_rtx (mode);
15346 while (iterations--)
15348 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15350 if (iterations > 0)
15351 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15354 if (num != CONST1_RTX (mode))
15356 /* As the approximate reciprocal of DEN is already calculated, only
15357 calculate the approximate division when NUM is not 1.0. */
15358 rtx xnum = force_reg (mode, num);
15359 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15362 /* Finalize the approximation. */
15363 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15364 return true;
15367 /* Return the number of instructions that can be issued per cycle. */
15368 static int
15369 aarch64_sched_issue_rate (void)
15371 return aarch64_tune_params.issue_rate;
15374 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15375 static int
15376 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15378 if (DEBUG_INSN_P (insn))
15379 return more;
15381 rtx_code code = GET_CODE (PATTERN (insn));
15382 if (code == USE || code == CLOBBER)
15383 return more;
15385 if (get_attr_type (insn) == TYPE_NO_INSN)
15386 return more;
15388 return more - 1;
15391 static int
15392 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15394 int issue_rate = aarch64_sched_issue_rate ();
15396 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15400 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15401 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15402 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15404 static int
15405 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15406 int ready_index)
15408 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15412 /* Vectorizer cost model target hooks. */
15414 /* Information about how the CPU would issue the scalar, Advanced SIMD
15415 or SVE version of a vector loop, using the scheme defined by the
15416 aarch64_base_vec_issue_info hierarchy of structures. */
15417 class aarch64_vec_op_count
15419 public:
15420 aarch64_vec_op_count () = default;
15421 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15422 unsigned int = 1);
15424 unsigned int vec_flags () const { return m_vec_flags; }
15425 unsigned int vf_factor () const { return m_vf_factor; }
15427 const aarch64_base_vec_issue_info *base_issue_info () const;
15428 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15429 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15431 fractional_cost rename_cycles_per_iter () const;
15432 fractional_cost min_nonpred_cycles_per_iter () const;
15433 fractional_cost min_pred_cycles_per_iter () const;
15434 fractional_cost min_cycles_per_iter () const;
15436 void dump () const;
15438 /* The number of individual "general" operations. See the comments
15439 in aarch64_base_vec_issue_info for details. */
15440 unsigned int general_ops = 0;
15442 /* The number of load and store operations, under the same scheme
15443 as above. */
15444 unsigned int loads = 0;
15445 unsigned int stores = 0;
15447 /* The minimum number of cycles needed to execute all loop-carried
15448 operations, which in the vector code become associated with
15449 reductions. */
15450 unsigned int reduction_latency = 0;
15452 /* The number of individual predicate operations. See the comments
15453 in aarch64_sve_vec_issue_info for details. */
15454 unsigned int pred_ops = 0;
15456 private:
15457 /* The issue information for the core. */
15458 const aarch64_vec_issue_info *m_issue_info = nullptr;
15460 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15461 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15462 Advanced SIMD code.
15463 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15464 SVE code. */
15465 unsigned int m_vec_flags = 0;
15467 /* Assume that, when the code is executing on the core described
15468 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15469 times more data than the vectorizer anticipates.
15471 This is only ever different from 1 for SVE. It allows us to consider
15472 what would happen on a 256-bit SVE target even when the -mtune
15473 parameters say that the “likely” SVE length is 128 bits. */
15474 unsigned int m_vf_factor = 1;
15477 aarch64_vec_op_count::
15478 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15479 unsigned int vec_flags, unsigned int vf_factor)
15480 : m_issue_info (issue_info),
15481 m_vec_flags (vec_flags),
15482 m_vf_factor (vf_factor)
15486 /* Return the base issue information (i.e. the parts that make sense
15487 for both scalar and vector code). Return null if we have no issue
15488 information. */
15489 const aarch64_base_vec_issue_info *
15490 aarch64_vec_op_count::base_issue_info () const
15492 if (auto *ret = simd_issue_info ())
15493 return ret;
15494 return m_issue_info->scalar;
15497 /* If the structure describes vector code and we have associated issue
15498 information, return that issue information, otherwise return null. */
15499 const aarch64_simd_vec_issue_info *
15500 aarch64_vec_op_count::simd_issue_info () const
15502 if (auto *ret = sve_issue_info ())
15503 return ret;
15504 if (m_vec_flags)
15505 return m_issue_info->advsimd;
15506 return nullptr;
15509 /* If the structure describes SVE code and we have associated issue
15510 information, return that issue information, otherwise return null. */
15511 const aarch64_sve_vec_issue_info *
15512 aarch64_vec_op_count::sve_issue_info () const
15514 if (m_vec_flags & VEC_ANY_SVE)
15515 return m_issue_info->sve;
15516 return nullptr;
15519 /* Estimate the minimum number of cycles per iteration needed to rename
15520 the instructions.
15522 ??? For now this is done inline rather than via cost tables, since it
15523 isn't clear how it should be parameterized for the general case. */
15524 fractional_cost
15525 aarch64_vec_op_count::rename_cycles_per_iter () const
15527 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15528 || sve_issue_info () == &neoversen2_sve_issue_info
15529 || sve_issue_info () == &demeter_sve_issue_info)
15530 /* + 1 for an addition. We've already counted a general op for each
15531 store, so we don't need to account for stores separately. The branch
15532 reads no registers and so does not need to be counted either.
15534 ??? This value is very much on the pessimistic side, but seems to work
15535 pretty well in practice. */
15536 return { general_ops + loads + pred_ops + 1, 5 };
15538 return 0;
15541 /* Like min_cycles_per_iter, but excluding predicate operations. */
15542 fractional_cost
15543 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15545 auto *issue_info = base_issue_info ();
15547 fractional_cost cycles = MAX (reduction_latency, 1);
15548 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15549 cycles = std::max (cycles, { loads + stores,
15550 issue_info->loads_stores_per_cycle });
15551 cycles = std::max (cycles, { general_ops,
15552 issue_info->general_ops_per_cycle });
15553 cycles = std::max (cycles, rename_cycles_per_iter ());
15554 return cycles;
15557 /* Like min_cycles_per_iter, but including only the predicate operations. */
15558 fractional_cost
15559 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15561 if (auto *issue_info = sve_issue_info ())
15562 return { pred_ops, issue_info->pred_ops_per_cycle };
15563 return 0;
15566 /* Estimate the minimum number of cycles needed to issue the operations.
15567 This is a very simplistic model! */
15568 fractional_cost
15569 aarch64_vec_op_count::min_cycles_per_iter () const
15571 return std::max (min_nonpred_cycles_per_iter (),
15572 min_pred_cycles_per_iter ());
15575 /* Dump information about the structure. */
15576 void
15577 aarch64_vec_op_count::dump () const
15579 dump_printf_loc (MSG_NOTE, vect_location,
15580 " load operations = %d\n", loads);
15581 dump_printf_loc (MSG_NOTE, vect_location,
15582 " store operations = %d\n", stores);
15583 dump_printf_loc (MSG_NOTE, vect_location,
15584 " general operations = %d\n", general_ops);
15585 if (sve_issue_info ())
15586 dump_printf_loc (MSG_NOTE, vect_location,
15587 " predicate operations = %d\n", pred_ops);
15588 dump_printf_loc (MSG_NOTE, vect_location,
15589 " reduction latency = %d\n", reduction_latency);
15590 if (auto rcpi = rename_cycles_per_iter ())
15591 dump_printf_loc (MSG_NOTE, vect_location,
15592 " estimated cycles per iteration to rename = %f\n",
15593 rcpi.as_double ());
15594 if (auto pred_cpi = min_pred_cycles_per_iter ())
15596 dump_printf_loc (MSG_NOTE, vect_location,
15597 " estimated min cycles per iteration"
15598 " without predication = %f\n",
15599 min_nonpred_cycles_per_iter ().as_double ());
15600 dump_printf_loc (MSG_NOTE, vect_location,
15601 " estimated min cycles per iteration"
15602 " for predication = %f\n", pred_cpi.as_double ());
15604 if (auto cpi = min_cycles_per_iter ())
15605 dump_printf_loc (MSG_NOTE, vect_location,
15606 " estimated min cycles per iteration = %f\n",
15607 cpi.as_double ());
15610 /* Information about vector code that we're in the process of costing. */
15611 class aarch64_vector_costs : public vector_costs
15613 public:
15614 aarch64_vector_costs (vec_info *, bool);
15616 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15617 stmt_vec_info stmt_info, slp_tree, tree vectype,
15618 int misalign,
15619 vect_cost_model_location where) override;
15620 void finish_cost (const vector_costs *) override;
15621 bool better_main_loop_than_p (const vector_costs *other) const override;
15623 private:
15624 void record_potential_advsimd_unrolling (loop_vec_info);
15625 void analyze_loop_vinfo (loop_vec_info);
15626 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15627 aarch64_vec_op_count *);
15628 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15629 fractional_cost, unsigned int,
15630 unsigned int *, bool *);
15631 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15632 unsigned int);
15633 bool prefer_unrolled_loop () const;
15634 unsigned int determine_suggested_unroll_factor (loop_vec_info);
15636 /* True if we have performed one-time initialization based on the
15637 vec_info. */
15638 bool m_analyzed_vinfo = false;
15640 /* This loop uses an average operation that is not supported by SVE, but is
15641 supported by Advanced SIMD and SVE2. */
15642 bool m_has_avg = false;
15644 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15645 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15646 SIMD code.
15647 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15648 unsigned int m_vec_flags = 0;
15650 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15651 This means that code such as:
15653 a[0] = x;
15654 a[1] = x;
15656 will be costed as two scalar instructions and two vector instructions
15657 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15658 wins if the costs are equal, because of the fact that the vector costs
15659 include constant initializations whereas the scalar costs don't.
15660 We would therefore tend to vectorize the code above, even though
15661 the scalar version can use a single STP.
15663 We should eventually fix this and model LDP and STP in the main costs;
15664 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15665 Until then, we look specifically for code that does nothing more than
15666 STP-like operations. We cost them on that basis in addition to the
15667 normal latency-based costs.
15669 If the scalar or vector code could be a sequence of STPs +
15670 initialization, this variable counts the cost of the sequence,
15671 with 2 units per instruction. The variable is ~0U for other
15672 kinds of code. */
15673 unsigned int m_stp_sequence_cost = 0;
15675 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15676 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15677 situations, we try to predict whether an Advanced SIMD implementation
15678 of the loop could be completely unrolled and become straight-line code.
15679 If so, it is generally better to use the Advanced SIMD version rather
15680 than length-agnostic SVE, since the SVE loop would execute an unknown
15681 number of times and so could not be completely unrolled in the same way.
15683 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15684 number of Advanced SIMD loop iterations that would be unrolled and
15685 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15686 in the unrolled loop. Both values are zero if we're not applying
15687 the heuristic. */
15688 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15689 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15691 /* If we're vectorizing a loop that executes a constant number of times,
15692 this variable gives the number of times that the vector loop would
15693 iterate, otherwise it is zero. */
15694 uint64_t m_num_vector_iterations = 0;
15696 /* Used only when vectorizing loops. Estimates the number and kind of
15697 operations that would be needed by one iteration of the scalar
15698 or vector loop. There is one entry for each tuning option of
15699 interest. */
15700 auto_vec<aarch64_vec_op_count, 2> m_ops;
15703 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15704 bool costing_for_scalar)
15705 : vector_costs (vinfo, costing_for_scalar),
15706 m_vec_flags (costing_for_scalar ? 0
15707 : aarch64_classify_vector_mode (vinfo->vector_mode))
15709 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15711 m_ops.quick_push ({ issue_info, m_vec_flags });
15712 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15714 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15715 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15716 vf_factor });
15721 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15722 vector_costs *
15723 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15725 return new aarch64_vector_costs (vinfo, costing_for_scalar);
15728 /* Return true if the current CPU should use the new costs defined
15729 in GCC 11. This should be removed for GCC 12 and above, with the
15730 costs applying to all CPUs instead. */
15731 static bool
15732 aarch64_use_new_vector_costs_p ()
15734 return (aarch64_tune_params.extra_tuning_flags
15735 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
15738 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
15739 static const simd_vec_cost *
15740 aarch64_simd_vec_costs (tree vectype)
15742 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15743 if (vectype != NULL
15744 && aarch64_sve_mode_p (TYPE_MODE (vectype))
15745 && costs->sve != NULL)
15746 return costs->sve;
15747 return costs->advsimd;
15750 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
15751 static const simd_vec_cost *
15752 aarch64_simd_vec_costs_for_flags (unsigned int flags)
15754 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15755 if ((flags & VEC_ANY_SVE) && costs->sve)
15756 return costs->sve;
15757 return costs->advsimd;
15760 /* If STMT_INFO is a memory reference, return the scalar memory type,
15761 otherwise return null. */
15762 static tree
15763 aarch64_dr_type (stmt_vec_info stmt_info)
15765 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
15766 return TREE_TYPE (DR_REF (dr));
15767 return NULL_TREE;
15770 /* Decide whether to use the unrolling heuristic described above
15771 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
15772 describes the loop that we're vectorizing. */
15773 void
15774 aarch64_vector_costs::
15775 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
15777 /* The heuristic only makes sense on targets that have the same
15778 vector throughput for SVE and Advanced SIMD. */
15779 if (!(aarch64_tune_params.extra_tuning_flags
15780 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
15781 return;
15783 /* We only want to apply the heuristic if LOOP_VINFO is being
15784 vectorized for SVE. */
15785 if (!(m_vec_flags & VEC_ANY_SVE))
15786 return;
15788 /* Check whether it is possible in principle to use Advanced SIMD
15789 instead. */
15790 if (aarch64_autovec_preference == 2)
15791 return;
15793 /* We don't want to apply the heuristic to outer loops, since it's
15794 harder to track two levels of unrolling. */
15795 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
15796 return;
15798 /* Only handle cases in which the number of Advanced SIMD iterations
15799 would be known at compile time but the number of SVE iterations
15800 would not. */
15801 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
15802 || aarch64_sve_vg.is_constant ())
15803 return;
15805 /* Guess how many times the Advanced SIMD loop would iterate and make
15806 sure that it is within the complete unrolling limit. Even if the
15807 number of iterations is small enough, the number of statements might
15808 not be, which is why we need to estimate the number of statements too. */
15809 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
15810 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
15811 unsigned HOST_WIDE_INT unrolled_advsimd_niters
15812 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
15813 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
15814 return;
15816 /* Record that we're applying the heuristic and should try to estimate
15817 the number of statements in the Advanced SIMD loop. */
15818 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
15821 /* Do one-time initialization of the aarch64_vector_costs given that we're
15822 costing the loop vectorization described by LOOP_VINFO. */
15823 void
15824 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
15826 /* Record the number of times that the vector loop would execute,
15827 if known. */
15828 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
15829 auto scalar_niters = max_stmt_executions_int (loop);
15830 if (scalar_niters >= 0)
15832 unsigned int vf = vect_vf_for_cost (loop_vinfo);
15833 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
15834 m_num_vector_iterations = scalar_niters / vf;
15835 else
15836 m_num_vector_iterations = CEIL (scalar_niters, vf);
15839 /* Detect whether we're vectorizing for SVE and should apply the unrolling
15840 heuristic described above m_unrolled_advsimd_niters. */
15841 record_potential_advsimd_unrolling (loop_vinfo);
15843 /* Record the issue information for any SVE WHILE instructions that the
15844 loop needs. */
15845 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
15847 unsigned int num_masks = 0;
15848 rgroup_controls *rgm;
15849 unsigned int num_vectors_m1;
15850 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
15851 if (rgm->type)
15852 num_masks += num_vectors_m1 + 1;
15853 for (auto &ops : m_ops)
15854 if (auto *issue = ops.sve_issue_info ())
15855 ops.pred_ops += num_masks * issue->while_pred_ops;
15859 /* Implement targetm.vectorize.builtin_vectorization_cost. */
15860 static int
15861 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
15862 tree vectype,
15863 int misalign ATTRIBUTE_UNUSED)
15865 unsigned elements;
15866 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15867 bool fp = false;
15869 if (vectype != NULL)
15870 fp = FLOAT_TYPE_P (vectype);
15872 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
15874 switch (type_of_cost)
15876 case scalar_stmt:
15877 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
15879 case scalar_load:
15880 return costs->scalar_load_cost;
15882 case scalar_store:
15883 return costs->scalar_store_cost;
15885 case vector_stmt:
15886 return fp ? simd_costs->fp_stmt_cost
15887 : simd_costs->int_stmt_cost;
15889 case vector_load:
15890 return simd_costs->align_load_cost;
15892 case vector_store:
15893 return simd_costs->store_cost;
15895 case vec_to_scalar:
15896 return simd_costs->vec_to_scalar_cost;
15898 case scalar_to_vec:
15899 return simd_costs->scalar_to_vec_cost;
15901 case unaligned_load:
15902 case vector_gather_load:
15903 return simd_costs->unalign_load_cost;
15905 case unaligned_store:
15906 case vector_scatter_store:
15907 return simd_costs->unalign_store_cost;
15909 case cond_branch_taken:
15910 return costs->cond_taken_branch_cost;
15912 case cond_branch_not_taken:
15913 return costs->cond_not_taken_branch_cost;
15915 case vec_perm:
15916 return simd_costs->permute_cost;
15918 case vec_promote_demote:
15919 return fp ? simd_costs->fp_stmt_cost
15920 : simd_costs->int_stmt_cost;
15922 case vec_construct:
15923 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
15924 return elements / 2 + 1;
15926 default:
15927 gcc_unreachable ();
15931 /* Return true if an access of kind KIND for STMT_INFO represents one
15932 vector of an LD[234] or ST[234] operation. Return the total number of
15933 vectors (2, 3 or 4) if so, otherwise return a value outside that range. */
15934 static int
15935 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
15937 if ((kind == vector_load
15938 || kind == unaligned_load
15939 || kind == vector_store
15940 || kind == unaligned_store)
15941 && STMT_VINFO_DATA_REF (stmt_info))
15943 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
15944 if (stmt_info
15945 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
15946 return DR_GROUP_SIZE (stmt_info);
15948 return 0;
15951 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
15952 vectors would produce a series of LDP or STP operations. KIND is the
15953 kind of statement that STMT_INFO represents. */
15954 static bool
15955 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
15956 stmt_vec_info stmt_info)
15958 switch (kind)
15960 case vector_load:
15961 case vector_store:
15962 case unaligned_load:
15963 case unaligned_store:
15964 break;
15966 default:
15967 return false;
15970 if (aarch64_tune_params.extra_tuning_flags
15971 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
15972 return false;
15974 return is_gimple_assign (stmt_info->stmt);
15977 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
15978 or multiply-subtract sequence that might be suitable for fusing into a
15979 single instruction. If VEC_FLAGS is zero, analyze the operation as
15980 a scalar one, otherwise analyze it as an operation on vectors with those
15981 VEC_* flags. */
15982 static bool
15983 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
15984 unsigned int vec_flags)
15986 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
15987 if (!assign)
15988 return false;
15989 tree_code code = gimple_assign_rhs_code (assign);
15990 if (code != PLUS_EXPR && code != MINUS_EXPR)
15991 return false;
15993 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
15994 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
15995 return false;
15997 for (int i = 1; i < 3; ++i)
15999 tree rhs = gimple_op (assign, i);
16000 /* ??? Should we try to check for a single use as well? */
16001 if (TREE_CODE (rhs) != SSA_NAME)
16002 continue;
16004 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16005 if (!def_stmt_info
16006 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16007 continue;
16008 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16009 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16010 continue;
16012 if (vec_flags & VEC_ADVSIMD)
16014 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16015 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16016 only supports MLA forms, so will require a move if the result
16017 cannot be tied to the accumulator. The most important case in
16018 which this is true is when the accumulator input is invariant. */
16019 rhs = gimple_op (assign, 3 - i);
16020 if (TREE_CODE (rhs) != SSA_NAME)
16021 return false;
16022 def_stmt_info = vinfo->lookup_def (rhs);
16023 if (!def_stmt_info
16024 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16025 return false;
16028 return true;
16030 return false;
16033 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16034 in-loop reduction that SVE supports directly, return its latency in cycles,
16035 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16036 instructions. */
16037 static unsigned int
16038 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16039 stmt_vec_info stmt_info,
16040 const sve_vec_cost *sve_costs)
16042 switch (vect_reduc_type (vinfo, stmt_info))
16044 case EXTRACT_LAST_REDUCTION:
16045 return sve_costs->clast_cost;
16047 case FOLD_LEFT_REDUCTION:
16048 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16050 case E_HFmode:
16051 case E_BFmode:
16052 return sve_costs->fadda_f16_cost;
16054 case E_SFmode:
16055 return sve_costs->fadda_f32_cost;
16057 case E_DFmode:
16058 return sve_costs->fadda_f64_cost;
16060 default:
16061 break;
16063 break;
16066 return 0;
16069 /* STMT_INFO describes a loop-carried operation in the original scalar code
16070 that we are considering implementing as a reduction. Return one of the
16071 following values, depending on VEC_FLAGS:
16073 - If VEC_FLAGS is zero, return the loop carry latency of the original
16074 scalar operation.
16076 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16077 Advanced SIMD implementation.
16079 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16080 SVE implementation. */
16081 static unsigned int
16082 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16083 unsigned int vec_flags)
16085 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16086 const sve_vec_cost *sve_costs = nullptr;
16087 if (vec_flags & VEC_ANY_SVE)
16088 sve_costs = aarch64_tune_params.vec_costs->sve;
16090 /* If the caller is asking for the SVE latency, check for forms of reduction
16091 that only SVE can handle directly. */
16092 if (sve_costs)
16094 unsigned int latency
16095 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16096 if (latency)
16097 return latency;
16100 /* Handle scalar costs. */
16101 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16102 if (vec_flags == 0)
16104 if (is_float)
16105 return vec_costs->scalar_fp_stmt_cost;
16106 return vec_costs->scalar_int_stmt_cost;
16109 /* Otherwise, the loop body just contains normal integer or FP operations,
16110 with a vector reduction outside the loop. */
16111 const simd_vec_cost *simd_costs
16112 = aarch64_simd_vec_costs_for_flags (vec_flags);
16113 if (is_float)
16114 return simd_costs->fp_stmt_cost;
16115 return simd_costs->int_stmt_cost;
16118 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16119 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16120 try to subdivide the target-independent categorization provided by KIND
16121 to get a more accurate cost. */
16122 static fractional_cost
16123 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16124 stmt_vec_info stmt_info,
16125 fractional_cost stmt_cost)
16127 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16128 the extension with the load. */
16129 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16130 return 0;
16132 return stmt_cost;
16135 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16136 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16137 when vectorized would operate on vector type VECTYPE. Try to subdivide
16138 the target-independent categorization provided by KIND to get a more
16139 accurate cost. WHERE specifies where the cost associated with KIND
16140 occurs. */
16141 static fractional_cost
16142 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16143 stmt_vec_info stmt_info, tree vectype,
16144 enum vect_cost_model_location where,
16145 fractional_cost stmt_cost)
16147 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16148 const sve_vec_cost *sve_costs = nullptr;
16149 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16150 sve_costs = aarch64_tune_params.vec_costs->sve;
16152 /* It's generally better to avoid costing inductions, since the induction
16153 will usually be hidden by other operations. This is particularly true
16154 for things like COND_REDUCTIONS. */
16155 if (is_a<gphi *> (stmt_info->stmt))
16156 return 0;
16158 /* Detect cases in which vec_to_scalar is describing the extraction of a
16159 vector element in preparation for a scalar store. The store itself is
16160 costed separately. */
16161 if (vect_is_store_elt_extraction (kind, stmt_info))
16162 return simd_costs->store_elt_extra_cost;
16164 /* Detect SVE gather loads, which are costed as a single scalar_load
16165 for each element. We therefore need to divide the full-instruction
16166 cost by the number of elements in the vector. */
16167 if (kind == scalar_load
16168 && sve_costs
16169 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16171 unsigned int nunits = vect_nunits_for_cost (vectype);
16172 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16173 return { sve_costs->gather_load_x64_cost, nunits };
16174 return { sve_costs->gather_load_x32_cost, nunits };
16177 /* Detect cases in which a scalar_store is really storing one element
16178 in a scatter operation. */
16179 if (kind == scalar_store
16180 && sve_costs
16181 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16182 return sve_costs->scatter_store_elt_cost;
16184 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16185 if (kind == vec_to_scalar
16186 && where == vect_body
16187 && sve_costs)
16189 unsigned int latency
16190 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16191 if (latency)
16192 return latency;
16195 /* Detect cases in which vec_to_scalar represents a single reduction
16196 instruction like FADDP or MAXV. */
16197 if (kind == vec_to_scalar
16198 && where == vect_epilogue
16199 && vect_is_reduction (stmt_info))
16200 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16202 case E_QImode:
16203 return simd_costs->reduc_i8_cost;
16205 case E_HImode:
16206 return simd_costs->reduc_i16_cost;
16208 case E_SImode:
16209 return simd_costs->reduc_i32_cost;
16211 case E_DImode:
16212 return simd_costs->reduc_i64_cost;
16214 case E_HFmode:
16215 case E_BFmode:
16216 return simd_costs->reduc_f16_cost;
16218 case E_SFmode:
16219 return simd_costs->reduc_f32_cost;
16221 case E_DFmode:
16222 return simd_costs->reduc_f64_cost;
16224 default:
16225 break;
16228 /* Otherwise stick with the original categorization. */
16229 return stmt_cost;
16232 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16233 for STMT_INFO, which has cost kind KIND and which when vectorized would
16234 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16235 targets. */
16236 static fractional_cost
16237 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16238 stmt_vec_info stmt_info, tree vectype,
16239 fractional_cost stmt_cost)
16241 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16242 vector register size or number of units. Integer promotions of this
16243 type therefore map to SXT[BHW] or UXT[BHW].
16245 Most loads have extending forms that can do the sign or zero extension
16246 on the fly. Optimistically assume that a load followed by an extension
16247 will fold to this form during combine, and that the extension therefore
16248 comes for free. */
16249 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16250 stmt_cost = 0;
16252 /* For similar reasons, vector_stmt integer truncations are a no-op,
16253 because we can just ignore the unused upper bits of the source. */
16254 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16255 stmt_cost = 0;
16257 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16258 but there are no equivalent instructions for SVE. This means that
16259 (all other things being equal) 128-bit SVE needs twice as many load
16260 and store instructions as Advanced SIMD in order to process vector pairs.
16262 Also, scalar code can often use LDP and STP to access pairs of values,
16263 so it is too simplistic to say that one SVE load or store replaces
16264 VF scalar loads and stores.
16266 Ideally we would account for this in the scalar and Advanced SIMD
16267 costs by making suitable load/store pairs as cheap as a single
16268 load/store. However, that would be a very invasive change and in
16269 practice it tends to stress other parts of the cost model too much.
16270 E.g. stores of scalar constants currently count just a store,
16271 whereas stores of vector constants count a store and a vec_init.
16272 This is an artificial distinction for AArch64, where stores of
16273 nonzero scalar constants need the same kind of register invariant
16274 as vector stores.
16276 An alternative would be to double the cost of any SVE loads and stores
16277 that could be paired in Advanced SIMD (and possibly also paired in
16278 scalar code). But this tends to stress other parts of the cost model
16279 in the same way. It also means that we can fall back to Advanced SIMD
16280 even if full-loop predication would have been useful.
16282 Here we go for a more conservative version: double the costs of SVE
16283 loads and stores if one iteration of the scalar loop processes enough
16284 elements for it to use a whole number of Advanced SIMD LDP or STP
16285 instructions. This makes it very likely that the VF would be 1 for
16286 Advanced SIMD, and so no epilogue should be needed. */
16287 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16289 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16290 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16291 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16292 if (multiple_p (count * elt_bits, 256)
16293 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16294 stmt_cost *= 2;
16297 return stmt_cost;
16300 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16301 and which when vectorized would operate on vector type VECTYPE. Add the
16302 cost of any embedded operations. */
16303 static fractional_cost
16304 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16305 tree vectype, fractional_cost stmt_cost)
16307 if (vectype)
16309 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16311 /* Detect cases in which a vector load or store represents an
16312 LD[234] or ST[234] instruction. */
16313 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16315 case 2:
16316 stmt_cost += simd_costs->ld2_st2_permute_cost;
16317 break;
16319 case 3:
16320 stmt_cost += simd_costs->ld3_st3_permute_cost;
16321 break;
16323 case 4:
16324 stmt_cost += simd_costs->ld4_st4_permute_cost;
16325 break;
16328 if (kind == vector_stmt || kind == vec_to_scalar)
16329 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16331 if (FLOAT_TYPE_P (cmp_type))
16332 stmt_cost += simd_costs->fp_stmt_cost;
16333 else
16334 stmt_cost += simd_costs->int_stmt_cost;
16338 if (kind == scalar_stmt)
16339 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16341 if (FLOAT_TYPE_P (cmp_type))
16342 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16343 else
16344 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16347 return stmt_cost;
16350 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16351 and they describe an operation in the body of a vector loop. Record issue
16352 information relating to the vector operation in OPS. */
16353 void
16354 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16355 stmt_vec_info stmt_info,
16356 aarch64_vec_op_count *ops)
16358 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16359 if (!base_issue)
16360 return;
16361 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16362 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16364 /* Calculate the minimum cycles per iteration imposed by a reduction
16365 operation. */
16366 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16367 && vect_is_reduction (stmt_info))
16369 unsigned int base
16370 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16372 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16373 that's not yet the case. */
16374 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16377 /* Assume that multiply-adds will become a single operation. */
16378 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16379 return;
16381 /* Count the basic operation cost associated with KIND. */
16382 switch (kind)
16384 case cond_branch_taken:
16385 case cond_branch_not_taken:
16386 case vector_gather_load:
16387 case vector_scatter_store:
16388 /* We currently don't expect these to be used in a loop body. */
16389 break;
16391 case vec_perm:
16392 case vec_promote_demote:
16393 case vec_construct:
16394 case vec_to_scalar:
16395 case scalar_to_vec:
16396 case vector_stmt:
16397 case scalar_stmt:
16398 ops->general_ops += count;
16399 break;
16401 case scalar_load:
16402 case vector_load:
16403 case unaligned_load:
16404 ops->loads += count;
16405 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16406 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16407 break;
16409 case vector_store:
16410 case unaligned_store:
16411 case scalar_store:
16412 ops->stores += count;
16413 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16414 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16415 break;
16418 /* Add any embedded comparison operations. */
16419 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16420 && vect_embedded_comparison_type (stmt_info))
16421 ops->general_ops += count;
16423 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16424 have only accounted for one. */
16425 if ((kind == vector_stmt || kind == vec_to_scalar)
16426 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16427 ops->general_ops += count;
16429 /* Count the predicate operations needed by an SVE comparison. */
16430 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16431 if (tree type = vect_comparison_type (stmt_info))
16433 unsigned int base = (FLOAT_TYPE_P (type)
16434 ? sve_issue->fp_cmp_pred_ops
16435 : sve_issue->int_cmp_pred_ops);
16436 ops->pred_ops += base * count;
16439 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16440 if (simd_issue)
16441 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16443 case 2:
16444 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16445 break;
16447 case 3:
16448 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16449 break;
16451 case 4:
16452 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16453 break;
16456 /* Add any overhead associated with gather loads and scatter stores. */
16457 if (sve_issue
16458 && (kind == scalar_load || kind == scalar_store)
16459 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16461 unsigned int pairs = CEIL (count, 2);
16462 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16463 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
16467 /* Return true if STMT_INFO contains a memory access and if the constant
16468 component of the memory address is aligned to SIZE bytes. */
16469 static bool
16470 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16471 poly_uint64 size)
16473 if (!STMT_VINFO_DATA_REF (stmt_info))
16474 return false;
16476 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16477 stmt_info = first_stmt;
16478 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16479 /* Needed for gathers & scatters, for example. */
16480 if (!constant_offset)
16481 return false;
16483 return multiple_p (wi::to_poly_offset (constant_offset), size);
16486 /* Check if a scalar or vector stmt could be part of a region of code
16487 that does nothing more than store values to memory, in the scalar
16488 case using STP. Return the cost of the stmt if so, counting 2 for
16489 one instruction. Return ~0U otherwise.
16491 The arguments are a subset of those passed to add_stmt_cost. */
16492 unsigned int
16493 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16494 stmt_vec_info stmt_info, tree vectype)
16496 /* Code that stores vector constants uses a vector_load to create
16497 the constant. We don't apply the heuristic to that case for two
16498 main reasons:
16500 - At the moment, STPs are only formed via peephole2, and the
16501 constant scalar moves would often come between STRs and so
16502 prevent STP formation.
16504 - The scalar code also has to load the constant somehow, and that
16505 isn't costed. */
16506 switch (kind)
16508 case scalar_to_vec:
16509 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16510 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16512 case vec_construct:
16513 if (FLOAT_TYPE_P (vectype))
16514 /* Count 1 insn for the maximum number of FP->SIMD INS
16515 instructions. */
16516 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16518 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16519 maximum number of GPR->SIMD INS instructions. */
16520 return vect_nunits_for_cost (vectype) * 4 * count;
16522 case vector_store:
16523 case unaligned_store:
16524 /* Count 1 insn per vector if we can't form STP Q pairs. */
16525 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16526 return count * 2;
16527 if (aarch64_tune_params.extra_tuning_flags
16528 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16529 return count * 2;
16531 if (stmt_info)
16533 /* Assume we won't be able to use STP if the constant offset
16534 component of the address is misaligned. ??? This could be
16535 removed if we formed STP pairs earlier, rather than relying
16536 on peephole2. */
16537 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16538 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16539 return count * 2;
16541 return CEIL (count, 2) * 2;
16543 case scalar_store:
16544 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16546 /* Check for a mode in which STP pairs can be formed. */
16547 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16548 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16549 return ~0U;
16551 /* Assume we won't be able to use STP if the constant offset
16552 component of the address is misaligned. ??? This could be
16553 removed if we formed STP pairs earlier, rather than relying
16554 on peephole2. */
16555 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16556 return ~0U;
16558 return count;
16560 default:
16561 return ~0U;
16565 unsigned
16566 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16567 stmt_vec_info stmt_info, slp_tree,
16568 tree vectype, int misalign,
16569 vect_cost_model_location where)
16571 fractional_cost stmt_cost
16572 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16574 bool in_inner_loop_p = (where == vect_body
16575 && stmt_info
16576 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16578 /* Do one-time initialization based on the vinfo. */
16579 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16580 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16582 if (loop_vinfo)
16583 analyze_loop_vinfo (loop_vinfo);
16585 m_analyzed_vinfo = true;
16588 /* Apply the heuristic described above m_stp_sequence_cost. */
16589 if (m_stp_sequence_cost != ~0U)
16591 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16592 stmt_info, vectype);
16593 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16596 /* Try to get a more accurate cost by looking at STMT_INFO instead
16597 of just looking at KIND. */
16598 if (stmt_info && aarch64_use_new_vector_costs_p ())
16600 /* If we scalarize a strided store, the vectorizer costs one
16601 vec_to_scalar for each element. However, we can store the first
16602 element using an FP store without a separate extract step. */
16603 if (vect_is_store_elt_extraction (kind, stmt_info))
16604 count -= 1;
16606 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16607 stmt_info, stmt_cost);
16609 if (vectype && m_vec_flags)
16610 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16611 stmt_info, vectype,
16612 where, stmt_cost);
16615 /* Do any SVE-specific adjustments to the cost. */
16616 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16617 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16618 vectype, stmt_cost);
16620 if (stmt_info && aarch64_use_new_vector_costs_p ())
16622 /* Account for any extra "embedded" costs that apply additively
16623 to the base cost calculated above. */
16624 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16625 stmt_cost);
16627 /* If we're recording a nonzero vector loop body cost for the
16628 innermost loop, also estimate the operations that would need
16629 to be issued by all relevant implementations of the loop. */
16630 if (loop_vinfo
16631 && (m_costing_for_scalar || where == vect_body)
16632 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16633 && stmt_cost != 0)
16634 for (auto &ops : m_ops)
16635 count_ops (count, kind, stmt_info, &ops);
16637 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16638 estimate the number of statements in the unrolled Advanced SIMD
16639 loop. For simplicitly, we assume that one iteration of the
16640 Advanced SIMD loop would need the same number of statements
16641 as one iteration of the SVE loop. */
16642 if (where == vect_body && m_unrolled_advsimd_niters)
16643 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16645 /* Detect the use of an averaging operation. */
16646 gimple *stmt = stmt_info->stmt;
16647 if (is_gimple_call (stmt)
16648 && gimple_call_internal_p (stmt))
16650 switch (gimple_call_internal_fn (stmt))
16652 case IFN_AVG_FLOOR:
16653 case IFN_AVG_CEIL:
16654 m_has_avg = true;
16655 default:
16656 break;
16660 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16663 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16664 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16665 says that we should prefer the Advanced SIMD loop. */
16666 bool
16667 aarch64_vector_costs::prefer_unrolled_loop () const
16669 if (!m_unrolled_advsimd_stmts)
16670 return false;
16672 if (dump_enabled_p ())
16673 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16674 " unrolled Advanced SIMD loop = %d\n",
16675 m_unrolled_advsimd_stmts);
16677 /* The balance here is tricky. On the one hand, we can't be sure whether
16678 the code is vectorizable with Advanced SIMD or not. However, even if
16679 it isn't vectorizable with Advanced SIMD, there's a possibility that
16680 the scalar code could also be unrolled. Some of the code might then
16681 benefit from SLP, or from using LDP and STP. We therefore apply
16682 the heuristic regardless of can_use_advsimd_p. */
16683 return (m_unrolled_advsimd_stmts
16684 && (m_unrolled_advsimd_stmts
16685 <= (unsigned int) param_max_completely_peeled_insns));
16688 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
16689 how fast the SVE code can be issued and compare it to the equivalent value
16690 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
16691 also compare it to the issue rate of Advanced SIMD code
16692 (ADVSIMD_CYCLES_PER_ITER).
16694 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16695 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16696 is true if we think the loop body is too expensive. */
16698 fractional_cost
16699 aarch64_vector_costs::
16700 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16701 fractional_cost scalar_cycles_per_iter,
16702 unsigned int orig_body_cost, unsigned int *body_cost,
16703 bool *should_disparage)
16705 if (dump_enabled_p ())
16706 ops->dump ();
16708 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16709 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16711 /* If the scalar version of the loop could issue at least as
16712 quickly as the predicate parts of the SVE loop, make the SVE loop
16713 prohibitively expensive. In this case vectorization is adding an
16714 overhead that the original scalar code didn't have.
16716 This is mostly intended to detect cases in which WHILELOs dominate
16717 for very tight loops, which is something that normal latency-based
16718 costs would not model. Adding this kind of cliffedge would be
16719 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16720 code in the caller handles that case in a more conservative way. */
16721 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
16722 if (scalar_cycles_per_iter < sve_estimate)
16724 unsigned int min_cost
16725 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
16726 if (*body_cost < min_cost)
16728 if (dump_enabled_p ())
16729 dump_printf_loc (MSG_NOTE, vect_location,
16730 "Increasing body cost to %d because the"
16731 " scalar code could issue within the limit"
16732 " imposed by predicate operations\n",
16733 min_cost);
16734 *body_cost = min_cost;
16735 *should_disparage = true;
16739 return sve_cycles_per_iter;
16742 unsigned int
16743 aarch64_vector_costs::
16744 determine_suggested_unroll_factor (loop_vec_info loop_vinfo)
16746 bool sve = m_vec_flags & VEC_ANY_SVE;
16747 /* If we are trying to unroll an Advanced SIMD main loop that contains
16748 an averaging operation that we do not support with SVE and we might use a
16749 predicated epilogue, we need to be conservative and block unrolling as
16750 this might lead to a less optimal loop for the first and only epilogue
16751 using the original loop's vectorization factor.
16752 TODO: Remove this constraint when we add support for multiple epilogue
16753 vectorization. */
16754 if (!sve && !TARGET_SVE2 && m_has_avg)
16755 return 1;
16757 unsigned int max_unroll_factor = 1;
16758 auto vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
16759 for (auto vec_ops : m_ops)
16761 aarch64_simd_vec_issue_info const *vec_issue
16762 = vec_ops.simd_issue_info ();
16763 if (!vec_issue)
16764 return 1;
16765 /* Limit unroll factor to a value adjustable by the user, the default
16766 value is 4. */
16767 unsigned int unroll_factor = MIN (aarch64_vect_unroll_limit,
16768 (int) known_alignment (vf));
16769 unsigned int factor
16770 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
16771 unsigned int temp;
16773 /* Sanity check, this should never happen. */
16774 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
16775 return 1;
16777 /* Check stores. */
16778 if (vec_ops.stores > 0)
16780 temp = CEIL (factor * vec_issue->stores_per_cycle,
16781 vec_ops.stores);
16782 unroll_factor = MIN (unroll_factor, temp);
16785 /* Check loads + stores. */
16786 if (vec_ops.loads > 0)
16788 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
16789 vec_ops.loads + vec_ops.stores);
16790 unroll_factor = MIN (unroll_factor, temp);
16793 /* Check general ops. */
16794 if (vec_ops.general_ops > 0)
16796 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
16797 vec_ops.general_ops);
16798 unroll_factor = MIN (unroll_factor, temp);
16800 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
16803 /* Make sure unroll factor is power of 2. */
16804 return 1 << ceil_log2 (max_unroll_factor);
16807 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
16808 and return the new cost. */
16809 unsigned int
16810 aarch64_vector_costs::
16811 adjust_body_cost (loop_vec_info loop_vinfo,
16812 const aarch64_vector_costs *scalar_costs,
16813 unsigned int body_cost)
16815 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
16816 return body_cost;
16818 const auto &scalar_ops = scalar_costs->m_ops[0];
16819 const auto &vector_ops = m_ops[0];
16820 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
16821 unsigned int orig_body_cost = body_cost;
16822 bool should_disparage = false;
16824 if (dump_enabled_p ())
16825 dump_printf_loc (MSG_NOTE, vect_location,
16826 "Original vector body cost = %d\n", body_cost);
16828 fractional_cost scalar_cycles_per_iter
16829 = scalar_ops.min_cycles_per_iter () * estimated_vf;
16831 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
16833 if (dump_enabled_p ())
16835 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
16836 dump_printf_loc (MSG_NOTE, vect_location,
16837 "Vector loop iterates at most %wd times\n",
16838 m_num_vector_iterations);
16839 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
16840 scalar_ops.dump ();
16841 dump_printf_loc (MSG_NOTE, vect_location,
16842 " estimated cycles per vector iteration"
16843 " (for VF %d) = %f\n",
16844 estimated_vf, scalar_cycles_per_iter.as_double ());
16847 if (vector_ops.sve_issue_info ())
16849 if (dump_enabled_p ())
16850 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
16851 vector_cycles_per_iter
16852 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
16853 orig_body_cost, &body_cost, &should_disparage);
16855 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16857 /* Also take Neoverse V1 tuning into account, doubling the
16858 scalar and Advanced SIMD estimates to account for the
16859 doubling in SVE vector length. */
16860 if (dump_enabled_p ())
16861 dump_printf_loc (MSG_NOTE, vect_location,
16862 "Neoverse V1 estimate:\n");
16863 auto vf_factor = m_ops[1].vf_factor ();
16864 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
16865 orig_body_cost, &body_cost, &should_disparage);
16868 else
16870 if (dump_enabled_p ())
16872 dump_printf_loc (MSG_NOTE, vect_location,
16873 "Vector issue estimate:\n");
16874 vector_ops.dump ();
16878 /* Decide whether to stick to latency-based costs or whether to try to
16879 take issue rates into account. */
16880 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
16881 if (m_vec_flags & VEC_ANY_SVE)
16882 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
16884 if (m_num_vector_iterations >= 1
16885 && m_num_vector_iterations < threshold)
16887 if (dump_enabled_p ())
16888 dump_printf_loc (MSG_NOTE, vect_location,
16889 "Low iteration count, so using pure latency"
16890 " costs\n");
16892 /* Increase the cost of the vector code if it looks like the scalar code
16893 could issue more quickly. These values are only rough estimates,
16894 so minor differences should only result in minor changes. */
16895 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
16897 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
16898 scalar_cycles_per_iter);
16899 if (dump_enabled_p ())
16900 dump_printf_loc (MSG_NOTE, vect_location,
16901 "Increasing body cost to %d because scalar code"
16902 " would issue more quickly\n", body_cost);
16904 /* In general, it's expected that the proposed vector code would be able
16905 to issue more quickly than the original scalar code. This should
16906 already be reflected to some extent in the latency-based costs.
16908 However, the latency-based costs effectively assume that the scalar
16909 code and the vector code execute serially, which tends to underplay
16910 one important case: if the real (non-serialized) execution time of
16911 a scalar iteration is dominated by loop-carried dependencies,
16912 and if the vector code is able to reduce both the length of
16913 the loop-carried dependencies *and* the number of cycles needed
16914 to issue the code in general, we can be more confident that the
16915 vector code is an improvement, even if adding the other (non-loop-carried)
16916 latencies tends to hide this saving. We therefore reduce the cost of the
16917 vector loop body in proportion to the saving. */
16918 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
16919 && scalar_ops.reduction_latency == scalar_cycles_per_iter
16920 && scalar_cycles_per_iter > vector_cycles_per_iter
16921 && !should_disparage)
16923 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
16924 scalar_cycles_per_iter);
16925 if (dump_enabled_p ())
16926 dump_printf_loc (MSG_NOTE, vect_location,
16927 "Decreasing body cost to %d account for smaller"
16928 " reduction latency\n", body_cost);
16931 return body_cost;
16934 void
16935 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
16937 auto *scalar_costs
16938 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
16939 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16940 if (loop_vinfo
16941 && m_vec_flags
16942 && aarch64_use_new_vector_costs_p ())
16944 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
16945 m_costs[vect_body]);
16946 m_suggested_unroll_factor
16947 = determine_suggested_unroll_factor (loop_vinfo);
16950 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
16951 the scalar code in the event of a tie, since there is more chance
16952 of scalar code being optimized with surrounding operations. */
16953 if (!loop_vinfo
16954 && scalar_costs
16955 && m_stp_sequence_cost != ~0U
16956 && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
16957 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
16959 vector_costs::finish_cost (scalar_costs);
16962 bool
16963 aarch64_vector_costs::
16964 better_main_loop_than_p (const vector_costs *uncast_other) const
16966 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
16968 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
16969 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
16971 if (dump_enabled_p ())
16972 dump_printf_loc (MSG_NOTE, vect_location,
16973 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
16974 GET_MODE_NAME (this_loop_vinfo->vector_mode),
16975 vect_vf_for_cost (this_loop_vinfo),
16976 GET_MODE_NAME (other_loop_vinfo->vector_mode),
16977 vect_vf_for_cost (other_loop_vinfo));
16979 /* Apply the unrolling heuristic described above
16980 m_unrolled_advsimd_niters. */
16981 if (bool (m_unrolled_advsimd_stmts)
16982 != bool (other->m_unrolled_advsimd_stmts))
16984 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
16985 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
16986 if (this_prefer_unrolled != other_prefer_unrolled)
16988 if (dump_enabled_p ())
16989 dump_printf_loc (MSG_NOTE, vect_location,
16990 "Preferring Advanced SIMD loop because"
16991 " it can be unrolled\n");
16992 return other_prefer_unrolled;
16996 for (unsigned int i = 0; i < m_ops.length (); ++i)
16998 if (dump_enabled_p ())
17000 if (i)
17001 dump_printf_loc (MSG_NOTE, vect_location,
17002 "Reconsidering with subtuning %d\n", i);
17003 dump_printf_loc (MSG_NOTE, vect_location,
17004 "Issue info for %s loop:\n",
17005 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17006 this->m_ops[i].dump ();
17007 dump_printf_loc (MSG_NOTE, vect_location,
17008 "Issue info for %s loop:\n",
17009 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17010 other->m_ops[i].dump ();
17013 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17014 * this->m_ops[i].vf_factor ());
17015 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17016 * other->m_ops[i].vf_factor ());
17018 /* If it appears that one loop could process the same amount of data
17019 in fewer cycles, prefer that loop over the other one. */
17020 fractional_cost this_cost
17021 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17022 fractional_cost other_cost
17023 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17024 if (dump_enabled_p ())
17026 dump_printf_loc (MSG_NOTE, vect_location,
17027 "Weighted cycles per iteration of %s loop ~= %f\n",
17028 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17029 this_cost.as_double ());
17030 dump_printf_loc (MSG_NOTE, vect_location,
17031 "Weighted cycles per iteration of %s loop ~= %f\n",
17032 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17033 other_cost.as_double ());
17035 if (this_cost != other_cost)
17037 if (dump_enabled_p ())
17038 dump_printf_loc (MSG_NOTE, vect_location,
17039 "Preferring loop with lower cycles"
17040 " per iteration\n");
17041 return this_cost < other_cost;
17044 /* If the issue rate of SVE code is limited by predicate operations
17045 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17046 and if Advanced SIMD code could issue within the limit imposed
17047 by the predicate operations, the predicate operations are adding an
17048 overhead that the original code didn't have and so we should prefer
17049 the Advanced SIMD version. */
17050 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17051 const aarch64_vec_op_count &b) -> bool
17053 if (a.pred_ops == 0
17054 && (b.min_pred_cycles_per_iter ()
17055 > b.min_nonpred_cycles_per_iter ()))
17057 if (dump_enabled_p ())
17058 dump_printf_loc (MSG_NOTE, vect_location,
17059 "Preferring Advanced SIMD loop since"
17060 " SVE loop is predicate-limited\n");
17061 return true;
17063 return false;
17065 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17066 return true;
17067 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17068 return false;
17071 return vector_costs::better_main_loop_than_p (other);
17074 static void initialize_aarch64_code_model (struct gcc_options *);
17076 /* Parse the TO_PARSE string and put the architecture struct that it
17077 selects into RES and the architectural features into ISA_FLAGS.
17078 Return an aarch64_parse_opt_result describing the parse result.
17079 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17080 When the TO_PARSE string contains an invalid extension,
17081 a copy of the string is created and stored to INVALID_EXTENSION. */
17083 static enum aarch64_parse_opt_result
17084 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17085 uint64_t *isa_flags, std::string *invalid_extension)
17087 const char *ext;
17088 const struct processor *arch;
17089 size_t len;
17091 ext = strchr (to_parse, '+');
17093 if (ext != NULL)
17094 len = ext - to_parse;
17095 else
17096 len = strlen (to_parse);
17098 if (len == 0)
17099 return AARCH64_PARSE_MISSING_ARG;
17102 /* Loop through the list of supported ARCHes to find a match. */
17103 for (arch = all_architectures; arch->name != NULL; arch++)
17105 if (strlen (arch->name) == len
17106 && strncmp (arch->name, to_parse, len) == 0)
17108 uint64_t isa_temp = arch->flags;
17110 if (ext != NULL)
17112 /* TO_PARSE string contains at least one extension. */
17113 enum aarch64_parse_opt_result ext_res
17114 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17116 if (ext_res != AARCH64_PARSE_OK)
17117 return ext_res;
17119 /* Extension parsing was successful. Confirm the result
17120 arch and ISA flags. */
17121 *res = arch;
17122 *isa_flags = isa_temp;
17123 return AARCH64_PARSE_OK;
17127 /* ARCH name not found in list. */
17128 return AARCH64_PARSE_INVALID_ARG;
17131 /* Parse the TO_PARSE string and put the result tuning in RES and the
17132 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
17133 describing the parse result. If there is an error parsing, RES and
17134 ISA_FLAGS are left unchanged.
17135 When the TO_PARSE string contains an invalid extension,
17136 a copy of the string is created and stored to INVALID_EXTENSION. */
17138 static enum aarch64_parse_opt_result
17139 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17140 uint64_t *isa_flags, std::string *invalid_extension)
17142 const char *ext;
17143 const struct processor *cpu;
17144 size_t len;
17146 ext = strchr (to_parse, '+');
17148 if (ext != NULL)
17149 len = ext - to_parse;
17150 else
17151 len = strlen (to_parse);
17153 if (len == 0)
17154 return AARCH64_PARSE_MISSING_ARG;
17157 /* Loop through the list of supported CPUs to find a match. */
17158 for (cpu = all_cores; cpu->name != NULL; cpu++)
17160 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17162 uint64_t isa_temp = cpu->flags;
17165 if (ext != NULL)
17167 /* TO_PARSE string contains at least one extension. */
17168 enum aarch64_parse_opt_result ext_res
17169 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17171 if (ext_res != AARCH64_PARSE_OK)
17172 return ext_res;
17174 /* Extension parsing was successfull. Confirm the result
17175 cpu and ISA flags. */
17176 *res = cpu;
17177 *isa_flags = isa_temp;
17178 return AARCH64_PARSE_OK;
17182 /* CPU name not found in list. */
17183 return AARCH64_PARSE_INVALID_ARG;
17186 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17187 Return an aarch64_parse_opt_result describing the parse result.
17188 If the parsing fails the RES does not change. */
17190 static enum aarch64_parse_opt_result
17191 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17193 const struct processor *cpu;
17195 /* Loop through the list of supported CPUs to find a match. */
17196 for (cpu = all_cores; cpu->name != NULL; cpu++)
17198 if (strcmp (cpu->name, to_parse) == 0)
17200 *res = cpu;
17201 return AARCH64_PARSE_OK;
17205 /* CPU name not found in list. */
17206 return AARCH64_PARSE_INVALID_ARG;
17209 /* Parse TOKEN, which has length LENGTH to see if it is an option
17210 described in FLAG. If it is, return the index bit for that fusion type.
17211 If not, error (printing OPTION_NAME) and return zero. */
17213 static unsigned int
17214 aarch64_parse_one_option_token (const char *token,
17215 size_t length,
17216 const struct aarch64_flag_desc *flag,
17217 const char *option_name)
17219 for (; flag->name != NULL; flag++)
17221 if (length == strlen (flag->name)
17222 && !strncmp (flag->name, token, length))
17223 return flag->flag;
17226 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17227 return 0;
17230 /* Parse OPTION which is a comma-separated list of flags to enable.
17231 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17232 default state we inherit from the CPU tuning structures. OPTION_NAME
17233 gives the top-level option we are parsing in the -moverride string,
17234 for use in error messages. */
17236 static unsigned int
17237 aarch64_parse_boolean_options (const char *option,
17238 const struct aarch64_flag_desc *flags,
17239 unsigned int initial_state,
17240 const char *option_name)
17242 const char separator = '.';
17243 const char* specs = option;
17244 const char* ntoken = option;
17245 unsigned int found_flags = initial_state;
17247 while ((ntoken = strchr (specs, separator)))
17249 size_t token_length = ntoken - specs;
17250 unsigned token_ops = aarch64_parse_one_option_token (specs,
17251 token_length,
17252 flags,
17253 option_name);
17254 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17255 in the token stream, reset the supported operations. So:
17257 adrp+add.cmp+branch.none.adrp+add
17259 would have the result of turning on only adrp+add fusion. */
17260 if (!token_ops)
17261 found_flags = 0;
17263 found_flags |= token_ops;
17264 specs = ++ntoken;
17267 /* We ended with a comma, print something. */
17268 if (!(*specs))
17270 error ("%qs string ill-formed", option_name);
17271 return 0;
17274 /* We still have one more token to parse. */
17275 size_t token_length = strlen (specs);
17276 unsigned token_ops = aarch64_parse_one_option_token (specs,
17277 token_length,
17278 flags,
17279 option_name);
17280 if (!token_ops)
17281 found_flags = 0;
17283 found_flags |= token_ops;
17284 return found_flags;
17287 /* Support for overriding instruction fusion. */
17289 static void
17290 aarch64_parse_fuse_string (const char *fuse_string,
17291 struct tune_params *tune)
17293 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17294 aarch64_fusible_pairs,
17295 tune->fusible_ops,
17296 "fuse=");
17299 /* Support for overriding other tuning flags. */
17301 static void
17302 aarch64_parse_tune_string (const char *tune_string,
17303 struct tune_params *tune)
17305 tune->extra_tuning_flags
17306 = aarch64_parse_boolean_options (tune_string,
17307 aarch64_tuning_flags,
17308 tune->extra_tuning_flags,
17309 "tune=");
17312 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17313 Accept the valid SVE vector widths allowed by
17314 aarch64_sve_vector_bits_enum and use it to override sve_width
17315 in TUNE. */
17317 static void
17318 aarch64_parse_sve_width_string (const char *tune_string,
17319 struct tune_params *tune)
17321 int width = -1;
17323 int n = sscanf (tune_string, "%d", &width);
17324 if (n == EOF)
17326 error ("invalid format for %<sve_width%>");
17327 return;
17329 switch (width)
17331 case SVE_128:
17332 case SVE_256:
17333 case SVE_512:
17334 case SVE_1024:
17335 case SVE_2048:
17336 break;
17337 default:
17338 error ("invalid %<sve_width%> value: %d", width);
17340 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17343 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
17344 we understand. If it is, extract the option string and handoff to
17345 the appropriate function. */
17347 void
17348 aarch64_parse_one_override_token (const char* token,
17349 size_t length,
17350 struct tune_params *tune)
17352 const struct aarch64_tuning_override_function *fn
17353 = aarch64_tuning_override_functions;
17355 const char *option_part = strchr (token, '=');
17356 if (!option_part)
17358 error ("tuning string missing in option (%s)", token);
17359 return;
17362 /* Get the length of the option name. */
17363 length = option_part - token;
17364 /* Skip the '=' to get to the option string. */
17365 option_part++;
17367 for (; fn->name != NULL; fn++)
17369 if (!strncmp (fn->name, token, length))
17371 fn->parse_override (option_part, tune);
17372 return;
17376 error ("unknown tuning option (%s)",token);
17377 return;
17380 /* A checking mechanism for the implementation of the tls size. */
17382 static void
17383 initialize_aarch64_tls_size (struct gcc_options *opts)
17385 if (aarch64_tls_size == 0)
17386 aarch64_tls_size = 24;
17388 switch (opts->x_aarch64_cmodel_var)
17390 case AARCH64_CMODEL_TINY:
17391 /* Both the default and maximum TLS size allowed under tiny is 1M which
17392 needs two instructions to address, so we clamp the size to 24. */
17393 if (aarch64_tls_size > 24)
17394 aarch64_tls_size = 24;
17395 break;
17396 case AARCH64_CMODEL_SMALL:
17397 /* The maximum TLS size allowed under small is 4G. */
17398 if (aarch64_tls_size > 32)
17399 aarch64_tls_size = 32;
17400 break;
17401 case AARCH64_CMODEL_LARGE:
17402 /* The maximum TLS size allowed under large is 16E.
17403 FIXME: 16E should be 64bit, we only support 48bit offset now. */
17404 if (aarch64_tls_size > 48)
17405 aarch64_tls_size = 48;
17406 break;
17407 default:
17408 gcc_unreachable ();
17411 return;
17414 /* Return the CPU corresponding to the enum CPU. */
17416 static const struct processor *
17417 aarch64_get_tune_cpu (enum aarch64_processor cpu)
17419 gcc_assert (cpu != aarch64_none);
17421 return &all_cores[cpu];
17424 /* Return the architecture corresponding to the enum ARCH. */
17426 static const struct processor *
17427 aarch64_get_arch (enum aarch64_arch arch)
17429 gcc_assert (arch != aarch64_no_arch);
17431 return &all_architectures[arch];
17434 /* Parse STRING looking for options in the format:
17435 string :: option:string
17436 option :: name=substring
17437 name :: {a-z}
17438 substring :: defined by option. */
17440 static void
17441 aarch64_parse_override_string (const char* input_string,
17442 struct tune_params* tune)
17444 const char separator = ':';
17445 size_t string_length = strlen (input_string) + 1;
17446 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17447 char *string = string_root;
17448 strncpy (string, input_string, string_length);
17449 string[string_length - 1] = '\0';
17451 char* ntoken = string;
17453 while ((ntoken = strchr (string, separator)))
17455 size_t token_length = ntoken - string;
17456 /* Make this substring look like a string. */
17457 *ntoken = '\0';
17458 aarch64_parse_one_override_token (string, token_length, tune);
17459 string = ++ntoken;
17462 /* One last option to parse. */
17463 aarch64_parse_one_override_token (string, strlen (string), tune);
17464 free (string_root);
17467 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17468 are best for a generic target with the currently-enabled architecture
17469 extensions. */
17470 static void
17471 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17473 /* Neoverse V1 is the only core that is known to benefit from
17474 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17475 point enabling it for SVE2 and above. */
17476 if (TARGET_SVE2)
17477 current_tune.extra_tuning_flags
17478 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17481 static void
17482 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17484 if (accepted_branch_protection_string)
17486 opts->x_aarch64_branch_protection_string
17487 = xstrdup (accepted_branch_protection_string);
17490 /* PR 70044: We have to be careful about being called multiple times for the
17491 same function. This means all changes should be repeatable. */
17493 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17494 Disable the frame pointer flag so the mid-end will not use a frame
17495 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17496 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17497 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17498 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17499 if (opts->x_flag_omit_frame_pointer == 0)
17500 opts->x_flag_omit_frame_pointer = 2;
17502 /* If not optimizing for size, set the default
17503 alignment to what the target wants. */
17504 if (!opts->x_optimize_size)
17506 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17507 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17508 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17509 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17510 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17511 opts->x_str_align_functions = aarch64_tune_params.function_align;
17514 /* We default to no pc-relative literal loads. */
17516 aarch64_pcrelative_literal_loads = false;
17518 /* If -mpc-relative-literal-loads is set on the command line, this
17519 implies that the user asked for PC relative literal loads. */
17520 if (opts->x_pcrelative_literal_loads == 1)
17521 aarch64_pcrelative_literal_loads = true;
17523 /* In the tiny memory model it makes no sense to disallow PC relative
17524 literal pool loads. */
17525 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17526 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17527 aarch64_pcrelative_literal_loads = true;
17529 /* When enabling the lower precision Newton series for the square root, also
17530 enable it for the reciprocal square root, since the latter is an
17531 intermediary step for the former. */
17532 if (flag_mlow_precision_sqrt)
17533 flag_mrecip_low_precision_sqrt = true;
17536 /* 'Unpack' up the internal tuning structs and update the options
17537 in OPTS. The caller must have set up selected_tune and selected_arch
17538 as all the other target-specific codegen decisions are
17539 derived from them. */
17541 void
17542 aarch64_override_options_internal (struct gcc_options *opts)
17544 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17545 aarch64_tune_flags = tune->flags;
17546 aarch64_tune = tune->sched_core;
17547 /* Make a copy of the tuning parameters attached to the core, which
17548 we may later overwrite. */
17549 aarch64_tune_params = *(tune->tune);
17550 if (tune->tune == &generic_tunings)
17551 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17553 if (opts->x_aarch64_override_tune_string)
17554 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17555 &aarch64_tune_params);
17557 /* This target defaults to strict volatile bitfields. */
17558 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17559 opts->x_flag_strict_volatile_bitfields = 1;
17561 if (aarch64_stack_protector_guard == SSP_GLOBAL
17562 && opts->x_aarch64_stack_protector_guard_offset_str)
17564 error ("incompatible options %<-mstack-protector-guard=global%> and "
17565 "%<-mstack-protector-guard-offset=%s%>",
17566 aarch64_stack_protector_guard_offset_str);
17569 if (aarch64_stack_protector_guard == SSP_SYSREG
17570 && !(opts->x_aarch64_stack_protector_guard_offset_str
17571 && opts->x_aarch64_stack_protector_guard_reg_str))
17573 error ("both %<-mstack-protector-guard-offset%> and "
17574 "%<-mstack-protector-guard-reg%> must be used "
17575 "with %<-mstack-protector-guard=sysreg%>");
17578 if (opts->x_aarch64_stack_protector_guard_reg_str)
17580 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17581 error ("specify a system register with a small string length");
17584 if (opts->x_aarch64_stack_protector_guard_offset_str)
17586 char *end;
17587 const char *str = aarch64_stack_protector_guard_offset_str;
17588 errno = 0;
17589 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17590 if (!*str || *end || errno)
17591 error ("%qs is not a valid offset in %qs", str,
17592 "-mstack-protector-guard-offset=");
17593 aarch64_stack_protector_guard_offset = offs;
17596 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17597 && !fixed_regs[R18_REGNUM])
17598 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17600 initialize_aarch64_code_model (opts);
17601 initialize_aarch64_tls_size (opts);
17603 int queue_depth = 0;
17604 switch (aarch64_tune_params.autoprefetcher_model)
17606 case tune_params::AUTOPREFETCHER_OFF:
17607 queue_depth = -1;
17608 break;
17609 case tune_params::AUTOPREFETCHER_WEAK:
17610 queue_depth = 0;
17611 break;
17612 case tune_params::AUTOPREFETCHER_STRONG:
17613 queue_depth = max_insn_queue_index + 1;
17614 break;
17615 default:
17616 gcc_unreachable ();
17619 /* We don't mind passing in global_options_set here as we don't use
17620 the *options_set structs anyway. */
17621 SET_OPTION_IF_UNSET (opts, &global_options_set,
17622 param_sched_autopref_queue_depth, queue_depth);
17624 /* If using Advanced SIMD only for autovectorization disable SVE vector costs
17625 comparison. */
17626 if (aarch64_autovec_preference == 1)
17627 SET_OPTION_IF_UNSET (opts, &global_options_set,
17628 aarch64_sve_compare_costs, 0);
17630 /* Set up parameters to be used in prefetching algorithm. Do not
17631 override the defaults unless we are tuning for a core we have
17632 researched values for. */
17633 if (aarch64_tune_params.prefetch->num_slots > 0)
17634 SET_OPTION_IF_UNSET (opts, &global_options_set,
17635 param_simultaneous_prefetches,
17636 aarch64_tune_params.prefetch->num_slots);
17637 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17638 SET_OPTION_IF_UNSET (opts, &global_options_set,
17639 param_l1_cache_size,
17640 aarch64_tune_params.prefetch->l1_cache_size);
17641 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17642 SET_OPTION_IF_UNSET (opts, &global_options_set,
17643 param_l1_cache_line_size,
17644 aarch64_tune_params.prefetch->l1_cache_line_size);
17646 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17648 SET_OPTION_IF_UNSET (opts, &global_options_set,
17649 param_destruct_interfere_size,
17650 aarch64_tune_params.prefetch->l1_cache_line_size);
17651 SET_OPTION_IF_UNSET (opts, &global_options_set,
17652 param_construct_interfere_size,
17653 aarch64_tune_params.prefetch->l1_cache_line_size);
17655 else
17657 /* For a generic AArch64 target, cover the current range of cache line
17658 sizes. */
17659 SET_OPTION_IF_UNSET (opts, &global_options_set,
17660 param_destruct_interfere_size,
17661 256);
17662 SET_OPTION_IF_UNSET (opts, &global_options_set,
17663 param_construct_interfere_size,
17664 64);
17667 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17668 SET_OPTION_IF_UNSET (opts, &global_options_set,
17669 param_l2_cache_size,
17670 aarch64_tune_params.prefetch->l2_cache_size);
17671 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17672 SET_OPTION_IF_UNSET (opts, &global_options_set,
17673 param_prefetch_dynamic_strides, 0);
17674 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17675 SET_OPTION_IF_UNSET (opts, &global_options_set,
17676 param_prefetch_minimum_stride,
17677 aarch64_tune_params.prefetch->minimum_stride);
17679 /* Use the alternative scheduling-pressure algorithm by default. */
17680 SET_OPTION_IF_UNSET (opts, &global_options_set,
17681 param_sched_pressure_algorithm,
17682 SCHED_PRESSURE_MODEL);
17684 /* Validate the guard size. */
17685 int guard_size = param_stack_clash_protection_guard_size;
17687 if (guard_size != 12 && guard_size != 16)
17688 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17689 "size. Given value %d (%llu KB) is out of range",
17690 guard_size, (1ULL << guard_size) / 1024ULL);
17692 /* Enforce that interval is the same size as size so the mid-end does the
17693 right thing. */
17694 SET_OPTION_IF_UNSET (opts, &global_options_set,
17695 param_stack_clash_protection_probe_interval,
17696 guard_size);
17698 /* The maybe_set calls won't update the value if the user has explicitly set
17699 one. Which means we need to validate that probing interval and guard size
17700 are equal. */
17701 int probe_interval
17702 = param_stack_clash_protection_probe_interval;
17703 if (guard_size != probe_interval)
17704 error ("stack clash guard size %<%d%> must be equal to probing interval "
17705 "%<%d%>", guard_size, probe_interval);
17707 /* Enable sw prefetching at specified optimization level for
17708 CPUS that have prefetch. Lower optimization level threshold by 1
17709 when profiling is enabled. */
17710 if (opts->x_flag_prefetch_loop_arrays < 0
17711 && !opts->x_optimize_size
17712 && aarch64_tune_params.prefetch->default_opt_level >= 0
17713 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
17714 opts->x_flag_prefetch_loop_arrays = 1;
17716 aarch64_override_options_after_change_1 (opts);
17719 /* Print a hint with a suggestion for a core or architecture name that
17720 most closely resembles what the user passed in STR. ARCH is true if
17721 the user is asking for an architecture name. ARCH is false if the user
17722 is asking for a core name. */
17724 static void
17725 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
17727 auto_vec<const char *> candidates;
17728 const struct processor *entry = arch ? all_architectures : all_cores;
17729 for (; entry->name != NULL; entry++)
17730 candidates.safe_push (entry->name);
17732 #ifdef HAVE_LOCAL_CPU_DETECT
17733 /* Add also "native" as possible value. */
17734 if (arch)
17735 candidates.safe_push ("native");
17736 #endif
17738 char *s;
17739 const char *hint = candidates_list_and_hint (str, s, candidates);
17740 if (hint)
17741 inform (input_location, "valid arguments are: %s;"
17742 " did you mean %qs?", s, hint);
17743 else
17744 inform (input_location, "valid arguments are: %s", s);
17746 XDELETEVEC (s);
17749 /* Print a hint with a suggestion for a core name that most closely resembles
17750 what the user passed in STR. */
17752 inline static void
17753 aarch64_print_hint_for_core (const char *str)
17755 aarch64_print_hint_for_core_or_arch (str, false);
17758 /* Print a hint with a suggestion for an architecture name that most closely
17759 resembles what the user passed in STR. */
17761 inline static void
17762 aarch64_print_hint_for_arch (const char *str)
17764 aarch64_print_hint_for_core_or_arch (str, true);
17768 /* Print a hint with a suggestion for an extension name
17769 that most closely resembles what the user passed in STR. */
17771 void
17772 aarch64_print_hint_for_extensions (const std::string &str)
17774 auto_vec<const char *> candidates;
17775 aarch64_get_all_extension_candidates (&candidates);
17776 char *s;
17777 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
17778 if (hint)
17779 inform (input_location, "valid arguments are: %s;"
17780 " did you mean %qs?", s, hint);
17781 else
17782 inform (input_location, "valid arguments are: %s", s);
17784 XDELETEVEC (s);
17787 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
17788 specified in STR and throw errors if appropriate. Put the results if
17789 they are valid in RES and ISA_FLAGS. Return whether the option is
17790 valid. */
17792 static bool
17793 aarch64_validate_mcpu (const char *str, const struct processor **res,
17794 uint64_t *isa_flags)
17796 std::string invalid_extension;
17797 enum aarch64_parse_opt_result parse_res
17798 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
17800 if (parse_res == AARCH64_PARSE_OK)
17801 return true;
17803 switch (parse_res)
17805 case AARCH64_PARSE_MISSING_ARG:
17806 error ("missing cpu name in %<-mcpu=%s%>", str);
17807 break;
17808 case AARCH64_PARSE_INVALID_ARG:
17809 error ("unknown value %qs for %<-mcpu%>", str);
17810 aarch64_print_hint_for_core (str);
17811 break;
17812 case AARCH64_PARSE_INVALID_FEATURE:
17813 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
17814 invalid_extension.c_str (), str);
17815 aarch64_print_hint_for_extensions (invalid_extension);
17816 break;
17817 default:
17818 gcc_unreachable ();
17821 return false;
17824 /* Straight line speculation indicators. */
17825 enum aarch64_sls_hardening_type
17827 SLS_NONE = 0,
17828 SLS_RETBR = 1,
17829 SLS_BLR = 2,
17830 SLS_ALL = 3,
17832 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
17834 /* Return whether we should mitigatate Straight Line Speculation for the RET
17835 and BR instructions. */
17836 bool
17837 aarch64_harden_sls_retbr_p (void)
17839 return aarch64_sls_hardening & SLS_RETBR;
17842 /* Return whether we should mitigatate Straight Line Speculation for the BLR
17843 instruction. */
17844 bool
17845 aarch64_harden_sls_blr_p (void)
17847 return aarch64_sls_hardening & SLS_BLR;
17850 /* As of yet we only allow setting these options globally, in the future we may
17851 allow setting them per function. */
17852 static void
17853 aarch64_validate_sls_mitigation (const char *const_str)
17855 char *token_save = NULL;
17856 char *str = NULL;
17858 if (strcmp (const_str, "none") == 0)
17860 aarch64_sls_hardening = SLS_NONE;
17861 return;
17863 if (strcmp (const_str, "all") == 0)
17865 aarch64_sls_hardening = SLS_ALL;
17866 return;
17869 char *str_root = xstrdup (const_str);
17870 str = strtok_r (str_root, ",", &token_save);
17871 if (!str)
17872 error ("invalid argument given to %<-mharden-sls=%>");
17874 int temp = SLS_NONE;
17875 while (str)
17877 if (strcmp (str, "blr") == 0)
17878 temp |= SLS_BLR;
17879 else if (strcmp (str, "retbr") == 0)
17880 temp |= SLS_RETBR;
17881 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
17883 error ("%qs must be by itself for %<-mharden-sls=%>", str);
17884 break;
17886 else
17888 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
17889 break;
17891 str = strtok_r (NULL, ",", &token_save);
17893 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
17894 free (str_root);
17897 /* Parses CONST_STR for branch protection features specified in
17898 aarch64_branch_protect_types, and set any global variables required. Returns
17899 the parsing result and assigns LAST_STR to the last processed token from
17900 CONST_STR so that it can be used for error reporting. */
17902 static enum
17903 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
17904 char** last_str)
17906 char *str_root = xstrdup (const_str);
17907 char* token_save = NULL;
17908 char *str = strtok_r (str_root, "+", &token_save);
17909 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
17910 if (!str)
17911 res = AARCH64_PARSE_MISSING_ARG;
17912 else
17914 char *next_str = strtok_r (NULL, "+", &token_save);
17915 /* Reset the branch protection features to their defaults. */
17916 aarch64_handle_no_branch_protection (NULL, NULL);
17918 while (str && res == AARCH64_PARSE_OK)
17920 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
17921 bool found = false;
17922 /* Search for this type. */
17923 while (type && type->name && !found && res == AARCH64_PARSE_OK)
17925 if (strcmp (str, type->name) == 0)
17927 found = true;
17928 res = type->handler (str, next_str);
17929 str = next_str;
17930 next_str = strtok_r (NULL, "+", &token_save);
17932 else
17933 type++;
17935 if (found && res == AARCH64_PARSE_OK)
17937 bool found_subtype = true;
17938 /* Loop through each token until we find one that isn't a
17939 subtype. */
17940 while (found_subtype)
17942 found_subtype = false;
17943 const aarch64_branch_protect_type *subtype = type->subtypes;
17944 /* Search for the subtype. */
17945 while (str && subtype && subtype->name && !found_subtype
17946 && res == AARCH64_PARSE_OK)
17948 if (strcmp (str, subtype->name) == 0)
17950 found_subtype = true;
17951 res = subtype->handler (str, next_str);
17952 str = next_str;
17953 next_str = strtok_r (NULL, "+", &token_save);
17955 else
17956 subtype++;
17960 else if (!found)
17961 res = AARCH64_PARSE_INVALID_ARG;
17964 /* Copy the last processed token into the argument to pass it back.
17965 Used by option and attribute validation to print the offending token. */
17966 if (last_str)
17968 if (str) strcpy (*last_str, str);
17969 else *last_str = NULL;
17971 if (res == AARCH64_PARSE_OK)
17973 /* If needed, alloc the accepted string then copy in const_str.
17974 Used by override_option_after_change_1. */
17975 if (!accepted_branch_protection_string)
17976 accepted_branch_protection_string = (char *) xmalloc (
17977 BRANCH_PROTECT_STR_MAX
17978 + 1);
17979 strncpy (accepted_branch_protection_string, const_str,
17980 BRANCH_PROTECT_STR_MAX + 1);
17981 /* Forcibly null-terminate. */
17982 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
17984 return res;
17987 static bool
17988 aarch64_validate_mbranch_protection (const char *const_str)
17990 char *str = (char *) xmalloc (strlen (const_str));
17991 enum aarch64_parse_opt_result res =
17992 aarch64_parse_branch_protection (const_str, &str);
17993 if (res == AARCH64_PARSE_INVALID_ARG)
17994 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
17995 else if (res == AARCH64_PARSE_MISSING_ARG)
17996 error ("missing argument for %<-mbranch-protection=%>");
17997 free (str);
17998 return res == AARCH64_PARSE_OK;
18001 /* Validate a command-line -march option. Parse the arch and extensions
18002 (if any) specified in STR and throw errors if appropriate. Put the
18003 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18004 option is valid. */
18006 static bool
18007 aarch64_validate_march (const char *str, const struct processor **res,
18008 uint64_t *isa_flags)
18010 std::string invalid_extension;
18011 enum aarch64_parse_opt_result parse_res
18012 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18014 if (parse_res == AARCH64_PARSE_OK)
18015 return true;
18017 switch (parse_res)
18019 case AARCH64_PARSE_MISSING_ARG:
18020 error ("missing arch name in %<-march=%s%>", str);
18021 break;
18022 case AARCH64_PARSE_INVALID_ARG:
18023 error ("unknown value %qs for %<-march%>", str);
18024 aarch64_print_hint_for_arch (str);
18025 break;
18026 case AARCH64_PARSE_INVALID_FEATURE:
18027 error ("invalid feature modifier %qs in %<-march=%s%>",
18028 invalid_extension.c_str (), str);
18029 aarch64_print_hint_for_extensions (invalid_extension);
18030 break;
18031 default:
18032 gcc_unreachable ();
18035 return false;
18038 /* Validate a command-line -mtune option. Parse the cpu
18039 specified in STR and throw errors if appropriate. Put the
18040 result, if it is valid, in RES. Return whether the option is
18041 valid. */
18043 static bool
18044 aarch64_validate_mtune (const char *str, const struct processor **res)
18046 enum aarch64_parse_opt_result parse_res
18047 = aarch64_parse_tune (str, res);
18049 if (parse_res == AARCH64_PARSE_OK)
18050 return true;
18052 switch (parse_res)
18054 case AARCH64_PARSE_MISSING_ARG:
18055 error ("missing cpu name in %<-mtune=%s%>", str);
18056 break;
18057 case AARCH64_PARSE_INVALID_ARG:
18058 error ("unknown value %qs for %<-mtune%>", str);
18059 aarch64_print_hint_for_core (str);
18060 break;
18061 default:
18062 gcc_unreachable ();
18064 return false;
18067 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18069 static poly_uint16
18070 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18072 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18073 on big-endian targets, so we would need to forbid subregs that convert
18074 from one to the other. By default a reinterpret sequence would then
18075 involve a store to memory in one mode and a load back in the other.
18076 Even if we optimize that sequence using reverse instructions,
18077 it would still be a significant potential overhead.
18079 For now, it seems better to generate length-agnostic code for that
18080 case instead. */
18081 if (value == SVE_SCALABLE
18082 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18083 return poly_uint16 (2, 2);
18084 else
18085 return (int) value / 64;
18088 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18089 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18090 tuning structs. In particular it must set selected_tune and
18091 aarch64_isa_flags that define the available ISA features and tuning
18092 decisions. It must also set selected_arch as this will be used to
18093 output the .arch asm tags for each function. */
18095 static void
18096 aarch64_override_options (void)
18098 uint64_t cpu_isa = 0;
18099 uint64_t arch_isa = 0;
18100 aarch64_isa_flags = 0;
18102 const struct processor *cpu = NULL;
18103 const struct processor *arch = NULL;
18104 const struct processor *tune = NULL;
18106 if (aarch64_harden_sls_string)
18107 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18109 if (aarch64_branch_protection_string)
18110 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
18112 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18113 If either of -march or -mtune is given, they override their
18114 respective component of -mcpu. */
18115 if (aarch64_cpu_string)
18116 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18118 if (aarch64_arch_string)
18119 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18121 if (aarch64_tune_string)
18122 aarch64_validate_mtune (aarch64_tune_string, &tune);
18124 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18125 SUBTARGET_OVERRIDE_OPTIONS;
18126 #endif
18128 if (cpu && arch)
18130 /* If both -mcpu and -march are specified, warn if they are not
18131 architecturally compatible and prefer the -march ISA flags. */
18132 if (arch->arch != cpu->arch)
18134 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18135 aarch64_cpu_string,
18136 aarch64_arch_string);
18139 selected_arch = arch->arch;
18140 aarch64_isa_flags = arch_isa;
18142 else if (cpu)
18144 selected_arch = cpu->arch;
18145 aarch64_isa_flags = cpu_isa;
18147 else if (arch)
18149 cpu = &all_cores[arch->ident];
18150 selected_arch = arch->arch;
18151 aarch64_isa_flags = arch_isa;
18153 else
18155 /* No -mcpu or -march specified, so use the default CPU. */
18156 cpu = &all_cores[TARGET_CPU_DEFAULT];
18157 selected_arch = cpu->arch;
18158 aarch64_isa_flags = cpu->flags;
18161 selected_tune = tune ? tune->ident : cpu->ident;
18163 if (aarch64_enable_bti == 2)
18165 #ifdef TARGET_ENABLE_BTI
18166 aarch64_enable_bti = 1;
18167 #else
18168 aarch64_enable_bti = 0;
18169 #endif
18172 /* Return address signing is currently not supported for ILP32 targets. For
18173 LP64 targets use the configured option in the absence of a command-line
18174 option for -mbranch-protection. */
18175 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18177 #ifdef TARGET_ENABLE_PAC_RET
18178 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
18179 #else
18180 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
18181 #endif
18184 #ifndef HAVE_AS_MABI_OPTION
18185 /* The compiler may have been configured with 2.23.* binutils, which does
18186 not have support for ILP32. */
18187 if (TARGET_ILP32)
18188 error ("assembler does not support %<-mabi=ilp32%>");
18189 #endif
18191 /* Convert -msve-vector-bits to a VG count. */
18192 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18194 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
18195 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18197 /* The pass to insert speculation tracking runs before
18198 shrink-wrapping and the latter does not know how to update the
18199 tracking status. So disable it in this case. */
18200 if (aarch64_track_speculation)
18201 flag_shrink_wrap = 0;
18203 aarch64_override_options_internal (&global_options);
18205 /* Save these options as the default ones in case we push and pop them later
18206 while processing functions with potential target attributes. */
18207 target_option_default_node = target_option_current_node
18208 = build_target_option_node (&global_options, &global_options_set);
18211 /* Implement targetm.override_options_after_change. */
18213 static void
18214 aarch64_override_options_after_change (void)
18216 aarch64_override_options_after_change_1 (&global_options);
18219 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18220 static char *
18221 aarch64_offload_options (void)
18223 if (TARGET_ILP32)
18224 return xstrdup ("-foffload-abi=ilp32");
18225 else
18226 return xstrdup ("-foffload-abi=lp64");
18229 static struct machine_function *
18230 aarch64_init_machine_status (void)
18232 struct machine_function *machine;
18233 machine = ggc_cleared_alloc<machine_function> ();
18234 return machine;
18237 void
18238 aarch64_init_expanders (void)
18240 init_machine_status = aarch64_init_machine_status;
18243 /* A checking mechanism for the implementation of the various code models. */
18244 static void
18245 initialize_aarch64_code_model (struct gcc_options *opts)
18247 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18248 switch (opts->x_aarch64_cmodel_var)
18250 case AARCH64_CMODEL_TINY:
18251 if (opts->x_flag_pic)
18252 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18253 break;
18254 case AARCH64_CMODEL_SMALL:
18255 if (opts->x_flag_pic)
18257 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18258 aarch64_cmodel = (flag_pic == 2
18259 ? AARCH64_CMODEL_SMALL_PIC
18260 : AARCH64_CMODEL_SMALL_SPIC);
18261 #else
18262 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18263 #endif
18265 break;
18266 case AARCH64_CMODEL_LARGE:
18267 if (opts->x_flag_pic)
18268 sorry ("code model %qs with %<-f%s%>", "large",
18269 opts->x_flag_pic > 1 ? "PIC" : "pic");
18270 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18271 sorry ("code model %qs not supported in ilp32 mode", "large");
18272 break;
18273 case AARCH64_CMODEL_TINY_PIC:
18274 case AARCH64_CMODEL_SMALL_PIC:
18275 case AARCH64_CMODEL_SMALL_SPIC:
18276 gcc_unreachable ();
18280 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18281 using the information saved in PTR. */
18283 static void
18284 aarch64_option_restore (struct gcc_options *opts,
18285 struct gcc_options * /* opts_set */,
18286 struct cl_target_option * /* ptr */)
18288 aarch64_override_options_internal (opts);
18291 /* Implement TARGET_OPTION_PRINT. */
18293 static void
18294 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18296 const struct processor *cpu
18297 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18298 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18299 std::string extension
18300 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_isa_flags,
18301 arch->flags);
18303 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18304 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18305 arch->name, extension.c_str ());
18308 static GTY(()) tree aarch64_previous_fndecl;
18310 void
18311 aarch64_reset_previous_fndecl (void)
18313 aarch64_previous_fndecl = NULL;
18316 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18317 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18318 make sure optab availability predicates are recomputed when necessary. */
18320 void
18321 aarch64_save_restore_target_globals (tree new_tree)
18323 if (TREE_TARGET_GLOBALS (new_tree))
18324 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18325 else if (new_tree == target_option_default_node)
18326 restore_target_globals (&default_target_globals);
18327 else
18328 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18331 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18332 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18333 of the function, if such exists. This function may be called multiple
18334 times on a single function so use aarch64_previous_fndecl to avoid
18335 setting up identical state. */
18337 static void
18338 aarch64_set_current_function (tree fndecl)
18340 if (!fndecl || fndecl == aarch64_previous_fndecl)
18341 return;
18343 tree old_tree = (aarch64_previous_fndecl
18344 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18345 : NULL_TREE);
18347 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18349 /* If current function has no attributes but the previous one did,
18350 use the default node. */
18351 if (!new_tree && old_tree)
18352 new_tree = target_option_default_node;
18354 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18355 the default have been handled by aarch64_save_restore_target_globals from
18356 aarch64_pragma_target_parse. */
18357 if (old_tree == new_tree)
18358 return;
18360 aarch64_previous_fndecl = fndecl;
18362 /* First set the target options. */
18363 cl_target_option_restore (&global_options, &global_options_set,
18364 TREE_TARGET_OPTION (new_tree));
18366 aarch64_save_restore_target_globals (new_tree);
18369 /* Enum describing the various ways we can handle attributes.
18370 In many cases we can reuse the generic option handling machinery. */
18372 enum aarch64_attr_opt_type
18374 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18375 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18376 aarch64_attr_enum, /* Attribute sets an enum variable. */
18377 aarch64_attr_custom /* Attribute requires a custom handling function. */
18380 /* All the information needed to handle a target attribute.
18381 NAME is the name of the attribute.
18382 ATTR_TYPE specifies the type of behavior of the attribute as described
18383 in the definition of enum aarch64_attr_opt_type.
18384 ALLOW_NEG is true if the attribute supports a "no-" form.
18385 HANDLER is the function that takes the attribute string as an argument
18386 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18387 OPT_NUM is the enum specifying the option that the attribute modifies.
18388 This is needed for attributes that mirror the behavior of a command-line
18389 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18390 aarch64_attr_enum. */
18392 struct aarch64_attribute_info
18394 const char *name;
18395 enum aarch64_attr_opt_type attr_type;
18396 bool allow_neg;
18397 bool (*handler) (const char *);
18398 enum opt_code opt_num;
18401 /* Handle the ARCH_STR argument to the arch= target attribute. */
18403 static bool
18404 aarch64_handle_attr_arch (const char *str)
18406 const struct processor *tmp_arch = NULL;
18407 std::string invalid_extension;
18408 enum aarch64_parse_opt_result parse_res
18409 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
18411 if (parse_res == AARCH64_PARSE_OK)
18413 gcc_assert (tmp_arch);
18414 selected_arch = tmp_arch->arch;
18415 return true;
18418 switch (parse_res)
18420 case AARCH64_PARSE_MISSING_ARG:
18421 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18422 break;
18423 case AARCH64_PARSE_INVALID_ARG:
18424 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18425 aarch64_print_hint_for_arch (str);
18426 break;
18427 case AARCH64_PARSE_INVALID_FEATURE:
18428 error ("invalid feature modifier %s of value %qs in "
18429 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18430 aarch64_print_hint_for_extensions (invalid_extension);
18431 break;
18432 default:
18433 gcc_unreachable ();
18436 return false;
18439 /* Handle the argument CPU_STR to the cpu= target attribute. */
18441 static bool
18442 aarch64_handle_attr_cpu (const char *str)
18444 const struct processor *tmp_cpu = NULL;
18445 std::string invalid_extension;
18446 enum aarch64_parse_opt_result parse_res
18447 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
18449 if (parse_res == AARCH64_PARSE_OK)
18451 gcc_assert (tmp_cpu);
18452 selected_tune = tmp_cpu->ident;
18453 selected_arch = tmp_cpu->arch;
18454 return true;
18457 switch (parse_res)
18459 case AARCH64_PARSE_MISSING_ARG:
18460 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18461 break;
18462 case AARCH64_PARSE_INVALID_ARG:
18463 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18464 aarch64_print_hint_for_core (str);
18465 break;
18466 case AARCH64_PARSE_INVALID_FEATURE:
18467 error ("invalid feature modifier %qs of value %qs in "
18468 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18469 aarch64_print_hint_for_extensions (invalid_extension);
18470 break;
18471 default:
18472 gcc_unreachable ();
18475 return false;
18478 /* Handle the argument STR to the branch-protection= attribute. */
18480 static bool
18481 aarch64_handle_attr_branch_protection (const char* str)
18483 char *err_str = (char *) xmalloc (strlen (str) + 1);
18484 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
18485 &err_str);
18486 bool success = false;
18487 switch (res)
18489 case AARCH64_PARSE_MISSING_ARG:
18490 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18491 " attribute");
18492 break;
18493 case AARCH64_PARSE_INVALID_ARG:
18494 error ("invalid protection type %qs in %<target(\"branch-protection"
18495 "=\")%> pragma or attribute", err_str);
18496 break;
18497 case AARCH64_PARSE_OK:
18498 success = true;
18499 /* Fall through. */
18500 case AARCH64_PARSE_INVALID_FEATURE:
18501 break;
18502 default:
18503 gcc_unreachable ();
18505 free (err_str);
18506 return success;
18509 /* Handle the argument STR to the tune= target attribute. */
18511 static bool
18512 aarch64_handle_attr_tune (const char *str)
18514 const struct processor *tmp_tune = NULL;
18515 enum aarch64_parse_opt_result parse_res
18516 = aarch64_parse_tune (str, &tmp_tune);
18518 if (parse_res == AARCH64_PARSE_OK)
18520 gcc_assert (tmp_tune);
18521 selected_tune = tmp_tune->ident;
18522 return true;
18525 switch (parse_res)
18527 case AARCH64_PARSE_INVALID_ARG:
18528 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18529 aarch64_print_hint_for_core (str);
18530 break;
18531 default:
18532 gcc_unreachable ();
18535 return false;
18538 /* Parse an architecture extensions target attribute string specified in STR.
18539 For example "+fp+nosimd". Show any errors if needed. Return TRUE
18540 if successful. Update aarch64_isa_flags to reflect the ISA features
18541 modified. */
18543 static bool
18544 aarch64_handle_attr_isa_flags (char *str)
18546 enum aarch64_parse_opt_result parse_res;
18547 uint64_t isa_flags = aarch64_isa_flags;
18549 /* We allow "+nothing" in the beginning to clear out all architectural
18550 features if the user wants to handpick specific features. */
18551 if (strncmp ("+nothing", str, 8) == 0)
18553 isa_flags = 0;
18554 str += 8;
18557 std::string invalid_extension;
18558 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18560 if (parse_res == AARCH64_PARSE_OK)
18562 aarch64_isa_flags = isa_flags;
18563 return true;
18566 switch (parse_res)
18568 case AARCH64_PARSE_MISSING_ARG:
18569 error ("missing value in %<target()%> pragma or attribute");
18570 break;
18572 case AARCH64_PARSE_INVALID_FEATURE:
18573 error ("invalid feature modifier %qs of value %qs in "
18574 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18575 break;
18577 default:
18578 gcc_unreachable ();
18581 return false;
18584 /* The target attributes that we support. On top of these we also support just
18585 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18586 handled explicitly in aarch64_process_one_target_attr. */
18588 static const struct aarch64_attribute_info aarch64_attributes[] =
18590 { "general-regs-only", aarch64_attr_mask, false, NULL,
18591 OPT_mgeneral_regs_only },
18592 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18593 OPT_mfix_cortex_a53_835769 },
18594 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18595 OPT_mfix_cortex_a53_843419 },
18596 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18597 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18598 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18599 OPT_momit_leaf_frame_pointer },
18600 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18601 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18602 OPT_march_ },
18603 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18604 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18605 OPT_mtune_ },
18606 { "branch-protection", aarch64_attr_custom, false,
18607 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18608 { "sign-return-address", aarch64_attr_enum, false, NULL,
18609 OPT_msign_return_address_ },
18610 { "outline-atomics", aarch64_attr_bool, true, NULL,
18611 OPT_moutline_atomics},
18612 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
18615 /* Parse ARG_STR which contains the definition of one target attribute.
18616 Show appropriate errors if any or return true if the attribute is valid. */
18618 static bool
18619 aarch64_process_one_target_attr (char *arg_str)
18621 bool invert = false;
18623 size_t len = strlen (arg_str);
18625 if (len == 0)
18627 error ("malformed %<target()%> pragma or attribute");
18628 return false;
18631 char *str_to_check = (char *) alloca (len + 1);
18632 strcpy (str_to_check, arg_str);
18634 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18635 It is easier to detect and handle it explicitly here rather than going
18636 through the machinery for the rest of the target attributes in this
18637 function. */
18638 if (*str_to_check == '+')
18639 return aarch64_handle_attr_isa_flags (str_to_check);
18641 if (len > 3 && startswith (str_to_check, "no-"))
18643 invert = true;
18644 str_to_check += 3;
18646 char *arg = strchr (str_to_check, '=');
18648 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18649 and point ARG to "foo". */
18650 if (arg)
18652 *arg = '\0';
18653 arg++;
18655 const struct aarch64_attribute_info *p_attr;
18656 bool found = false;
18657 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18659 /* If the names don't match up, or the user has given an argument
18660 to an attribute that doesn't accept one, or didn't give an argument
18661 to an attribute that expects one, fail to match. */
18662 if (strcmp (str_to_check, p_attr->name) != 0)
18663 continue;
18665 found = true;
18666 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18667 || p_attr->attr_type == aarch64_attr_enum;
18669 if (attr_need_arg_p ^ (arg != NULL))
18671 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18672 return false;
18675 /* If the name matches but the attribute does not allow "no-" versions
18676 then we can't match. */
18677 if (invert && !p_attr->allow_neg)
18679 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18680 return false;
18683 switch (p_attr->attr_type)
18685 /* Has a custom handler registered.
18686 For example, cpu=, arch=, tune=. */
18687 case aarch64_attr_custom:
18688 gcc_assert (p_attr->handler);
18689 if (!p_attr->handler (arg))
18690 return false;
18691 break;
18693 /* Either set or unset a boolean option. */
18694 case aarch64_attr_bool:
18696 struct cl_decoded_option decoded;
18698 generate_option (p_attr->opt_num, NULL, !invert,
18699 CL_TARGET, &decoded);
18700 aarch64_handle_option (&global_options, &global_options_set,
18701 &decoded, input_location);
18702 break;
18704 /* Set or unset a bit in the target_flags. aarch64_handle_option
18705 should know what mask to apply given the option number. */
18706 case aarch64_attr_mask:
18708 struct cl_decoded_option decoded;
18709 /* We only need to specify the option number.
18710 aarch64_handle_option will know which mask to apply. */
18711 decoded.opt_index = p_attr->opt_num;
18712 decoded.value = !invert;
18713 aarch64_handle_option (&global_options, &global_options_set,
18714 &decoded, input_location);
18715 break;
18717 /* Use the option setting machinery to set an option to an enum. */
18718 case aarch64_attr_enum:
18720 gcc_assert (arg);
18721 bool valid;
18722 int value;
18723 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
18724 &value, CL_TARGET);
18725 if (valid)
18727 set_option (&global_options, NULL, p_attr->opt_num, value,
18728 NULL, DK_UNSPECIFIED, input_location,
18729 global_dc);
18731 else
18733 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
18735 break;
18737 default:
18738 gcc_unreachable ();
18742 /* If we reached here we either have found an attribute and validated
18743 it or didn't match any. If we matched an attribute but its arguments
18744 were malformed we will have returned false already. */
18745 return found;
18748 /* Count how many times the character C appears in
18749 NULL-terminated string STR. */
18751 static unsigned int
18752 num_occurences_in_str (char c, char *str)
18754 unsigned int res = 0;
18755 while (*str != '\0')
18757 if (*str == c)
18758 res++;
18760 str++;
18763 return res;
18766 /* Parse the tree in ARGS that contains the target attribute information
18767 and update the global target options space. */
18769 bool
18770 aarch64_process_target_attr (tree args)
18772 if (TREE_CODE (args) == TREE_LIST)
18776 tree head = TREE_VALUE (args);
18777 if (head)
18779 if (!aarch64_process_target_attr (head))
18780 return false;
18782 args = TREE_CHAIN (args);
18783 } while (args);
18785 return true;
18788 if (TREE_CODE (args) != STRING_CST)
18790 error ("attribute %<target%> argument not a string");
18791 return false;
18794 size_t len = strlen (TREE_STRING_POINTER (args));
18795 char *str_to_check = (char *) alloca (len + 1);
18796 strcpy (str_to_check, TREE_STRING_POINTER (args));
18798 if (len == 0)
18800 error ("malformed %<target()%> pragma or attribute");
18801 return false;
18804 /* Used to catch empty spaces between commas i.e.
18805 attribute ((target ("attr1,,attr2"))). */
18806 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
18808 /* Handle multiple target attributes separated by ','. */
18809 char *token = strtok_r (str_to_check, ",", &str_to_check);
18811 unsigned int num_attrs = 0;
18812 while (token)
18814 num_attrs++;
18815 if (!aarch64_process_one_target_attr (token))
18817 /* Check if token is possibly an arch extension without
18818 leading '+'. */
18819 uint64_t isa_temp = 0;
18820 auto with_plus = std::string ("+") + token;
18821 enum aarch64_parse_opt_result ext_res
18822 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
18824 if (ext_res == AARCH64_PARSE_OK)
18825 error ("arch extension %<%s%> should be prefixed by %<+%>",
18826 token);
18827 else
18828 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
18829 return false;
18832 token = strtok_r (NULL, ",", &str_to_check);
18835 if (num_attrs != num_commas + 1)
18837 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
18838 return false;
18841 return true;
18844 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
18845 process attribute ((target ("..."))). */
18847 static bool
18848 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
18850 struct cl_target_option cur_target;
18851 bool ret;
18852 tree old_optimize;
18853 tree new_target, new_optimize;
18854 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18856 /* If what we're processing is the current pragma string then the
18857 target option node is already stored in target_option_current_node
18858 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
18859 having to re-parse the string. This is especially useful to keep
18860 arm_neon.h compile times down since that header contains a lot
18861 of intrinsics enclosed in pragmas. */
18862 if (!existing_target && args == current_target_pragma)
18864 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
18865 return true;
18867 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
18869 old_optimize
18870 = build_optimization_node (&global_options, &global_options_set);
18871 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
18873 /* If the function changed the optimization levels as well as setting
18874 target options, start with the optimizations specified. */
18875 if (func_optimize && func_optimize != old_optimize)
18876 cl_optimization_restore (&global_options, &global_options_set,
18877 TREE_OPTIMIZATION (func_optimize));
18879 /* Save the current target options to restore at the end. */
18880 cl_target_option_save (&cur_target, &global_options, &global_options_set);
18882 /* If fndecl already has some target attributes applied to it, unpack
18883 them so that we add this attribute on top of them, rather than
18884 overwriting them. */
18885 if (existing_target)
18887 struct cl_target_option *existing_options
18888 = TREE_TARGET_OPTION (existing_target);
18890 if (existing_options)
18891 cl_target_option_restore (&global_options, &global_options_set,
18892 existing_options);
18894 else
18895 cl_target_option_restore (&global_options, &global_options_set,
18896 TREE_TARGET_OPTION (target_option_current_node));
18898 ret = aarch64_process_target_attr (args);
18900 /* Set up any additional state. */
18901 if (ret)
18903 aarch64_override_options_internal (&global_options);
18904 /* Initialize SIMD builtins if we haven't already.
18905 Set current_target_pragma to NULL for the duration so that
18906 the builtin initialization code doesn't try to tag the functions
18907 being built with the attributes specified by any current pragma, thus
18908 going into an infinite recursion. */
18909 if (TARGET_SIMD)
18911 tree saved_current_target_pragma = current_target_pragma;
18912 current_target_pragma = NULL;
18913 aarch64_init_simd_builtins ();
18914 current_target_pragma = saved_current_target_pragma;
18916 new_target = build_target_option_node (&global_options,
18917 &global_options_set);
18919 else
18920 new_target = NULL;
18922 new_optimize = build_optimization_node (&global_options,
18923 &global_options_set);
18925 if (fndecl && ret)
18927 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
18929 if (old_optimize != new_optimize)
18930 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
18933 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
18935 if (old_optimize != new_optimize)
18936 cl_optimization_restore (&global_options, &global_options_set,
18937 TREE_OPTIMIZATION (old_optimize));
18938 return ret;
18941 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
18942 tri-bool options (yes, no, don't care) and the default value is
18943 DEF, determine whether to reject inlining. */
18945 static bool
18946 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
18947 int dont_care, int def)
18949 /* If the callee doesn't care, always allow inlining. */
18950 if (callee == dont_care)
18951 return true;
18953 /* If the caller doesn't care, always allow inlining. */
18954 if (caller == dont_care)
18955 return true;
18957 /* Otherwise, allow inlining if either the callee and caller values
18958 agree, or if the callee is using the default value. */
18959 return (callee == caller || callee == def);
18962 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
18963 to inline CALLEE into CALLER based on target-specific info.
18964 Make sure that the caller and callee have compatible architectural
18965 features. Then go through the other possible target attributes
18966 and see if they can block inlining. Try not to reject always_inline
18967 callees unless they are incompatible architecturally. */
18969 static bool
18970 aarch64_can_inline_p (tree caller, tree callee)
18972 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
18973 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
18975 struct cl_target_option *caller_opts
18976 = TREE_TARGET_OPTION (caller_tree ? caller_tree
18977 : target_option_default_node);
18979 struct cl_target_option *callee_opts
18980 = TREE_TARGET_OPTION (callee_tree ? callee_tree
18981 : target_option_default_node);
18983 /* Callee's ISA flags should be a subset of the caller's. */
18984 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
18985 != callee_opts->x_aarch64_isa_flags)
18986 return false;
18988 /* Allow non-strict aligned functions inlining into strict
18989 aligned ones. */
18990 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
18991 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
18992 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
18993 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
18994 return false;
18996 bool always_inline = lookup_attribute ("always_inline",
18997 DECL_ATTRIBUTES (callee));
18999 /* If the architectural features match up and the callee is always_inline
19000 then the other attributes don't matter. */
19001 if (always_inline)
19002 return true;
19004 if (caller_opts->x_aarch64_cmodel_var
19005 != callee_opts->x_aarch64_cmodel_var)
19006 return false;
19008 if (caller_opts->x_aarch64_tls_dialect
19009 != callee_opts->x_aarch64_tls_dialect)
19010 return false;
19012 /* Honour explicit requests to workaround errata. */
19013 if (!aarch64_tribools_ok_for_inlining_p (
19014 caller_opts->x_aarch64_fix_a53_err835769,
19015 callee_opts->x_aarch64_fix_a53_err835769,
19016 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19017 return false;
19019 if (!aarch64_tribools_ok_for_inlining_p (
19020 caller_opts->x_aarch64_fix_a53_err843419,
19021 callee_opts->x_aarch64_fix_a53_err843419,
19022 2, TARGET_FIX_ERR_A53_843419))
19023 return false;
19025 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19026 caller and calle and they don't match up, reject inlining. */
19027 if (!aarch64_tribools_ok_for_inlining_p (
19028 caller_opts->x_flag_omit_leaf_frame_pointer,
19029 callee_opts->x_flag_omit_leaf_frame_pointer,
19030 2, 1))
19031 return false;
19033 /* If the callee has specific tuning overrides, respect them. */
19034 if (callee_opts->x_aarch64_override_tune_string != NULL
19035 && caller_opts->x_aarch64_override_tune_string == NULL)
19036 return false;
19038 /* If the user specified tuning override strings for the
19039 caller and callee and they don't match up, reject inlining.
19040 We just do a string compare here, we don't analyze the meaning
19041 of the string, as it would be too costly for little gain. */
19042 if (callee_opts->x_aarch64_override_tune_string
19043 && caller_opts->x_aarch64_override_tune_string
19044 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19045 caller_opts->x_aarch64_override_tune_string) != 0))
19046 return false;
19048 return true;
19051 /* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't
19052 been already. */
19054 unsigned int
19055 aarch64_tlsdesc_abi_id ()
19057 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19058 if (!tlsdesc_abi.initialized_p ())
19060 HARD_REG_SET full_reg_clobbers;
19061 CLEAR_HARD_REG_SET (full_reg_clobbers);
19062 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19063 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19064 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19065 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19066 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19068 return tlsdesc_abi.id ();
19071 /* Return true if SYMBOL_REF X binds locally. */
19073 static bool
19074 aarch64_symbol_binds_local_p (const_rtx x)
19076 return (SYMBOL_REF_DECL (x)
19077 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19078 : SYMBOL_REF_LOCAL_P (x));
19081 /* Return true if SYMBOL_REF X is thread local */
19082 static bool
19083 aarch64_tls_symbol_p (rtx x)
19085 if (! TARGET_HAVE_TLS)
19086 return false;
19088 x = strip_salt (x);
19089 if (!SYMBOL_REF_P (x))
19090 return false;
19092 return SYMBOL_REF_TLS_MODEL (x) != 0;
19095 /* Classify a TLS symbol into one of the TLS kinds. */
19096 enum aarch64_symbol_type
19097 aarch64_classify_tls_symbol (rtx x)
19099 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19101 switch (tls_kind)
19103 case TLS_MODEL_GLOBAL_DYNAMIC:
19104 case TLS_MODEL_LOCAL_DYNAMIC:
19105 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19107 case TLS_MODEL_INITIAL_EXEC:
19108 switch (aarch64_cmodel)
19110 case AARCH64_CMODEL_TINY:
19111 case AARCH64_CMODEL_TINY_PIC:
19112 return SYMBOL_TINY_TLSIE;
19113 default:
19114 return SYMBOL_SMALL_TLSIE;
19117 case TLS_MODEL_LOCAL_EXEC:
19118 if (aarch64_tls_size == 12)
19119 return SYMBOL_TLSLE12;
19120 else if (aarch64_tls_size == 24)
19121 return SYMBOL_TLSLE24;
19122 else if (aarch64_tls_size == 32)
19123 return SYMBOL_TLSLE32;
19124 else if (aarch64_tls_size == 48)
19125 return SYMBOL_TLSLE48;
19126 else
19127 gcc_unreachable ();
19129 case TLS_MODEL_EMULATED:
19130 case TLS_MODEL_NONE:
19131 return SYMBOL_FORCE_TO_MEM;
19133 default:
19134 gcc_unreachable ();
19138 /* Return the correct method for accessing X + OFFSET, where X is either
19139 a SYMBOL_REF or LABEL_REF. */
19141 enum aarch64_symbol_type
19142 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19144 x = strip_salt (x);
19146 if (LABEL_REF_P (x))
19148 switch (aarch64_cmodel)
19150 case AARCH64_CMODEL_LARGE:
19151 return SYMBOL_FORCE_TO_MEM;
19153 case AARCH64_CMODEL_TINY_PIC:
19154 case AARCH64_CMODEL_TINY:
19155 return SYMBOL_TINY_ABSOLUTE;
19157 case AARCH64_CMODEL_SMALL_SPIC:
19158 case AARCH64_CMODEL_SMALL_PIC:
19159 case AARCH64_CMODEL_SMALL:
19160 return SYMBOL_SMALL_ABSOLUTE;
19162 default:
19163 gcc_unreachable ();
19167 if (SYMBOL_REF_P (x))
19169 if (aarch64_tls_symbol_p (x))
19170 return aarch64_classify_tls_symbol (x);
19172 switch (aarch64_cmodel)
19174 case AARCH64_CMODEL_TINY_PIC:
19175 case AARCH64_CMODEL_TINY:
19176 /* With -fPIC non-local symbols use the GOT. For orthogonality
19177 always use the GOT for extern weak symbols. */
19178 if ((flag_pic || SYMBOL_REF_WEAK (x))
19179 && !aarch64_symbol_binds_local_p (x))
19180 return SYMBOL_TINY_GOT;
19182 /* When we retrieve symbol + offset address, we have to make sure
19183 the offset does not cause overflow of the final address. But
19184 we have no way of knowing the address of symbol at compile time
19185 so we can't accurately say if the distance between the PC and
19186 symbol + offset is outside the addressible range of +/-1MB in the
19187 TINY code model. So we limit the maximum offset to +/-64KB and
19188 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19189 If offset_within_block_p is true we allow larger offsets. */
19190 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19191 || offset_within_block_p (x, offset)))
19192 return SYMBOL_FORCE_TO_MEM;
19194 return SYMBOL_TINY_ABSOLUTE;
19197 case AARCH64_CMODEL_SMALL_SPIC:
19198 case AARCH64_CMODEL_SMALL_PIC:
19199 case AARCH64_CMODEL_SMALL:
19200 if ((flag_pic || SYMBOL_REF_WEAK (x))
19201 && !aarch64_symbol_binds_local_p (x))
19202 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19203 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19205 /* Same reasoning as the tiny code model, but the offset cap here is
19206 1MB, allowing +/-3.9GB for the offset to the symbol. */
19207 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19208 || offset_within_block_p (x, offset)))
19209 return SYMBOL_FORCE_TO_MEM;
19211 return SYMBOL_SMALL_ABSOLUTE;
19213 case AARCH64_CMODEL_LARGE:
19214 /* This is alright even in PIC code as the constant
19215 pool reference is always PC relative and within
19216 the same translation unit. */
19217 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19218 return SYMBOL_SMALL_ABSOLUTE;
19219 else
19220 return SYMBOL_FORCE_TO_MEM;
19222 default:
19223 gcc_unreachable ();
19227 /* By default push everything into the constant pool. */
19228 return SYMBOL_FORCE_TO_MEM;
19231 bool
19232 aarch64_constant_address_p (rtx x)
19234 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19237 bool
19238 aarch64_legitimate_pic_operand_p (rtx x)
19240 poly_int64 offset;
19241 x = strip_offset_and_salt (x, &offset);
19242 if (SYMBOL_REF_P (x))
19243 return false;
19245 return true;
19248 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19249 that should be rematerialized rather than spilled. */
19251 static bool
19252 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19254 /* Support CSE and rematerialization of common constants. */
19255 if (CONST_INT_P (x)
19256 || CONST_DOUBLE_P (x))
19257 return true;
19259 /* Only accept variable-length vector constants if they can be
19260 handled directly.
19262 ??? It would be possible (but complex) to handle rematerialization
19263 of other constants via secondary reloads. */
19264 if (!GET_MODE_SIZE (mode).is_constant ())
19265 return aarch64_simd_valid_immediate (x, NULL);
19267 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19268 least be forced to memory and loaded from there. */
19269 if (CONST_VECTOR_P (x))
19270 return !targetm.cannot_force_const_mem (mode, x);
19272 /* Do not allow vector struct mode constants for Advanced SIMD.
19273 We could support 0 and -1 easily, but they need support in
19274 aarch64-simd.md. */
19275 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19276 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19277 return false;
19279 if (GET_CODE (x) == HIGH)
19280 x = XEXP (x, 0);
19282 /* Accept polynomial constants that can be calculated by using the
19283 destination of a move as the sole temporary. Constants that
19284 require a second temporary cannot be rematerialized (they can't be
19285 forced to memory and also aren't legitimate constants). */
19286 poly_int64 offset;
19287 if (poly_int_rtx_p (x, &offset))
19288 return aarch64_offset_temporaries (false, offset) <= 1;
19290 /* If an offset is being added to something else, we need to allow the
19291 base to be moved into the destination register, meaning that there
19292 are no free temporaries for the offset. */
19293 x = strip_offset_and_salt (x, &offset);
19294 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19295 return false;
19297 /* Do not allow const (plus (anchor_symbol, const_int)). */
19298 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19299 return false;
19301 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19302 so spilling them is better than rematerialization. */
19303 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19304 return true;
19306 /* Label references are always constant. */
19307 if (LABEL_REF_P (x))
19308 return true;
19310 return false;
19314 aarch64_load_tp (rtx target)
19316 if (!target
19317 || GET_MODE (target) != Pmode
19318 || !register_operand (target, Pmode))
19319 target = gen_reg_rtx (Pmode);
19321 /* Can return in any reg. */
19322 emit_insn (gen_aarch64_load_tp_hard (target));
19323 return target;
19326 /* On AAPCS systems, this is the "struct __va_list". */
19327 static GTY(()) tree va_list_type;
19329 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19330 Return the type to use as __builtin_va_list.
19332 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19334 struct __va_list
19336 void *__stack;
19337 void *__gr_top;
19338 void *__vr_top;
19339 int __gr_offs;
19340 int __vr_offs;
19341 }; */
19343 static tree
19344 aarch64_build_builtin_va_list (void)
19346 tree va_list_name;
19347 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19349 /* Create the type. */
19350 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19351 /* Give it the required name. */
19352 va_list_name = build_decl (BUILTINS_LOCATION,
19353 TYPE_DECL,
19354 get_identifier ("__va_list"),
19355 va_list_type);
19356 DECL_ARTIFICIAL (va_list_name) = 1;
19357 TYPE_NAME (va_list_type) = va_list_name;
19358 TYPE_STUB_DECL (va_list_type) = va_list_name;
19360 /* Create the fields. */
19361 f_stack = build_decl (BUILTINS_LOCATION,
19362 FIELD_DECL, get_identifier ("__stack"),
19363 ptr_type_node);
19364 f_grtop = build_decl (BUILTINS_LOCATION,
19365 FIELD_DECL, get_identifier ("__gr_top"),
19366 ptr_type_node);
19367 f_vrtop = build_decl (BUILTINS_LOCATION,
19368 FIELD_DECL, get_identifier ("__vr_top"),
19369 ptr_type_node);
19370 f_groff = build_decl (BUILTINS_LOCATION,
19371 FIELD_DECL, get_identifier ("__gr_offs"),
19372 integer_type_node);
19373 f_vroff = build_decl (BUILTINS_LOCATION,
19374 FIELD_DECL, get_identifier ("__vr_offs"),
19375 integer_type_node);
19377 /* Tell tree-stdarg pass about our internal offset fields.
19378 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
19379 purpose to identify whether the code is updating va_list internal
19380 offset fields through irregular way. */
19381 va_list_gpr_counter_field = f_groff;
19382 va_list_fpr_counter_field = f_vroff;
19384 DECL_ARTIFICIAL (f_stack) = 1;
19385 DECL_ARTIFICIAL (f_grtop) = 1;
19386 DECL_ARTIFICIAL (f_vrtop) = 1;
19387 DECL_ARTIFICIAL (f_groff) = 1;
19388 DECL_ARTIFICIAL (f_vroff) = 1;
19390 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19391 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19392 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19393 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19394 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19396 TYPE_FIELDS (va_list_type) = f_stack;
19397 DECL_CHAIN (f_stack) = f_grtop;
19398 DECL_CHAIN (f_grtop) = f_vrtop;
19399 DECL_CHAIN (f_vrtop) = f_groff;
19400 DECL_CHAIN (f_groff) = f_vroff;
19402 /* Compute its layout. */
19403 layout_type (va_list_type);
19405 return va_list_type;
19408 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19409 static void
19410 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19412 const CUMULATIVE_ARGS *cum;
19413 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19414 tree stack, grtop, vrtop, groff, vroff;
19415 tree t;
19416 int gr_save_area_size = cfun->va_list_gpr_size;
19417 int vr_save_area_size = cfun->va_list_fpr_size;
19418 int vr_offset;
19420 cum = &crtl->args.info;
19421 if (cfun->va_list_gpr_size)
19422 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19423 cfun->va_list_gpr_size);
19424 if (cfun->va_list_fpr_size)
19425 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19426 * UNITS_PER_VREG, cfun->va_list_fpr_size);
19428 if (!TARGET_FLOAT)
19430 gcc_assert (cum->aapcs_nvrn == 0);
19431 vr_save_area_size = 0;
19434 f_stack = TYPE_FIELDS (va_list_type_node);
19435 f_grtop = DECL_CHAIN (f_stack);
19436 f_vrtop = DECL_CHAIN (f_grtop);
19437 f_groff = DECL_CHAIN (f_vrtop);
19438 f_vroff = DECL_CHAIN (f_groff);
19440 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19441 NULL_TREE);
19442 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19443 NULL_TREE);
19444 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19445 NULL_TREE);
19446 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19447 NULL_TREE);
19448 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19449 NULL_TREE);
19451 /* Emit code to initialize STACK, which points to the next varargs stack
19452 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19453 by named arguments. STACK is 8-byte aligned. */
19454 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19455 if (cum->aapcs_stack_size > 0)
19456 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19457 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19458 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19460 /* Emit code to initialize GRTOP, the top of the GR save area.
19461 virtual_incoming_args_rtx should have been 16 byte aligned. */
19462 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19463 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19464 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19466 /* Emit code to initialize VRTOP, the top of the VR save area.
19467 This address is gr_save_area_bytes below GRTOP, rounded
19468 down to the next 16-byte boundary. */
19469 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19470 vr_offset = ROUND_UP (gr_save_area_size,
19471 STACK_BOUNDARY / BITS_PER_UNIT);
19473 if (vr_offset)
19474 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19475 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19476 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19478 /* Emit code to initialize GROFF, the offset from GRTOP of the
19479 next GPR argument. */
19480 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19481 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19482 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19484 /* Likewise emit code to initialize VROFF, the offset from FTOP
19485 of the next VR argument. */
19486 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19487 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19488 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19491 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19493 static tree
19494 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19495 gimple_seq *post_p ATTRIBUTE_UNUSED)
19497 tree addr;
19498 bool indirect_p;
19499 bool is_ha; /* is HFA or HVA. */
19500 bool dw_align; /* double-word align. */
19501 machine_mode ag_mode = VOIDmode;
19502 int nregs;
19503 machine_mode mode;
19505 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19506 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19507 HOST_WIDE_INT size, rsize, adjust, align;
19508 tree t, u, cond1, cond2;
19510 indirect_p = pass_va_arg_by_reference (type);
19511 if (indirect_p)
19512 type = build_pointer_type (type);
19514 mode = TYPE_MODE (type);
19516 f_stack = TYPE_FIELDS (va_list_type_node);
19517 f_grtop = DECL_CHAIN (f_stack);
19518 f_vrtop = DECL_CHAIN (f_grtop);
19519 f_groff = DECL_CHAIN (f_vrtop);
19520 f_vroff = DECL_CHAIN (f_groff);
19522 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19523 f_stack, NULL_TREE);
19524 size = int_size_in_bytes (type);
19526 unsigned int abi_break;
19527 align
19528 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
19530 dw_align = false;
19531 adjust = 0;
19532 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19533 &is_ha, false))
19535 /* No frontends can create types with variable-sized modes, so we
19536 shouldn't be asked to pass or return them. */
19537 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19539 /* TYPE passed in fp/simd registers. */
19540 if (!TARGET_FLOAT)
19541 aarch64_err_no_fpadvsimd (mode);
19543 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19544 unshare_expr (valist), f_vrtop, NULL_TREE);
19545 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19546 unshare_expr (valist), f_vroff, NULL_TREE);
19548 rsize = nregs * UNITS_PER_VREG;
19550 if (is_ha)
19552 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19553 adjust = UNITS_PER_VREG - ag_size;
19555 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19556 && size < UNITS_PER_VREG)
19558 adjust = UNITS_PER_VREG - size;
19561 else
19563 /* TYPE passed in general registers. */
19564 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19565 unshare_expr (valist), f_grtop, NULL_TREE);
19566 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19567 unshare_expr (valist), f_groff, NULL_TREE);
19568 rsize = ROUND_UP (size, UNITS_PER_WORD);
19569 nregs = rsize / UNITS_PER_WORD;
19571 if (align > 8)
19573 if (abi_break && warn_psabi)
19574 inform (input_location, "parameter passing for argument of type "
19575 "%qT changed in GCC 9.1", type);
19576 dw_align = true;
19579 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19580 && size < UNITS_PER_WORD)
19582 adjust = UNITS_PER_WORD - size;
19586 /* Get a local temporary for the field value. */
19587 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19589 /* Emit code to branch if off >= 0. */
19590 t = build2 (GE_EXPR, boolean_type_node, off,
19591 build_int_cst (TREE_TYPE (off), 0));
19592 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19594 if (dw_align)
19596 /* Emit: offs = (offs + 15) & -16. */
19597 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19598 build_int_cst (TREE_TYPE (off), 15));
19599 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19600 build_int_cst (TREE_TYPE (off), -16));
19601 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19603 else
19604 roundup = NULL;
19606 /* Update ap.__[g|v]r_offs */
19607 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19608 build_int_cst (TREE_TYPE (off), rsize));
19609 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19611 /* String up. */
19612 if (roundup)
19613 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19615 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19616 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19617 build_int_cst (TREE_TYPE (f_off), 0));
19618 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19620 /* String up: make sure the assignment happens before the use. */
19621 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19622 COND_EXPR_ELSE (cond1) = t;
19624 /* Prepare the trees handling the argument that is passed on the stack;
19625 the top level node will store in ON_STACK. */
19626 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19627 if (align > 8)
19629 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19630 t = fold_build_pointer_plus_hwi (arg, 15);
19631 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19632 build_int_cst (TREE_TYPE (t), -16));
19633 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19635 else
19636 roundup = NULL;
19637 /* Advance ap.__stack */
19638 t = fold_build_pointer_plus_hwi (arg, size + 7);
19639 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19640 build_int_cst (TREE_TYPE (t), -8));
19641 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19642 /* String up roundup and advance. */
19643 if (roundup)
19644 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19645 /* String up with arg */
19646 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19647 /* Big-endianness related address adjustment. */
19648 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19649 && size < UNITS_PER_WORD)
19651 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19652 size_int (UNITS_PER_WORD - size));
19653 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19656 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19657 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19659 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19660 t = off;
19661 if (adjust)
19662 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19663 build_int_cst (TREE_TYPE (off), adjust));
19665 t = fold_convert (sizetype, t);
19666 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19668 if (is_ha)
19670 /* type ha; // treat as "struct {ftype field[n];}"
19671 ... [computing offs]
19672 for (i = 0; i <nregs; ++i, offs += 16)
19673 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19674 return ha; */
19675 int i;
19676 tree tmp_ha, field_t, field_ptr_t;
19678 /* Declare a local variable. */
19679 tmp_ha = create_tmp_var_raw (type, "ha");
19680 gimple_add_tmp_var (tmp_ha);
19682 /* Establish the base type. */
19683 switch (ag_mode)
19685 case E_SFmode:
19686 field_t = float_type_node;
19687 field_ptr_t = float_ptr_type_node;
19688 break;
19689 case E_DFmode:
19690 field_t = double_type_node;
19691 field_ptr_t = double_ptr_type_node;
19692 break;
19693 case E_TFmode:
19694 field_t = long_double_type_node;
19695 field_ptr_t = long_double_ptr_type_node;
19696 break;
19697 case E_SDmode:
19698 field_t = dfloat32_type_node;
19699 field_ptr_t = build_pointer_type (dfloat32_type_node);
19700 break;
19701 case E_DDmode:
19702 field_t = dfloat64_type_node;
19703 field_ptr_t = build_pointer_type (dfloat64_type_node);
19704 break;
19705 case E_TDmode:
19706 field_t = dfloat128_type_node;
19707 field_ptr_t = build_pointer_type (dfloat128_type_node);
19708 break;
19709 case E_HFmode:
19710 field_t = aarch64_fp16_type_node;
19711 field_ptr_t = aarch64_fp16_ptr_type_node;
19712 break;
19713 case E_BFmode:
19714 field_t = aarch64_bf16_type_node;
19715 field_ptr_t = aarch64_bf16_ptr_type_node;
19716 break;
19717 case E_V2SImode:
19718 case E_V4SImode:
19720 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
19721 field_t = build_vector_type_for_mode (innertype, ag_mode);
19722 field_ptr_t = build_pointer_type (field_t);
19724 break;
19725 default:
19726 gcc_assert (0);
19729 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
19730 TREE_ADDRESSABLE (tmp_ha) = 1;
19731 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
19732 addr = t;
19733 t = fold_convert (field_ptr_t, addr);
19734 t = build2 (MODIFY_EXPR, field_t,
19735 build1 (INDIRECT_REF, field_t, tmp_ha),
19736 build1 (INDIRECT_REF, field_t, t));
19738 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
19739 for (i = 1; i < nregs; ++i)
19741 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
19742 u = fold_convert (field_ptr_t, addr);
19743 u = build2 (MODIFY_EXPR, field_t,
19744 build2 (MEM_REF, field_t, tmp_ha,
19745 build_int_cst (field_ptr_t,
19746 (i *
19747 int_size_in_bytes (field_t)))),
19748 build1 (INDIRECT_REF, field_t, u));
19749 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
19752 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
19753 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
19756 COND_EXPR_ELSE (cond2) = t;
19757 addr = fold_convert (build_pointer_type (type), cond1);
19758 addr = build_va_arg_indirect_ref (addr);
19760 if (indirect_p)
19761 addr = build_va_arg_indirect_ref (addr);
19763 return addr;
19766 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
19768 static void
19769 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
19770 const function_arg_info &arg,
19771 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
19773 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
19774 CUMULATIVE_ARGS local_cum;
19775 int gr_saved = cfun->va_list_gpr_size;
19776 int vr_saved = cfun->va_list_fpr_size;
19778 /* The caller has advanced CUM up to, but not beyond, the last named
19779 argument. Advance a local copy of CUM past the last "real" named
19780 argument, to find out how many registers are left over. */
19781 local_cum = *cum;
19782 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
19784 /* Found out how many registers we need to save.
19785 Honor tree-stdvar analysis results. */
19786 if (cfun->va_list_gpr_size)
19787 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
19788 cfun->va_list_gpr_size / UNITS_PER_WORD);
19789 if (cfun->va_list_fpr_size)
19790 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
19791 cfun->va_list_fpr_size / UNITS_PER_VREG);
19793 if (!TARGET_FLOAT)
19795 gcc_assert (local_cum.aapcs_nvrn == 0);
19796 vr_saved = 0;
19799 if (!no_rtl)
19801 if (gr_saved > 0)
19803 rtx ptr, mem;
19805 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
19806 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
19807 - gr_saved * UNITS_PER_WORD);
19808 mem = gen_frame_mem (BLKmode, ptr);
19809 set_mem_alias_set (mem, get_varargs_alias_set ());
19811 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
19812 mem, gr_saved);
19814 if (vr_saved > 0)
19816 /* We can't use move_block_from_reg, because it will use
19817 the wrong mode, storing D regs only. */
19818 machine_mode mode = TImode;
19819 int off, i, vr_start;
19821 /* Set OFF to the offset from virtual_incoming_args_rtx of
19822 the first vector register. The VR save area lies below
19823 the GR one, and is aligned to 16 bytes. */
19824 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
19825 STACK_BOUNDARY / BITS_PER_UNIT);
19826 off -= vr_saved * UNITS_PER_VREG;
19828 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
19829 for (i = 0; i < vr_saved; ++i)
19831 rtx ptr, mem;
19833 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
19834 mem = gen_frame_mem (mode, ptr);
19835 set_mem_alias_set (mem, get_varargs_alias_set ());
19836 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
19837 off += UNITS_PER_VREG;
19842 /* We don't save the size into *PRETEND_SIZE because we want to avoid
19843 any complication of having crtl->args.pretend_args_size changed. */
19844 cfun->machine->frame.saved_varargs_size
19845 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
19846 STACK_BOUNDARY / BITS_PER_UNIT)
19847 + vr_saved * UNITS_PER_VREG);
19850 static void
19851 aarch64_conditional_register_usage (void)
19853 int i;
19854 if (!TARGET_FLOAT)
19856 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
19858 fixed_regs[i] = 1;
19859 call_used_regs[i] = 1;
19862 if (!TARGET_SVE)
19863 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
19865 fixed_regs[i] = 1;
19866 call_used_regs[i] = 1;
19869 /* Only allow the FFR and FFRT to be accessed via special patterns. */
19870 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
19871 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
19873 /* When tracking speculation, we need a couple of call-clobbered registers
19874 to track the speculation state. It would be nice to just use
19875 IP0 and IP1, but currently there are numerous places that just
19876 assume these registers are free for other uses (eg pointer
19877 authentication). */
19878 if (aarch64_track_speculation)
19880 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
19881 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
19882 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
19883 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
19887 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
19889 bool
19890 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
19892 /* For records we're passed a FIELD_DECL, for arrays we're passed
19893 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
19894 const_tree type = TREE_TYPE (field_or_array);
19896 /* Assign BLKmode to anything that contains multiple SVE predicates.
19897 For structures, the "multiple" case is indicated by MODE being
19898 VOIDmode. */
19899 unsigned int num_zr, num_pr;
19900 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
19902 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
19903 return !simple_cst_equal (TYPE_SIZE (field_or_array),
19904 TYPE_SIZE (type));
19905 return mode == VOIDmode;
19908 return default_member_type_forces_blk (field_or_array, mode);
19911 /* Bitmasks that indicate whether earlier versions of GCC would have
19912 taken a different path through the ABI logic. This should result in
19913 a -Wpsabi warning if the earlier path led to a different ABI decision.
19915 WARN_PSABI_EMPTY_CXX17_BASE
19916 Indicates that the type includes an artificial empty C++17 base field
19917 that, prior to GCC 10.1, would prevent the type from being treated as
19918 a HFA or HVA. See PR94383 for details.
19920 WARN_PSABI_NO_UNIQUE_ADDRESS
19921 Indicates that the type includes an empty [[no_unique_address]] field
19922 that, prior to GCC 10.1, would prevent the type from being treated as
19923 a HFA or HVA. */
19924 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
19925 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
19926 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
19928 /* Walk down the type tree of TYPE counting consecutive base elements.
19929 If *MODEP is VOIDmode, then set it to the first valid floating point
19930 type. If a non-floating point type is found, or if a floating point
19931 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
19932 otherwise return the count in the sub-tree.
19934 The WARN_PSABI_FLAGS argument allows the caller to check whether this
19935 function has changed its behavior relative to earlier versions of GCC.
19936 Normally the argument should be nonnull and point to a zero-initialized
19937 variable. The function then records whether the ABI decision might
19938 be affected by a known fix to the ABI logic, setting the associated
19939 WARN_PSABI_* bits if so.
19941 When the argument is instead a null pointer, the function tries to
19942 simulate the behavior of GCC before all such ABI fixes were made.
19943 This is useful to check whether the function returns something
19944 different after the ABI fixes. */
19945 static int
19946 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
19947 unsigned int *warn_psabi_flags)
19949 machine_mode mode;
19950 HOST_WIDE_INT size;
19952 if (aarch64_sve::builtin_type_p (type))
19953 return -1;
19955 switch (TREE_CODE (type))
19957 case REAL_TYPE:
19958 mode = TYPE_MODE (type);
19959 if (mode != DFmode && mode != SFmode
19960 && mode != TFmode && mode != HFmode
19961 && mode != SDmode && mode != DDmode && mode != TDmode)
19962 return -1;
19964 if (*modep == VOIDmode)
19965 *modep = mode;
19967 if (*modep == mode)
19968 return 1;
19970 break;
19972 case COMPLEX_TYPE:
19973 mode = TYPE_MODE (TREE_TYPE (type));
19974 if (mode != DFmode && mode != SFmode
19975 && mode != TFmode && mode != HFmode)
19976 return -1;
19978 if (*modep == VOIDmode)
19979 *modep = mode;
19981 if (*modep == mode)
19982 return 2;
19984 break;
19986 case VECTOR_TYPE:
19987 /* Use V2SImode and V4SImode as representatives of all 64-bit
19988 and 128-bit vector types. */
19989 size = int_size_in_bytes (type);
19990 switch (size)
19992 case 8:
19993 mode = V2SImode;
19994 break;
19995 case 16:
19996 mode = V4SImode;
19997 break;
19998 default:
19999 return -1;
20002 if (*modep == VOIDmode)
20003 *modep = mode;
20005 /* Vector modes are considered to be opaque: two vectors are
20006 equivalent for the purposes of being homogeneous aggregates
20007 if they are the same size. */
20008 if (*modep == mode)
20009 return 1;
20011 break;
20013 case ARRAY_TYPE:
20015 int count;
20016 tree index = TYPE_DOMAIN (type);
20018 /* Can't handle incomplete types nor sizes that are not
20019 fixed. */
20020 if (!COMPLETE_TYPE_P (type)
20021 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20022 return -1;
20024 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20025 warn_psabi_flags);
20026 if (count == -1
20027 || !index
20028 || !TYPE_MAX_VALUE (index)
20029 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20030 || !TYPE_MIN_VALUE (index)
20031 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20032 || count < 0)
20033 return -1;
20035 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20036 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20038 /* There must be no padding. */
20039 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20040 count * GET_MODE_BITSIZE (*modep)))
20041 return -1;
20043 return count;
20046 case RECORD_TYPE:
20048 int count = 0;
20049 int sub_count;
20050 tree field;
20052 /* Can't handle incomplete types nor sizes that are not
20053 fixed. */
20054 if (!COMPLETE_TYPE_P (type)
20055 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20056 return -1;
20058 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20060 if (TREE_CODE (field) != FIELD_DECL)
20061 continue;
20063 if (DECL_FIELD_ABI_IGNORED (field))
20065 /* See whether this is something that earlier versions of
20066 GCC failed to ignore. */
20067 unsigned int flag;
20068 if (lookup_attribute ("no_unique_address",
20069 DECL_ATTRIBUTES (field)))
20070 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20071 else if (cxx17_empty_base_field_p (field))
20072 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20073 else
20074 /* No compatibility problem. */
20075 continue;
20077 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20078 if (warn_psabi_flags)
20080 *warn_psabi_flags |= flag;
20081 continue;
20084 /* A zero-width bitfield may affect layout in some
20085 circumstances, but adds no members. The determination
20086 of whether or not a type is an HFA is performed after
20087 layout is complete, so if the type still looks like an
20088 HFA afterwards, it is still classed as one. This is
20089 potentially an ABI break for the hard-float ABI. */
20090 else if (DECL_BIT_FIELD (field)
20091 && integer_zerop (DECL_SIZE (field)))
20093 /* Prior to GCC-12 these fields were striped early,
20094 hiding them from the back-end entirely and
20095 resulting in the correct behaviour for argument
20096 passing. Simulate that old behaviour without
20097 generating a warning. */
20098 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20099 continue;
20100 if (warn_psabi_flags)
20102 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20103 continue;
20107 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20108 warn_psabi_flags);
20109 if (sub_count < 0)
20110 return -1;
20111 count += sub_count;
20114 /* There must be no padding. */
20115 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20116 count * GET_MODE_BITSIZE (*modep)))
20117 return -1;
20119 return count;
20122 case UNION_TYPE:
20123 case QUAL_UNION_TYPE:
20125 /* These aren't very interesting except in a degenerate case. */
20126 int count = 0;
20127 int sub_count;
20128 tree field;
20130 /* Can't handle incomplete types nor sizes that are not
20131 fixed. */
20132 if (!COMPLETE_TYPE_P (type)
20133 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20134 return -1;
20136 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20138 if (TREE_CODE (field) != FIELD_DECL)
20139 continue;
20141 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20142 warn_psabi_flags);
20143 if (sub_count < 0)
20144 return -1;
20145 count = count > sub_count ? count : sub_count;
20148 /* There must be no padding. */
20149 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20150 count * GET_MODE_BITSIZE (*modep)))
20151 return -1;
20153 return count;
20156 default:
20157 break;
20160 return -1;
20163 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20164 type as described in AAPCS64 \S 4.1.2.
20166 See the comment above aarch64_composite_type_p for the notes on MODE. */
20168 static bool
20169 aarch64_short_vector_p (const_tree type,
20170 machine_mode mode)
20172 poly_int64 size = -1;
20174 if (type && TREE_CODE (type) == VECTOR_TYPE)
20176 if (aarch64_sve::builtin_type_p (type))
20177 return false;
20178 size = int_size_in_bytes (type);
20180 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20181 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20183 /* The containing "else if" is too loose: it means that we look at TYPE
20184 if the type is a vector type (good), but that we otherwise ignore TYPE
20185 and look only at the mode. This is wrong because the type describes
20186 the language-level information whereas the mode is purely an internal
20187 GCC concept. We can therefore reach here for types that are not
20188 vectors in the AAPCS64 sense.
20190 We can't "fix" that for the traditional Advanced SIMD vector modes
20191 without breaking backwards compatibility. However, there's no such
20192 baggage for the structure modes, which were introduced in GCC 12. */
20193 if (aarch64_advsimd_struct_mode_p (mode))
20194 return false;
20196 /* For similar reasons, rely only on the type, not the mode, when
20197 processing SVE types. */
20198 if (type && aarch64_some_values_include_pst_objects_p (type))
20199 /* Leave later code to report an error if SVE is disabled. */
20200 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20201 else
20202 size = GET_MODE_SIZE (mode);
20204 if (known_eq (size, 8) || known_eq (size, 16))
20206 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20207 they are being treated as scalable AAPCS64 types. */
20208 gcc_assert (!aarch64_sve_mode_p (mode)
20209 && !aarch64_advsimd_struct_mode_p (mode));
20210 return true;
20212 return false;
20215 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20216 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20217 array types. The C99 floating-point complex types are also considered
20218 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20219 types, which are GCC extensions and out of the scope of AAPCS64, are
20220 treated as composite types here as well.
20222 Note that MODE itself is not sufficient in determining whether a type
20223 is such a composite type or not. This is because
20224 stor-layout.cc:compute_record_mode may have already changed the MODE
20225 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20226 structure with only one field may have its MODE set to the mode of the
20227 field. Also an integer mode whose size matches the size of the
20228 RECORD_TYPE type may be used to substitute the original mode
20229 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20230 solely relied on. */
20232 static bool
20233 aarch64_composite_type_p (const_tree type,
20234 machine_mode mode)
20236 if (aarch64_short_vector_p (type, mode))
20237 return false;
20239 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20240 return true;
20242 if (mode == BLKmode
20243 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20244 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20245 return true;
20247 return false;
20250 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20251 shall be passed or returned in simd/fp register(s) (providing these
20252 parameter passing registers are available).
20254 Upon successful return, *COUNT returns the number of needed registers,
20255 *BASE_MODE returns the mode of the individual register and when IS_HA
20256 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20257 floating-point aggregate or a homogeneous short-vector aggregate.
20259 SILENT_P is true if the function should refrain from reporting any
20260 diagnostics. This should only be used if the caller is certain that
20261 any ABI decisions would eventually come through this function with
20262 SILENT_P set to false. */
20264 static bool
20265 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20266 const_tree type,
20267 machine_mode *base_mode,
20268 int *count,
20269 bool *is_ha,
20270 bool silent_p)
20272 if (is_ha != NULL) *is_ha = false;
20274 machine_mode new_mode = VOIDmode;
20275 bool composite_p = aarch64_composite_type_p (type, mode);
20277 if ((!composite_p
20278 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20279 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
20280 || aarch64_short_vector_p (type, mode))
20282 *count = 1;
20283 new_mode = mode;
20285 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20287 if (is_ha != NULL) *is_ha = true;
20288 *count = 2;
20289 new_mode = GET_MODE_INNER (mode);
20291 else if (type && composite_p)
20293 unsigned int warn_psabi_flags = 0;
20294 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20295 &warn_psabi_flags);
20296 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20298 static unsigned last_reported_type_uid;
20299 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20300 int alt;
20301 if (!silent_p
20302 && warn_psabi
20303 && warn_psabi_flags
20304 && uid != last_reported_type_uid
20305 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20306 != ag_count))
20308 const char *url10
20309 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20310 const char *url12
20311 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20312 gcc_assert (alt == -1);
20313 last_reported_type_uid = uid;
20314 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20315 qualification. */
20316 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20317 inform (input_location, "parameter passing for argument of "
20318 "type %qT with %<[[no_unique_address]]%> members "
20319 "changed %{in GCC 10.1%}",
20320 TYPE_MAIN_VARIANT (type), url10);
20321 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20322 inform (input_location, "parameter passing for argument of "
20323 "type %qT when C++17 is enabled changed to match "
20324 "C++14 %{in GCC 10.1%}",
20325 TYPE_MAIN_VARIANT (type), url10);
20326 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20327 inform (input_location, "parameter passing for argument of "
20328 "type %qT changed %{in GCC 12.1%}",
20329 TYPE_MAIN_VARIANT (type), url12);
20332 if (is_ha != NULL) *is_ha = true;
20333 *count = ag_count;
20335 else
20336 return false;
20338 else
20339 return false;
20341 gcc_assert (!aarch64_sve_mode_p (new_mode));
20342 *base_mode = new_mode;
20343 return true;
20346 /* Implement TARGET_STRUCT_VALUE_RTX. */
20348 static rtx
20349 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20350 int incoming ATTRIBUTE_UNUSED)
20352 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20355 /* Implements target hook vector_mode_supported_p. */
20356 static bool
20357 aarch64_vector_mode_supported_p (machine_mode mode)
20359 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20360 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20363 /* Return the full-width SVE vector mode for element mode MODE, if one
20364 exists. */
20365 opt_machine_mode
20366 aarch64_full_sve_mode (scalar_mode mode)
20368 switch (mode)
20370 case E_DFmode:
20371 return VNx2DFmode;
20372 case E_SFmode:
20373 return VNx4SFmode;
20374 case E_HFmode:
20375 return VNx8HFmode;
20376 case E_BFmode:
20377 return VNx8BFmode;
20378 case E_DImode:
20379 return VNx2DImode;
20380 case E_SImode:
20381 return VNx4SImode;
20382 case E_HImode:
20383 return VNx8HImode;
20384 case E_QImode:
20385 return VNx16QImode;
20386 default:
20387 return opt_machine_mode ();
20391 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20392 if it exists. */
20393 opt_machine_mode
20394 aarch64_vq_mode (scalar_mode mode)
20396 switch (mode)
20398 case E_DFmode:
20399 return V2DFmode;
20400 case E_SFmode:
20401 return V4SFmode;
20402 case E_HFmode:
20403 return V8HFmode;
20404 case E_BFmode:
20405 return V8BFmode;
20406 case E_SImode:
20407 return V4SImode;
20408 case E_HImode:
20409 return V8HImode;
20410 case E_QImode:
20411 return V16QImode;
20412 case E_DImode:
20413 return V2DImode;
20414 default:
20415 return opt_machine_mode ();
20419 /* Return appropriate SIMD container
20420 for MODE within a vector of WIDTH bits. */
20421 static machine_mode
20422 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20424 if (TARGET_SVE
20425 && maybe_ne (width, 128)
20426 && known_eq (width, BITS_PER_SVE_VECTOR))
20427 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20429 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20430 if (TARGET_SIMD)
20432 if (known_eq (width, 128))
20433 return aarch64_vq_mode (mode).else_mode (word_mode);
20434 else
20435 switch (mode)
20437 case E_SFmode:
20438 return V2SFmode;
20439 case E_HFmode:
20440 return V4HFmode;
20441 case E_BFmode:
20442 return V4BFmode;
20443 case E_SImode:
20444 return V2SImode;
20445 case E_HImode:
20446 return V4HImode;
20447 case E_QImode:
20448 return V8QImode;
20449 default:
20450 break;
20453 return word_mode;
20456 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20457 and return whether the SVE mode should be preferred over the
20458 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20459 static bool
20460 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20462 /* Take into account the aarch64-autovec-preference param if non-zero. */
20463 bool only_asimd_p = aarch64_autovec_preference == 1;
20464 bool only_sve_p = aarch64_autovec_preference == 2;
20466 if (only_asimd_p)
20467 return false;
20468 if (only_sve_p)
20469 return true;
20471 /* The preference in case of a tie in costs. */
20472 bool prefer_asimd = aarch64_autovec_preference == 3;
20473 bool prefer_sve = aarch64_autovec_preference == 4;
20475 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20476 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20477 /* If the CPU information does not have an SVE width registered use the
20478 generic poly_int comparison that prefers SVE. If a preference is
20479 explicitly requested avoid this path. */
20480 if (aarch64_tune_params.sve_width == SVE_SCALABLE
20481 && !prefer_asimd
20482 && !prefer_sve)
20483 return maybe_gt (nunits_sve, nunits_asimd);
20485 /* Otherwise estimate the runtime width of the modes involved. */
20486 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20487 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20489 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20490 is clearly wider. */
20491 if (prefer_sve)
20492 return est_sve >= est_asimd;
20493 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20494 is clearly wider. */
20495 if (prefer_asimd)
20496 return est_sve > est_asimd;
20498 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20499 return est_sve > est_asimd;
20502 /* Return 128-bit container as the preferred SIMD mode for MODE. */
20503 static machine_mode
20504 aarch64_preferred_simd_mode (scalar_mode mode)
20506 /* Take into account explicit auto-vectorization ISA preferences through
20507 aarch64_cmp_autovec_modes. */
20508 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20509 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20510 if (TARGET_SIMD)
20511 return aarch64_vq_mode (mode).else_mode (word_mode);
20512 return word_mode;
20515 /* Return a list of possible vector sizes for the vectorizer
20516 to iterate over. */
20517 static unsigned int
20518 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20520 static const machine_mode sve_modes[] = {
20521 /* Try using full vectors for all element types. */
20522 VNx16QImode,
20524 /* Try using 16-bit containers for 8-bit elements and full vectors
20525 for wider elements. */
20526 VNx8QImode,
20528 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20529 full vectors for wider elements. */
20530 VNx4QImode,
20532 /* Try using 64-bit containers for all element types. */
20533 VNx2QImode
20536 static const machine_mode advsimd_modes[] = {
20537 /* Try using 128-bit vectors for all element types. */
20538 V16QImode,
20540 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20541 for wider elements. */
20542 V8QImode,
20544 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20545 for wider elements.
20547 TODO: We could support a limited form of V4QImode too, so that
20548 we use 32-bit vectors for 8-bit elements. */
20549 V4HImode,
20551 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20552 for 64-bit elements.
20554 TODO: We could similarly support limited forms of V2QImode and V2HImode
20555 for this case. */
20556 V2SImode
20559 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20560 This is because:
20562 - If we can't use N-byte Advanced SIMD vectors then the placement
20563 doesn't matter; we'll just continue as though the Advanced SIMD
20564 entry didn't exist.
20566 - If an SVE main loop with N bytes ends up being cheaper than an
20567 Advanced SIMD main loop with N bytes then by default we'll replace
20568 the Advanced SIMD version with the SVE one.
20570 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20571 than an SVE main loop with N bytes then by default we'll try to
20572 use the SVE loop to vectorize the epilogue instead. */
20574 bool only_asimd_p = aarch64_autovec_preference == 1;
20575 bool only_sve_p = aarch64_autovec_preference == 2;
20577 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20578 unsigned int advsimd_i = 0;
20580 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20582 if (sve_i < ARRAY_SIZE (sve_modes)
20583 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20584 advsimd_modes[advsimd_i]))
20585 modes->safe_push (sve_modes[sve_i++]);
20586 else
20587 modes->safe_push (advsimd_modes[advsimd_i++]);
20589 while (sve_i < ARRAY_SIZE (sve_modes))
20590 modes->safe_push (sve_modes[sve_i++]);
20592 unsigned int flags = 0;
20593 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20594 can compare SVE against Advanced SIMD and so that we can compare
20595 multiple SVE vectorization approaches against each other. There's
20596 not really any point doing this for Advanced SIMD only, since the
20597 first mode that works should always be the best. */
20598 if (TARGET_SVE && aarch64_sve_compare_costs)
20599 flags |= VECT_COMPARE_COSTS;
20600 return flags;
20603 /* Implement TARGET_MANGLE_TYPE. */
20605 static const char *
20606 aarch64_mangle_type (const_tree type)
20608 /* The AArch64 ABI documents say that "__va_list" has to be
20609 mangled as if it is in the "std" namespace. */
20610 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20611 return "St9__va_list";
20613 /* Half-precision floating point types. */
20614 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20616 if (TYPE_MODE (type) == BFmode)
20617 return "u6__bf16";
20618 else
20619 return "Dh";
20622 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20623 builtin types. */
20624 if (TYPE_NAME (type) != NULL)
20626 const char *res;
20627 if ((res = aarch64_general_mangle_builtin_type (type))
20628 || (res = aarch64_sve::mangle_builtin_type (type)))
20629 return res;
20632 /* Use the default mangling. */
20633 return NULL;
20636 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20638 static bool
20639 aarch64_verify_type_context (location_t loc, type_context_kind context,
20640 const_tree type, bool silent_p)
20642 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20645 /* Find the first rtx_insn before insn that will generate an assembly
20646 instruction. */
20648 static rtx_insn *
20649 aarch64_prev_real_insn (rtx_insn *insn)
20651 if (!insn)
20652 return NULL;
20656 insn = prev_real_insn (insn);
20658 while (insn && recog_memoized (insn) < 0);
20660 return insn;
20663 static bool
20664 is_madd_op (enum attr_type t1)
20666 unsigned int i;
20667 /* A number of these may be AArch32 only. */
20668 enum attr_type mlatypes[] = {
20669 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20670 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20671 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20674 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
20676 if (t1 == mlatypes[i])
20677 return true;
20680 return false;
20683 /* Check if there is a register dependency between a load and the insn
20684 for which we hold recog_data. */
20686 static bool
20687 dep_between_memop_and_curr (rtx memop)
20689 rtx load_reg;
20690 int opno;
20692 gcc_assert (GET_CODE (memop) == SET);
20694 if (!REG_P (SET_DEST (memop)))
20695 return false;
20697 load_reg = SET_DEST (memop);
20698 for (opno = 1; opno < recog_data.n_operands; opno++)
20700 rtx operand = recog_data.operand[opno];
20701 if (REG_P (operand)
20702 && reg_overlap_mentioned_p (load_reg, operand))
20703 return true;
20706 return false;
20710 /* When working around the Cortex-A53 erratum 835769,
20711 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
20712 instruction and has a preceding memory instruction such that a NOP
20713 should be inserted between them. */
20715 bool
20716 aarch64_madd_needs_nop (rtx_insn* insn)
20718 enum attr_type attr_type;
20719 rtx_insn *prev;
20720 rtx body;
20722 if (!TARGET_FIX_ERR_A53_835769)
20723 return false;
20725 if (!INSN_P (insn) || recog_memoized (insn) < 0)
20726 return false;
20728 attr_type = get_attr_type (insn);
20729 if (!is_madd_op (attr_type))
20730 return false;
20732 prev = aarch64_prev_real_insn (insn);
20733 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
20734 Restore recog state to INSN to avoid state corruption. */
20735 extract_constrain_insn_cached (insn);
20737 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
20738 return false;
20740 body = single_set (prev);
20742 /* If the previous insn is a memory op and there is no dependency between
20743 it and the DImode madd, emit a NOP between them. If body is NULL then we
20744 have a complex memory operation, probably a load/store pair.
20745 Be conservative for now and emit a NOP. */
20746 if (GET_MODE (recog_data.operand[0]) == DImode
20747 && (!body || !dep_between_memop_and_curr (body)))
20748 return true;
20750 return false;
20755 /* Implement FINAL_PRESCAN_INSN. */
20757 void
20758 aarch64_final_prescan_insn (rtx_insn *insn)
20760 if (aarch64_madd_needs_nop (insn))
20761 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
20765 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
20766 instruction. */
20768 bool
20769 aarch64_sve_index_immediate_p (rtx base_or_step)
20771 return (CONST_INT_P (base_or_step)
20772 && IN_RANGE (INTVAL (base_or_step), -16, 15));
20775 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
20776 when applied to mode MODE. Negate X first if NEGATE_P is true. */
20778 bool
20779 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
20781 rtx elt = unwrap_const_vec_duplicate (x);
20782 if (!CONST_INT_P (elt))
20783 return false;
20785 HOST_WIDE_INT val = INTVAL (elt);
20786 if (negate_p)
20787 val = -val;
20788 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
20790 if (val & 0xff)
20791 return IN_RANGE (val, 0, 0xff);
20792 return IN_RANGE (val, 0, 0xff00);
20795 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
20796 instructions when applied to mode MODE. Negate X first if NEGATE_P
20797 is true. */
20799 bool
20800 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
20802 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
20803 return false;
20805 /* After the optional negation, the immediate must be nonnegative.
20806 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
20807 instead of SQADD Zn.B, Zn.B, #129. */
20808 rtx elt = unwrap_const_vec_duplicate (x);
20809 return negate_p == (INTVAL (elt) < 0);
20812 /* Return true if X is a valid immediate operand for an SVE logical
20813 instruction such as AND. */
20815 bool
20816 aarch64_sve_bitmask_immediate_p (rtx x)
20818 rtx elt;
20820 return (const_vec_duplicate_p (x, &elt)
20821 && CONST_INT_P (elt)
20822 && aarch64_bitmask_imm (INTVAL (elt),
20823 GET_MODE_INNER (GET_MODE (x))));
20826 /* Return true if X is a valid immediate for the SVE DUP and CPY
20827 instructions. */
20829 bool
20830 aarch64_sve_dup_immediate_p (rtx x)
20832 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
20833 if (!CONST_INT_P (x))
20834 return false;
20836 HOST_WIDE_INT val = INTVAL (x);
20837 if (val & 0xff)
20838 return IN_RANGE (val, -0x80, 0x7f);
20839 return IN_RANGE (val, -0x8000, 0x7f00);
20842 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
20843 SIGNED_P says whether the operand is signed rather than unsigned. */
20845 bool
20846 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
20848 x = unwrap_const_vec_duplicate (x);
20849 return (CONST_INT_P (x)
20850 && (signed_p
20851 ? IN_RANGE (INTVAL (x), -16, 15)
20852 : IN_RANGE (INTVAL (x), 0, 127)));
20855 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
20856 instruction. Negate X first if NEGATE_P is true. */
20858 bool
20859 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
20861 rtx elt;
20862 REAL_VALUE_TYPE r;
20864 if (!const_vec_duplicate_p (x, &elt)
20865 || !CONST_DOUBLE_P (elt))
20866 return false;
20868 r = *CONST_DOUBLE_REAL_VALUE (elt);
20870 if (negate_p)
20871 r = real_value_negate (&r);
20873 if (real_equal (&r, &dconst1))
20874 return true;
20875 if (real_equal (&r, &dconsthalf))
20876 return true;
20877 return false;
20880 /* Return true if X is a valid immediate operand for an SVE FMUL
20881 instruction. */
20883 bool
20884 aarch64_sve_float_mul_immediate_p (rtx x)
20886 rtx elt;
20888 return (const_vec_duplicate_p (x, &elt)
20889 && CONST_DOUBLE_P (elt)
20890 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
20891 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
20894 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
20895 for the Advanced SIMD operation described by WHICH and INSN. If INFO
20896 is nonnull, use it to describe valid immediates. */
20897 static bool
20898 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
20899 simd_immediate_info *info,
20900 enum simd_immediate_check which,
20901 simd_immediate_info::insn_type insn)
20903 /* Try a 4-byte immediate with LSL. */
20904 for (unsigned int shift = 0; shift < 32; shift += 8)
20905 if ((val32 & (0xff << shift)) == val32)
20907 if (info)
20908 *info = simd_immediate_info (SImode, val32 >> shift, insn,
20909 simd_immediate_info::LSL, shift);
20910 return true;
20913 /* Try a 2-byte immediate with LSL. */
20914 unsigned int imm16 = val32 & 0xffff;
20915 if (imm16 == (val32 >> 16))
20916 for (unsigned int shift = 0; shift < 16; shift += 8)
20917 if ((imm16 & (0xff << shift)) == imm16)
20919 if (info)
20920 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
20921 simd_immediate_info::LSL, shift);
20922 return true;
20925 /* Try a 4-byte immediate with MSL, except for cases that MVN
20926 can handle. */
20927 if (which == AARCH64_CHECK_MOV)
20928 for (unsigned int shift = 8; shift < 24; shift += 8)
20930 unsigned int low = (1 << shift) - 1;
20931 if (((val32 & (0xff << shift)) | low) == val32)
20933 if (info)
20934 *info = simd_immediate_info (SImode, val32 >> shift, insn,
20935 simd_immediate_info::MSL, shift);
20936 return true;
20940 return false;
20943 /* Return true if replicating VAL64 is a valid immediate for the
20944 Advanced SIMD operation described by WHICH. If INFO is nonnull,
20945 use it to describe valid immediates. */
20946 static bool
20947 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
20948 simd_immediate_info *info,
20949 enum simd_immediate_check which)
20951 unsigned int val32 = val64 & 0xffffffff;
20952 unsigned int val16 = val64 & 0xffff;
20953 unsigned int val8 = val64 & 0xff;
20955 if (val32 == (val64 >> 32))
20957 if ((which & AARCH64_CHECK_ORR) != 0
20958 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
20959 simd_immediate_info::MOV))
20960 return true;
20962 if ((which & AARCH64_CHECK_BIC) != 0
20963 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
20964 simd_immediate_info::MVN))
20965 return true;
20967 /* Try using a replicated byte. */
20968 if (which == AARCH64_CHECK_MOV
20969 && val16 == (val32 >> 16)
20970 && val8 == (val16 >> 8))
20972 if (info)
20973 *info = simd_immediate_info (QImode, val8);
20974 return true;
20978 /* Try using a bit-to-bytemask. */
20979 if (which == AARCH64_CHECK_MOV)
20981 unsigned int i;
20982 for (i = 0; i < 64; i += 8)
20984 unsigned char byte = (val64 >> i) & 0xff;
20985 if (byte != 0 && byte != 0xff)
20986 break;
20988 if (i == 64)
20990 if (info)
20991 *info = simd_immediate_info (DImode, val64);
20992 return true;
20995 return false;
20998 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
20999 instruction. If INFO is nonnull, use it to describe valid immediates. */
21001 static bool
21002 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21003 simd_immediate_info *info)
21005 scalar_int_mode mode = DImode;
21006 unsigned int val32 = val64 & 0xffffffff;
21007 if (val32 == (val64 >> 32))
21009 mode = SImode;
21010 unsigned int val16 = val32 & 0xffff;
21011 if (val16 == (val32 >> 16))
21013 mode = HImode;
21014 unsigned int val8 = val16 & 0xff;
21015 if (val8 == (val16 >> 8))
21016 mode = QImode;
21019 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21020 if (IN_RANGE (val, -0x80, 0x7f))
21022 /* DUP with no shift. */
21023 if (info)
21024 *info = simd_immediate_info (mode, val);
21025 return true;
21027 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21029 /* DUP with LSL #8. */
21030 if (info)
21031 *info = simd_immediate_info (mode, val);
21032 return true;
21034 if (aarch64_bitmask_imm (val64, mode))
21036 /* DUPM. */
21037 if (info)
21038 *info = simd_immediate_info (mode, val);
21039 return true;
21041 return false;
21044 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21046 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21048 where PATTERN is the svpattern as a CONST_INT and where ZERO
21049 is a zero constant of the required PTRUE mode (which can have
21050 fewer elements than X's mode, if zero bits are significant).
21052 If so, and if INFO is nonnull, describe the immediate in INFO. */
21053 bool
21054 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21056 if (GET_CODE (x) != CONST)
21057 return false;
21059 x = XEXP (x, 0);
21060 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21061 return false;
21063 if (info)
21065 aarch64_svpattern pattern
21066 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21067 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21068 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21069 *info = simd_immediate_info (int_mode, pattern);
21071 return true;
21074 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21075 it to describe valid immediates. */
21077 static bool
21078 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21080 if (aarch64_sve_ptrue_svpattern_p (x, info))
21081 return true;
21083 if (x == CONST0_RTX (GET_MODE (x)))
21085 if (info)
21086 *info = simd_immediate_info (DImode, 0);
21087 return true;
21090 /* Analyze the value as a VNx16BImode. This should be relatively
21091 efficient, since rtx_vector_builder has enough built-in capacity
21092 to store all VLA predicate constants without needing the heap. */
21093 rtx_vector_builder builder;
21094 if (!aarch64_get_sve_pred_bits (builder, x))
21095 return false;
21097 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21098 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21100 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21101 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21102 if (pattern != AARCH64_NUM_SVPATTERNS)
21104 if (info)
21106 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21107 *info = simd_immediate_info (int_mode, pattern);
21109 return true;
21112 return false;
21115 /* Return true if OP is a valid SIMD immediate for the operation
21116 described by WHICH. If INFO is nonnull, use it to describe valid
21117 immediates. */
21118 bool
21119 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21120 enum simd_immediate_check which)
21122 machine_mode mode = GET_MODE (op);
21123 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21124 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21125 return false;
21127 if (vec_flags & VEC_SVE_PRED)
21128 return aarch64_sve_pred_valid_immediate (op, info);
21130 scalar_mode elt_mode = GET_MODE_INNER (mode);
21131 rtx base, step;
21132 unsigned int n_elts;
21133 if (CONST_VECTOR_P (op)
21134 && CONST_VECTOR_DUPLICATE_P (op))
21135 n_elts = CONST_VECTOR_NPATTERNS (op);
21136 else if ((vec_flags & VEC_SVE_DATA)
21137 && const_vec_series_p (op, &base, &step))
21139 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21140 if (!aarch64_sve_index_immediate_p (base)
21141 || !aarch64_sve_index_immediate_p (step))
21142 return false;
21144 if (info)
21146 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21147 should yield two integer values per 128-bit block, meaning
21148 that we need to treat it in the same way as V2DI and then
21149 ignore the upper 32 bits of each element. */
21150 elt_mode = aarch64_sve_container_int_mode (mode);
21151 *info = simd_immediate_info (elt_mode, base, step);
21153 return true;
21155 else if (CONST_VECTOR_P (op)
21156 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21157 /* N_ELTS set above. */;
21158 else
21159 return false;
21161 scalar_float_mode elt_float_mode;
21162 if (n_elts == 1
21163 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21165 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21166 if (aarch64_float_const_zero_rtx_p (elt)
21167 || aarch64_float_const_representable_p (elt))
21169 if (info)
21170 *info = simd_immediate_info (elt_float_mode, elt);
21171 return true;
21175 /* If all elements in an SVE vector have the same value, we have a free
21176 choice between using the element mode and using the container mode.
21177 Using the element mode means that unused parts of the vector are
21178 duplicates of the used elements, while using the container mode means
21179 that the unused parts are an extension of the used elements. Using the
21180 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21181 for its container mode VNx4SI while 0x00000101 isn't.
21183 If not all elements in an SVE vector have the same value, we need the
21184 transition from one element to the next to occur at container boundaries.
21185 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21186 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21187 scalar_int_mode elt_int_mode;
21188 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21189 elt_int_mode = aarch64_sve_container_int_mode (mode);
21190 else
21191 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21193 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21194 if (elt_size > 8)
21195 return false;
21197 /* Expand the vector constant out into a byte vector, with the least
21198 significant byte of the register first. */
21199 auto_vec<unsigned char, 16> bytes;
21200 bytes.reserve (n_elts * elt_size);
21201 for (unsigned int i = 0; i < n_elts; i++)
21203 /* The vector is provided in gcc endian-neutral fashion.
21204 For aarch64_be Advanced SIMD, it must be laid out in the vector
21205 register in reverse order. */
21206 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21207 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21209 if (elt_mode != elt_int_mode)
21210 elt = gen_lowpart (elt_int_mode, elt);
21212 if (!CONST_INT_P (elt))
21213 return false;
21215 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21216 for (unsigned int byte = 0; byte < elt_size; byte++)
21218 bytes.quick_push (elt_val & 0xff);
21219 elt_val >>= BITS_PER_UNIT;
21223 /* The immediate must repeat every eight bytes. */
21224 unsigned int nbytes = bytes.length ();
21225 for (unsigned i = 8; i < nbytes; ++i)
21226 if (bytes[i] != bytes[i - 8])
21227 return false;
21229 /* Get the repeating 8-byte value as an integer. No endian correction
21230 is needed here because bytes is already in lsb-first order. */
21231 unsigned HOST_WIDE_INT val64 = 0;
21232 for (unsigned int i = 0; i < 8; i++)
21233 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21234 << (i * BITS_PER_UNIT));
21236 if (vec_flags & VEC_SVE_DATA)
21237 return aarch64_sve_valid_immediate (val64, info);
21238 else
21239 return aarch64_advsimd_valid_immediate (val64, info, which);
21242 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21243 has a step in the range of INDEX. Return the index expression if so,
21244 otherwise return null. */
21246 aarch64_check_zero_based_sve_index_immediate (rtx x)
21248 rtx base, step;
21249 if (const_vec_series_p (x, &base, &step)
21250 && base == const0_rtx
21251 && aarch64_sve_index_immediate_p (step))
21252 return step;
21253 return NULL_RTX;
21256 /* Check of immediate shift constants are within range. */
21257 bool
21258 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21260 x = unwrap_const_vec_duplicate (x);
21261 if (!CONST_INT_P (x))
21262 return false;
21263 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21264 if (left)
21265 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21266 else
21267 return IN_RANGE (INTVAL (x), 1, bit_width);
21270 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21271 operation of width WIDTH at bit position POS. */
21274 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21276 gcc_assert (CONST_INT_P (width));
21277 gcc_assert (CONST_INT_P (pos));
21279 unsigned HOST_WIDE_INT mask
21280 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21281 return GEN_INT (mask << UINTVAL (pos));
21284 bool
21285 aarch64_mov_operand_p (rtx x, machine_mode mode)
21287 if (GET_CODE (x) == HIGH
21288 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21289 return true;
21291 if (CONST_INT_P (x))
21292 return true;
21294 if (VECTOR_MODE_P (GET_MODE (x)))
21296 /* Require predicate constants to be VNx16BI before RA, so that we
21297 force everything to have a canonical form. */
21298 if (!lra_in_progress
21299 && !reload_completed
21300 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21301 && GET_MODE (x) != VNx16BImode)
21302 return false;
21304 return aarch64_simd_valid_immediate (x, NULL);
21307 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21308 x = strip_salt (x);
21310 /* GOT accesses are valid moves. */
21311 if (SYMBOL_REF_P (x)
21312 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21313 return true;
21315 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21316 return true;
21318 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21319 return true;
21321 return aarch64_classify_symbolic_expression (x)
21322 == SYMBOL_TINY_ABSOLUTE;
21325 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21326 the constant creation. */
21329 aarch64_gen_shareable_zero (machine_mode mode)
21331 machine_mode zmode = V4SImode;
21332 rtx tmp = gen_reg_rtx (zmode);
21333 emit_move_insn (tmp, CONST0_RTX (zmode));
21334 return lowpart_subreg (mode, tmp, zmode);
21337 /* Return a const_int vector of VAL. */
21339 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21341 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21342 return gen_const_vec_duplicate (mode, c);
21345 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21347 bool
21348 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21350 machine_mode vmode;
21352 vmode = aarch64_simd_container_mode (mode, 64);
21353 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21354 return aarch64_simd_valid_immediate (op_v, NULL);
21357 /* Construct and return a PARALLEL RTX vector with elements numbering the
21358 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21359 the vector - from the perspective of the architecture. This does not
21360 line up with GCC's perspective on lane numbers, so we end up with
21361 different masks depending on our target endian-ness. The diagram
21362 below may help. We must draw the distinction when building masks
21363 which select one half of the vector. An instruction selecting
21364 architectural low-lanes for a big-endian target, must be described using
21365 a mask selecting GCC high-lanes.
21367 Big-Endian Little-Endian
21369 GCC 0 1 2 3 3 2 1 0
21370 | x | x | x | x | | x | x | x | x |
21371 Architecture 3 2 1 0 3 2 1 0
21373 Low Mask: { 2, 3 } { 0, 1 }
21374 High Mask: { 0, 1 } { 2, 3 }
21376 MODE Is the mode of the vector and NUNITS is the number of units in it. */
21379 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21381 rtvec v = rtvec_alloc (nunits / 2);
21382 int high_base = nunits / 2;
21383 int low_base = 0;
21384 int base;
21385 rtx t1;
21386 int i;
21388 if (BYTES_BIG_ENDIAN)
21389 base = high ? low_base : high_base;
21390 else
21391 base = high ? high_base : low_base;
21393 for (i = 0; i < nunits / 2; i++)
21394 RTVEC_ELT (v, i) = GEN_INT (base + i);
21396 t1 = gen_rtx_PARALLEL (mode, v);
21397 return t1;
21400 /* Check OP for validity as a PARALLEL RTX vector with elements
21401 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21402 from the perspective of the architecture. See the diagram above
21403 aarch64_simd_vect_par_cnst_half for more details. */
21405 bool
21406 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21407 bool high)
21409 int nelts;
21410 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21411 return false;
21413 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21414 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21415 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21416 int i = 0;
21418 if (count_op != count_ideal)
21419 return false;
21421 for (i = 0; i < count_ideal; i++)
21423 rtx elt_op = XVECEXP (op, 0, i);
21424 rtx elt_ideal = XVECEXP (ideal, 0, i);
21426 if (!CONST_INT_P (elt_op)
21427 || INTVAL (elt_ideal) != INTVAL (elt_op))
21428 return false;
21430 return true;
21433 /* Return a PARALLEL containing NELTS elements, with element I equal
21434 to BASE + I * STEP. */
21437 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21439 rtvec vec = rtvec_alloc (nelts);
21440 for (unsigned int i = 0; i < nelts; ++i)
21441 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21442 return gen_rtx_PARALLEL (VOIDmode, vec);
21445 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21446 series with step STEP. */
21448 bool
21449 aarch64_stepped_int_parallel_p (rtx op, int step)
21451 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21452 return false;
21454 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21455 for (int i = 1; i < XVECLEN (op, 0); ++i)
21456 if (!CONST_INT_P (XVECEXP (op, 0, i))
21457 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21458 return false;
21460 return true;
21463 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21464 HIGH (exclusive). */
21465 void
21466 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21467 const_tree exp)
21469 HOST_WIDE_INT lane;
21470 gcc_assert (CONST_INT_P (operand));
21471 lane = INTVAL (operand);
21473 if (lane < low || lane >= high)
21475 if (exp)
21476 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21477 lane, low, high - 1);
21478 else
21479 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21483 /* Peform endian correction on lane number N, which indexes a vector
21484 of mode MODE, and return the result as an SImode rtx. */
21487 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21489 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
21492 /* Return TRUE if OP is a valid vector addressing mode. */
21494 bool
21495 aarch64_simd_mem_operand_p (rtx op)
21497 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21498 || REG_P (XEXP (op, 0)));
21501 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21503 bool
21504 aarch64_sve_ld1r_operand_p (rtx op)
21506 struct aarch64_address_info addr;
21507 scalar_mode mode;
21509 return (MEM_P (op)
21510 && is_a <scalar_mode> (GET_MODE (op), &mode)
21511 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21512 && addr.type == ADDRESS_REG_IMM
21513 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21516 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21517 where the size of the read data is specified by `mode` and the size of the
21518 vector elements are specified by `elem_mode`. */
21519 bool
21520 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21521 scalar_mode elem_mode)
21523 struct aarch64_address_info addr;
21524 if (!MEM_P (op)
21525 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21526 return false;
21528 if (addr.type == ADDRESS_REG_IMM)
21529 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21531 if (addr.type == ADDRESS_REG_REG)
21532 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21534 return false;
21537 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21538 bool
21539 aarch64_sve_ld1rq_operand_p (rtx op)
21541 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21542 GET_MODE_INNER (GET_MODE (op)));
21545 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21546 accessing a vector where the element size is specified by `elem_mode`. */
21547 bool
21548 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21550 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21553 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21554 bool
21555 aarch64_sve_ldff1_operand_p (rtx op)
21557 if (!MEM_P (op))
21558 return false;
21560 struct aarch64_address_info addr;
21561 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21562 return false;
21564 if (addr.type == ADDRESS_REG_IMM)
21565 return known_eq (addr.const_offset, 0);
21567 return addr.type == ADDRESS_REG_REG;
21570 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21571 bool
21572 aarch64_sve_ldnf1_operand_p (rtx op)
21574 struct aarch64_address_info addr;
21576 return (MEM_P (op)
21577 && aarch64_classify_address (&addr, XEXP (op, 0),
21578 GET_MODE (op), false)
21579 && addr.type == ADDRESS_REG_IMM);
21582 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21583 The conditions for STR are the same. */
21584 bool
21585 aarch64_sve_ldr_operand_p (rtx op)
21587 struct aarch64_address_info addr;
21589 return (MEM_P (op)
21590 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21591 false, ADDR_QUERY_ANY)
21592 && addr.type == ADDRESS_REG_IMM);
21595 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21596 addressing memory of mode MODE. */
21597 bool
21598 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21600 struct aarch64_address_info addr;
21601 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21602 return false;
21604 if (addr.type == ADDRESS_REG_IMM)
21605 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21607 return addr.type == ADDRESS_REG_REG;
21610 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21611 We need to be able to access the individual pieces, so the range
21612 is different from LD[234] and ST[234]. */
21613 bool
21614 aarch64_sve_struct_memory_operand_p (rtx op)
21616 if (!MEM_P (op))
21617 return false;
21619 machine_mode mode = GET_MODE (op);
21620 struct aarch64_address_info addr;
21621 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21622 ADDR_QUERY_ANY)
21623 || addr.type != ADDRESS_REG_IMM)
21624 return false;
21626 poly_int64 first = addr.const_offset;
21627 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21628 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21629 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21632 /* Emit a register copy from operand to operand, taking care not to
21633 early-clobber source registers in the process.
21635 COUNT is the number of components into which the copy needs to be
21636 decomposed. */
21637 void
21638 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21639 unsigned int count)
21641 unsigned int i;
21642 int rdest = REGNO (operands[0]);
21643 int rsrc = REGNO (operands[1]);
21645 if (!reg_overlap_mentioned_p (operands[0], operands[1])
21646 || rdest < rsrc)
21647 for (i = 0; i < count; i++)
21648 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21649 gen_rtx_REG (mode, rsrc + i));
21650 else
21651 for (i = 0; i < count; i++)
21652 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21653 gen_rtx_REG (mode, rsrc + count - i - 1));
21656 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21657 one of VSTRUCT modes: OI, CI, or XI. */
21659 aarch64_simd_attr_length_rglist (machine_mode mode)
21661 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21662 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21665 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21666 alignment of a vector to 128 bits. SVE predicates have an alignment of
21667 16 bits. */
21668 static HOST_WIDE_INT
21669 aarch64_simd_vector_alignment (const_tree type)
21671 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21672 be set for non-predicate vectors of booleans. Modes are the most
21673 direct way we have of identifying real SVE predicate types. */
21674 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21675 return 16;
21676 widest_int min_size
21677 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21678 return wi::umin (min_size, 128).to_uhwi ();
21681 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21682 static poly_uint64
21683 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21685 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21687 /* If the length of the vector is a fixed power of 2, try to align
21688 to that length, otherwise don't try to align at all. */
21689 HOST_WIDE_INT result;
21690 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21691 || !pow2p_hwi (result))
21692 result = TYPE_ALIGN (TREE_TYPE (type));
21693 return result;
21695 return TYPE_ALIGN (type);
21698 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21699 static bool
21700 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21702 if (is_packed)
21703 return false;
21705 /* For fixed-length vectors, check that the vectorizer will aim for
21706 full-vector alignment. This isn't true for generic GCC vectors
21707 that are wider than the ABI maximum of 128 bits. */
21708 poly_uint64 preferred_alignment =
21709 aarch64_vectorize_preferred_vector_alignment (type);
21710 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
21711 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
21712 preferred_alignment))
21713 return false;
21715 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
21716 return true;
21719 /* Return true if the vector misalignment factor is supported by the
21720 target. */
21721 static bool
21722 aarch64_builtin_support_vector_misalignment (machine_mode mode,
21723 const_tree type, int misalignment,
21724 bool is_packed)
21726 if (TARGET_SIMD && STRICT_ALIGNMENT)
21728 /* Return if movmisalign pattern is not supported for this mode. */
21729 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
21730 return false;
21732 /* Misalignment factor is unknown at compile time. */
21733 if (misalignment == -1)
21734 return false;
21736 return default_builtin_support_vector_misalignment (mode, type, misalignment,
21737 is_packed);
21740 /* If VALS is a vector constant that can be loaded into a register
21741 using DUP, generate instructions to do so and return an RTX to
21742 assign to the register. Otherwise return NULL_RTX. */
21743 static rtx
21744 aarch64_simd_dup_constant (rtx vals)
21746 machine_mode mode = GET_MODE (vals);
21747 machine_mode inner_mode = GET_MODE_INNER (mode);
21748 rtx x;
21750 if (!const_vec_duplicate_p (vals, &x))
21751 return NULL_RTX;
21753 /* We can load this constant by using DUP and a constant in a
21754 single ARM register. This will be cheaper than a vector
21755 load. */
21756 x = copy_to_mode_reg (inner_mode, x);
21757 return gen_vec_duplicate (mode, x);
21761 /* Generate code to load VALS, which is a PARALLEL containing only
21762 constants (for vec_init) or CONST_VECTOR, efficiently into a
21763 register. Returns an RTX to copy into the register, or NULL_RTX
21764 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
21765 static rtx
21766 aarch64_simd_make_constant (rtx vals)
21768 machine_mode mode = GET_MODE (vals);
21769 rtx const_dup;
21770 rtx const_vec = NULL_RTX;
21771 int n_const = 0;
21772 int i;
21774 if (CONST_VECTOR_P (vals))
21775 const_vec = vals;
21776 else if (GET_CODE (vals) == PARALLEL)
21778 /* A CONST_VECTOR must contain only CONST_INTs and
21779 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
21780 Only store valid constants in a CONST_VECTOR. */
21781 int n_elts = XVECLEN (vals, 0);
21782 for (i = 0; i < n_elts; ++i)
21784 rtx x = XVECEXP (vals, 0, i);
21785 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
21786 n_const++;
21788 if (n_const == n_elts)
21789 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
21791 else
21792 gcc_unreachable ();
21794 if (const_vec != NULL_RTX
21795 && aarch64_simd_valid_immediate (const_vec, NULL))
21796 /* Load using MOVI/MVNI. */
21797 return const_vec;
21798 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
21799 /* Loaded using DUP. */
21800 return const_dup;
21801 else if (const_vec != NULL_RTX)
21802 /* Load from constant pool. We cannot take advantage of single-cycle
21803 LD1 because we need a PC-relative addressing mode. */
21804 return const_vec;
21805 else
21806 /* A PARALLEL containing something not valid inside CONST_VECTOR.
21807 We cannot construct an initializer. */
21808 return NULL_RTX;
21811 /* Expand a vector initialisation sequence, such that TARGET is
21812 initialised to contain VALS. */
21814 void
21815 aarch64_expand_vector_init (rtx target, rtx vals)
21817 machine_mode mode = GET_MODE (target);
21818 scalar_mode inner_mode = GET_MODE_INNER (mode);
21819 /* The number of vector elements. */
21820 int n_elts = XVECLEN (vals, 0);
21821 /* The number of vector elements which are not constant. */
21822 int n_var = 0;
21823 rtx any_const = NULL_RTX;
21824 /* The first element of vals. */
21825 rtx v0 = XVECEXP (vals, 0, 0);
21826 bool all_same = true;
21828 /* This is a special vec_init<M><N> where N is not an element mode but a
21829 vector mode with half the elements of M. We expect to find two entries
21830 of mode N in VALS and we must put their concatentation into TARGET. */
21831 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
21833 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
21834 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
21835 && known_eq (GET_MODE_SIZE (mode),
21836 2 * GET_MODE_SIZE (narrow_mode)));
21837 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
21838 XVECEXP (vals, 0, 0),
21839 XVECEXP (vals, 0, 1)));
21840 return;
21843 /* Count the number of variable elements to initialise. */
21844 for (int i = 0; i < n_elts; ++i)
21846 rtx x = XVECEXP (vals, 0, i);
21847 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
21848 ++n_var;
21849 else
21850 any_const = x;
21852 all_same &= rtx_equal_p (x, v0);
21855 /* No variable elements, hand off to aarch64_simd_make_constant which knows
21856 how best to handle this. */
21857 if (n_var == 0)
21859 rtx constant = aarch64_simd_make_constant (vals);
21860 if (constant != NULL_RTX)
21862 emit_move_insn (target, constant);
21863 return;
21867 /* Splat a single non-constant element if we can. */
21868 if (all_same)
21870 rtx x = copy_to_mode_reg (inner_mode, v0);
21871 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
21872 return;
21875 enum insn_code icode = optab_handler (vec_set_optab, mode);
21876 gcc_assert (icode != CODE_FOR_nothing);
21878 /* If there are only variable elements, try to optimize
21879 the insertion using dup for the most common element
21880 followed by insertions. */
21882 /* The algorithm will fill matches[*][0] with the earliest matching element,
21883 and matches[X][1] with the count of duplicate elements (if X is the
21884 earliest element which has duplicates). */
21886 if (n_var == n_elts && n_elts <= 16)
21888 int matches[16][2] = {0};
21889 for (int i = 0; i < n_elts; i++)
21891 for (int j = 0; j <= i; j++)
21893 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
21895 matches[i][0] = j;
21896 matches[j][1]++;
21897 break;
21901 int maxelement = 0;
21902 int maxv = 0;
21903 for (int i = 0; i < n_elts; i++)
21904 if (matches[i][1] > maxv)
21906 maxelement = i;
21907 maxv = matches[i][1];
21910 /* Create a duplicate of the most common element, unless all elements
21911 are equally useless to us, in which case just immediately set the
21912 vector register using the first element. */
21914 if (maxv == 1)
21916 /* For vectors of two 64-bit elements, we can do even better. */
21917 if (n_elts == 2
21918 && (inner_mode == E_DImode
21919 || inner_mode == E_DFmode))
21922 rtx x0 = XVECEXP (vals, 0, 0);
21923 rtx x1 = XVECEXP (vals, 0, 1);
21924 /* Combine can pick up this case, but handling it directly
21925 here leaves clearer RTL.
21927 This is load_pair_lanes<mode>, and also gives us a clean-up
21928 for store_pair_lanes<mode>. */
21929 if (memory_operand (x0, inner_mode)
21930 && memory_operand (x1, inner_mode)
21931 && aarch64_mergeable_load_pair_p (mode, x0, x1))
21933 rtx t;
21934 if (inner_mode == DFmode)
21935 t = gen_load_pair_lanesdf (target, x0, x1);
21936 else
21937 t = gen_load_pair_lanesdi (target, x0, x1);
21938 emit_insn (t);
21939 return;
21942 /* The subreg-move sequence below will move into lane zero of the
21943 vector register. For big-endian we want that position to hold
21944 the last element of VALS. */
21945 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
21946 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
21947 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
21949 else
21951 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
21952 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
21955 /* Insert the rest. */
21956 for (int i = 0; i < n_elts; i++)
21958 rtx x = XVECEXP (vals, 0, i);
21959 if (matches[i][0] == maxelement)
21960 continue;
21961 x = copy_to_mode_reg (inner_mode, x);
21962 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
21964 return;
21967 /* Initialise a vector which is part-variable. We want to first try
21968 to build those lanes which are constant in the most efficient way we
21969 can. */
21970 if (n_var != n_elts)
21972 rtx copy = copy_rtx (vals);
21974 /* Load constant part of vector. We really don't care what goes into the
21975 parts we will overwrite, but we're more likely to be able to load the
21976 constant efficiently if it has fewer, larger, repeating parts
21977 (see aarch64_simd_valid_immediate). */
21978 for (int i = 0; i < n_elts; i++)
21980 rtx x = XVECEXP (vals, 0, i);
21981 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
21982 continue;
21983 rtx subst = any_const;
21984 for (int bit = n_elts / 2; bit > 0; bit /= 2)
21986 /* Look in the copied vector, as more elements are const. */
21987 rtx test = XVECEXP (copy, 0, i ^ bit);
21988 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
21990 subst = test;
21991 break;
21994 XVECEXP (copy, 0, i) = subst;
21996 aarch64_expand_vector_init (target, copy);
21999 /* Insert the variable lanes directly. */
22000 for (int i = 0; i < n_elts; i++)
22002 rtx x = XVECEXP (vals, 0, i);
22003 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22004 continue;
22005 x = copy_to_mode_reg (inner_mode, x);
22006 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22010 /* Emit RTL corresponding to:
22011 insr TARGET, ELEM. */
22013 static void
22014 emit_insr (rtx target, rtx elem)
22016 machine_mode mode = GET_MODE (target);
22017 scalar_mode elem_mode = GET_MODE_INNER (mode);
22018 elem = force_reg (elem_mode, elem);
22020 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22021 gcc_assert (icode != CODE_FOR_nothing);
22022 emit_insn (GEN_FCN (icode) (target, target, elem));
22025 /* Subroutine of aarch64_sve_expand_vector_init for handling
22026 trailing constants.
22027 This function works as follows:
22028 (a) Create a new vector consisting of trailing constants.
22029 (b) Initialize TARGET with the constant vector using emit_move_insn.
22030 (c) Insert remaining elements in TARGET using insr.
22031 NELTS is the total number of elements in original vector while
22032 while NELTS_REQD is the number of elements that are actually
22033 significant.
22035 ??? The heuristic used is to do above only if number of constants
22036 is at least half the total number of elements. May need fine tuning. */
22038 static bool
22039 aarch64_sve_expand_vector_init_handle_trailing_constants
22040 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22042 machine_mode mode = GET_MODE (target);
22043 scalar_mode elem_mode = GET_MODE_INNER (mode);
22044 int n_trailing_constants = 0;
22046 for (int i = nelts_reqd - 1;
22047 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22048 i--)
22049 n_trailing_constants++;
22051 if (n_trailing_constants >= nelts_reqd / 2)
22053 /* Try to use the natural pattern of BUILDER to extend the trailing
22054 constant elements to a full vector. Replace any variables in the
22055 extra elements with zeros.
22057 ??? It would be better if the builders supported "don't care"
22058 elements, with the builder filling in whichever elements
22059 give the most compact encoding. */
22060 rtx_vector_builder v (mode, nelts, 1);
22061 for (int i = 0; i < nelts; i++)
22063 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22064 if (!valid_for_const_vector_p (elem_mode, x))
22065 x = CONST0_RTX (elem_mode);
22066 v.quick_push (x);
22068 rtx const_vec = v.build ();
22069 emit_move_insn (target, const_vec);
22071 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22072 emit_insr (target, builder.elt (i));
22074 return true;
22077 return false;
22080 /* Subroutine of aarch64_sve_expand_vector_init.
22081 Works as follows:
22082 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22083 (b) Skip trailing elements from BUILDER, which are the same as
22084 element NELTS_REQD - 1.
22085 (c) Insert earlier elements in reverse order in TARGET using insr. */
22087 static void
22088 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22089 const rtx_vector_builder &builder,
22090 int nelts_reqd)
22092 machine_mode mode = GET_MODE (target);
22093 scalar_mode elem_mode = GET_MODE_INNER (mode);
22095 struct expand_operand ops[2];
22096 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22097 gcc_assert (icode != CODE_FOR_nothing);
22099 create_output_operand (&ops[0], target, mode);
22100 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22101 expand_insn (icode, 2, ops);
22103 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22104 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22105 emit_insr (target, builder.elt (i));
22108 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22109 when all trailing elements of builder are same.
22110 This works as follows:
22111 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22112 (b) Insert remaining elements in TARGET using insr.
22114 ??? The heuristic used is to do above if number of same trailing elements
22115 is at least 3/4 of total number of elements, loosely based on
22116 heuristic from mostly_zeros_p. May need fine-tuning. */
22118 static bool
22119 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22120 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22122 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22123 if (ndups >= (3 * nelts_reqd) / 4)
22125 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22126 nelts_reqd - ndups + 1);
22127 return true;
22130 return false;
22133 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22134 of elements in BUILDER.
22136 The function tries to initialize TARGET from BUILDER if it fits one
22137 of the special cases outlined below.
22139 Failing that, the function divides BUILDER into two sub-vectors:
22140 v_even = even elements of BUILDER;
22141 v_odd = odd elements of BUILDER;
22143 and recursively calls itself with v_even and v_odd.
22145 if (recursive call succeeded for v_even or v_odd)
22146 TARGET = zip (v_even, v_odd)
22148 The function returns true if it managed to build TARGET from BUILDER
22149 with one of the special cases, false otherwise.
22151 Example: {a, 1, b, 2, c, 3, d, 4}
22153 The vector gets divided into:
22154 v_even = {a, b, c, d}
22155 v_odd = {1, 2, 3, 4}
22157 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22158 initialize tmp2 from constant vector v_odd using emit_move_insn.
22160 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22161 4 elements, so we construct tmp1 from v_even using insr:
22162 tmp1 = dup(d)
22163 insr tmp1, c
22164 insr tmp1, b
22165 insr tmp1, a
22167 And finally:
22168 TARGET = zip (tmp1, tmp2)
22169 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22171 static bool
22172 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22173 int nelts, int nelts_reqd)
22175 machine_mode mode = GET_MODE (target);
22177 /* Case 1: Vector contains trailing constants. */
22179 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22180 (target, builder, nelts, nelts_reqd))
22181 return true;
22183 /* Case 2: Vector contains leading constants. */
22185 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22186 for (int i = 0; i < nelts_reqd; i++)
22187 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22188 rev_builder.finalize ();
22190 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22191 (target, rev_builder, nelts, nelts_reqd))
22193 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22194 return true;
22197 /* Case 3: Vector contains trailing same element. */
22199 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22200 (target, builder, nelts_reqd))
22201 return true;
22203 /* Case 4: Vector contains leading same element. */
22205 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22206 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22208 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22209 return true;
22212 /* Avoid recursing below 4-elements.
22213 ??? The threshold 4 may need fine-tuning. */
22215 if (nelts_reqd <= 4)
22216 return false;
22218 rtx_vector_builder v_even (mode, nelts, 1);
22219 rtx_vector_builder v_odd (mode, nelts, 1);
22221 for (int i = 0; i < nelts * 2; i += 2)
22223 v_even.quick_push (builder.elt (i));
22224 v_odd.quick_push (builder.elt (i + 1));
22227 v_even.finalize ();
22228 v_odd.finalize ();
22230 rtx tmp1 = gen_reg_rtx (mode);
22231 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22232 nelts, nelts_reqd / 2);
22234 rtx tmp2 = gen_reg_rtx (mode);
22235 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22236 nelts, nelts_reqd / 2);
22238 if (!did_even_p && !did_odd_p)
22239 return false;
22241 /* Initialize v_even and v_odd using INSR if it didn't match any of the
22242 special cases and zip v_even, v_odd. */
22244 if (!did_even_p)
22245 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22247 if (!did_odd_p)
22248 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22250 rtvec v = gen_rtvec (2, tmp1, tmp2);
22251 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22252 return true;
22255 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22257 void
22258 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22260 machine_mode mode = GET_MODE (target);
22261 int nelts = XVECLEN (vals, 0);
22263 rtx_vector_builder v (mode, nelts, 1);
22264 for (int i = 0; i < nelts; i++)
22265 v.quick_push (XVECEXP (vals, 0, i));
22266 v.finalize ();
22268 /* If neither sub-vectors of v could be initialized specially,
22269 then use INSR to insert all elements from v into TARGET.
22270 ??? This might not be optimal for vectors with large
22271 initializers like 16-element or above.
22272 For nelts < 4, it probably isn't useful to handle specially. */
22274 if (nelts < 4
22275 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22276 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22279 /* Check whether VALUE is a vector constant in which every element
22280 is either a power of 2 or a negated power of 2. If so, return
22281 a constant vector of log2s, and flip CODE between PLUS and MINUS
22282 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
22284 static rtx
22285 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22287 if (!CONST_VECTOR_P (value))
22288 return NULL_RTX;
22290 rtx_vector_builder builder;
22291 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22292 return NULL_RTX;
22294 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22295 /* 1 if the result of the multiplication must be negated,
22296 0 if it mustn't, or -1 if we don't yet care. */
22297 int negate = -1;
22298 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22299 for (unsigned int i = 0; i < encoded_nelts; ++i)
22301 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22302 if (!CONST_SCALAR_INT_P (elt))
22303 return NULL_RTX;
22304 rtx_mode_t val (elt, int_mode);
22305 wide_int pow2 = wi::neg (val);
22306 if (val != pow2)
22308 /* It matters whether we negate or not. Make that choice,
22309 and make sure that it's consistent with previous elements. */
22310 if (negate == !wi::neg_p (val))
22311 return NULL_RTX;
22312 negate = wi::neg_p (val);
22313 if (!negate)
22314 pow2 = val;
22316 /* POW2 is now the value that we want to be a power of 2. */
22317 int shift = wi::exact_log2 (pow2);
22318 if (shift < 0)
22319 return NULL_RTX;
22320 builder.quick_push (gen_int_mode (shift, int_mode));
22322 if (negate == -1)
22323 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22324 code = PLUS;
22325 else if (negate == 1)
22326 code = code == PLUS ? MINUS : PLUS;
22327 return builder.build ();
22330 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22331 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22332 operands array, in the same order as for fma_optab. Return true if
22333 the function emitted all the necessary instructions, false if the caller
22334 should generate the pattern normally with the new OPERANDS array. */
22336 bool
22337 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22339 machine_mode mode = GET_MODE (operands[0]);
22340 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22342 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22343 NULL_RTX, true, OPTAB_DIRECT);
22344 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22345 operands[3], product, operands[0], true,
22346 OPTAB_DIRECT);
22347 return true;
22349 operands[2] = force_reg (mode, operands[2]);
22350 return false;
22353 /* Likewise, but for a conditional pattern. */
22355 bool
22356 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22358 machine_mode mode = GET_MODE (operands[0]);
22359 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22361 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22362 NULL_RTX, true, OPTAB_DIRECT);
22363 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22364 operands[4], product, operands[5]));
22365 return true;
22367 operands[3] = force_reg (mode, operands[3]);
22368 return false;
22371 static unsigned HOST_WIDE_INT
22372 aarch64_shift_truncation_mask (machine_mode mode)
22374 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22375 return 0;
22376 return GET_MODE_UNIT_BITSIZE (mode) - 1;
22379 /* Select a format to encode pointers in exception handling data. */
22381 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22383 int type;
22384 switch (aarch64_cmodel)
22386 case AARCH64_CMODEL_TINY:
22387 case AARCH64_CMODEL_TINY_PIC:
22388 case AARCH64_CMODEL_SMALL:
22389 case AARCH64_CMODEL_SMALL_PIC:
22390 case AARCH64_CMODEL_SMALL_SPIC:
22391 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22392 for everything. */
22393 type = DW_EH_PE_sdata4;
22394 break;
22395 default:
22396 /* No assumptions here. 8-byte relocs required. */
22397 type = DW_EH_PE_sdata8;
22398 break;
22400 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22403 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22405 static void
22406 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22408 if (TREE_CODE (decl) == FUNCTION_DECL)
22410 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22411 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22413 fprintf (stream, "\t.variant_pcs\t");
22414 assemble_name (stream, name);
22415 fprintf (stream, "\n");
22420 /* The last .arch and .tune assembly strings that we printed. */
22421 static std::string aarch64_last_printed_arch_string;
22422 static std::string aarch64_last_printed_tune_string;
22424 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22425 by the function fndecl. */
22427 void
22428 aarch64_declare_function_name (FILE *stream, const char* name,
22429 tree fndecl)
22431 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22433 struct cl_target_option *targ_options;
22434 if (target_parts)
22435 targ_options = TREE_TARGET_OPTION (target_parts);
22436 else
22437 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22438 gcc_assert (targ_options);
22440 const struct processor *this_arch
22441 = aarch64_get_arch (targ_options->x_selected_arch);
22443 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
22444 std::string extension
22445 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22446 this_arch->flags);
22447 /* Only update the assembler .arch string if it is distinct from the last
22448 such string we printed. */
22449 std::string to_print = this_arch->name + extension;
22450 if (to_print != aarch64_last_printed_arch_string)
22452 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22453 aarch64_last_printed_arch_string = to_print;
22456 /* Print the cpu name we're tuning for in the comments, might be
22457 useful to readers of the generated asm. Do it only when it changes
22458 from function to function and verbose assembly is requested. */
22459 const struct processor *this_tune
22460 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
22462 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22464 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22465 this_tune->name);
22466 aarch64_last_printed_tune_string = this_tune->name;
22469 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22471 /* Don't forget the type directive for ELF. */
22472 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22473 ASM_OUTPUT_LABEL (stream, name);
22475 cfun->machine->label_is_assembled = true;
22478 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
22479 the function label and emit a BTI if necessary. */
22481 void
22482 aarch64_print_patchable_function_entry (FILE *file,
22483 unsigned HOST_WIDE_INT patch_area_size,
22484 bool record_p)
22486 if (cfun->machine->label_is_assembled
22487 && aarch64_bti_enabled ()
22488 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
22490 /* Remove the BTI that follows the patch area and insert a new BTI
22491 before the patch area right after the function label. */
22492 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22493 if (insn
22494 && INSN_P (insn)
22495 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
22496 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
22497 delete_insn (insn);
22498 asm_fprintf (file, "\thint\t34 // bti c\n");
22501 default_print_patchable_function_entry (file, patch_area_size, record_p);
22504 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22506 void
22507 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22509 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22510 const char *value = IDENTIFIER_POINTER (target);
22511 aarch64_asm_output_variant_pcs (stream, decl, name);
22512 ASM_OUTPUT_DEF (stream, name, value);
22515 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22516 function symbol references. */
22518 void
22519 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22521 default_elf_asm_output_external (stream, decl, name);
22522 aarch64_asm_output_variant_pcs (stream, decl, name);
22525 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22526 Used to output the .cfi_b_key_frame directive when signing the current
22527 function with the B key. */
22529 void
22530 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22532 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22533 && aarch64_ra_sign_key == AARCH64_KEY_B)
22534 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22537 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22539 static void
22540 aarch64_start_file (void)
22542 struct cl_target_option *default_options
22543 = TREE_TARGET_OPTION (target_option_default_node);
22545 const struct processor *default_arch
22546 = aarch64_get_arch (default_options->x_selected_arch);
22547 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
22548 std::string extension
22549 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22550 default_arch->flags);
22552 aarch64_last_printed_arch_string = default_arch->name + extension;
22553 aarch64_last_printed_tune_string = "";
22554 asm_fprintf (asm_out_file, "\t.arch %s\n",
22555 aarch64_last_printed_arch_string.c_str ());
22557 default_file_start ();
22560 /* Emit load exclusive. */
22562 static void
22563 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22564 rtx mem, rtx model_rtx)
22566 if (mode == TImode)
22567 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22568 gen_highpart (DImode, rval),
22569 mem, model_rtx));
22570 else
22571 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22574 /* Emit store exclusive. */
22576 static void
22577 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22578 rtx mem, rtx rval, rtx model_rtx)
22580 if (mode == TImode)
22581 emit_insn (gen_aarch64_store_exclusive_pair
22582 (bval, mem, operand_subword (rval, 0, 0, TImode),
22583 operand_subword (rval, 1, 0, TImode), model_rtx));
22584 else
22585 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22588 /* Mark the previous jump instruction as unlikely. */
22590 static void
22591 aarch64_emit_unlikely_jump (rtx insn)
22593 rtx_insn *jump = emit_jump_insn (insn);
22594 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22597 /* We store the names of the various atomic helpers in a 5x5 array.
22598 Return the libcall function given MODE, MODEL and NAMES. */
22601 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
22602 const atomic_ool_names *names)
22604 memmodel model = memmodel_from_int (INTVAL (model_rtx));
22605 int mode_idx, model_idx;
22607 switch (mode)
22609 case E_QImode:
22610 mode_idx = 0;
22611 break;
22612 case E_HImode:
22613 mode_idx = 1;
22614 break;
22615 case E_SImode:
22616 mode_idx = 2;
22617 break;
22618 case E_DImode:
22619 mode_idx = 3;
22620 break;
22621 case E_TImode:
22622 mode_idx = 4;
22623 break;
22624 default:
22625 gcc_unreachable ();
22628 switch (model)
22630 case MEMMODEL_RELAXED:
22631 model_idx = 0;
22632 break;
22633 case MEMMODEL_CONSUME:
22634 case MEMMODEL_ACQUIRE:
22635 model_idx = 1;
22636 break;
22637 case MEMMODEL_RELEASE:
22638 model_idx = 2;
22639 break;
22640 case MEMMODEL_ACQ_REL:
22641 case MEMMODEL_SEQ_CST:
22642 model_idx = 3;
22643 break;
22644 case MEMMODEL_SYNC_ACQUIRE:
22645 case MEMMODEL_SYNC_RELEASE:
22646 case MEMMODEL_SYNC_SEQ_CST:
22647 model_idx = 4;
22648 break;
22649 default:
22650 gcc_unreachable ();
22653 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
22654 VISIBILITY_HIDDEN);
22657 #define DEF0(B, N) \
22658 { "__aarch64_" #B #N "_relax", \
22659 "__aarch64_" #B #N "_acq", \
22660 "__aarch64_" #B #N "_rel", \
22661 "__aarch64_" #B #N "_acq_rel", \
22662 "__aarch64_" #B #N "_sync" }
22664 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
22665 { NULL, NULL, NULL, NULL }
22666 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
22668 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
22669 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
22670 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
22671 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
22672 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
22673 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
22675 #undef DEF0
22676 #undef DEF4
22677 #undef DEF5
22679 /* Expand a compare and swap pattern. */
22681 void
22682 aarch64_expand_compare_and_swap (rtx operands[])
22684 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
22685 machine_mode mode, r_mode;
22687 bval = operands[0];
22688 rval = operands[1];
22689 mem = operands[2];
22690 oldval = operands[3];
22691 newval = operands[4];
22692 is_weak = operands[5];
22693 mod_s = operands[6];
22694 mod_f = operands[7];
22695 mode = GET_MODE (mem);
22697 /* Normally the succ memory model must be stronger than fail, but in the
22698 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
22699 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
22700 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
22701 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
22702 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
22704 r_mode = mode;
22705 if (mode == QImode || mode == HImode)
22707 r_mode = SImode;
22708 rval = gen_reg_rtx (r_mode);
22711 if (TARGET_LSE)
22713 /* The CAS insn requires oldval and rval overlap, but we need to
22714 have a copy of oldval saved across the operation to tell if
22715 the operation is successful. */
22716 if (reg_overlap_mentioned_p (rval, oldval))
22717 rval = copy_to_mode_reg (r_mode, oldval);
22718 else
22719 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
22721 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
22722 newval, mod_s));
22723 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22725 else if (TARGET_OUTLINE_ATOMICS)
22727 /* Oldval must satisfy compare afterward. */
22728 if (!aarch64_plus_operand (oldval, mode))
22729 oldval = force_reg (mode, oldval);
22730 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
22731 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
22732 oldval, mode, newval, mode,
22733 XEXP (mem, 0), Pmode);
22734 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22736 else
22738 /* The oldval predicate varies by mode. Test it and force to reg. */
22739 insn_code code = code_for_aarch64_compare_and_swap (mode);
22740 if (!insn_data[code].operand[2].predicate (oldval, mode))
22741 oldval = force_reg (mode, oldval);
22743 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
22744 is_weak, mod_s, mod_f));
22745 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
22748 if (r_mode != mode)
22749 rval = gen_lowpart (mode, rval);
22750 emit_move_insn (operands[1], rval);
22752 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
22753 emit_insn (gen_rtx_SET (bval, x));
22756 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
22757 sequence implementing an atomic operation. */
22759 static void
22760 aarch64_emit_post_barrier (enum memmodel model)
22762 const enum memmodel base_model = memmodel_base (model);
22764 if (is_mm_sync (model)
22765 && (base_model == MEMMODEL_ACQUIRE
22766 || base_model == MEMMODEL_ACQ_REL
22767 || base_model == MEMMODEL_SEQ_CST))
22769 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
22773 /* Split a compare and swap pattern. */
22775 void
22776 aarch64_split_compare_and_swap (rtx operands[])
22778 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
22779 gcc_assert (epilogue_completed);
22781 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
22782 machine_mode mode;
22783 bool is_weak;
22784 rtx_code_label *label1, *label2;
22785 enum memmodel model;
22787 rval = operands[0];
22788 mem = operands[1];
22789 oldval = operands[2];
22790 newval = operands[3];
22791 is_weak = (operands[4] != const0_rtx);
22792 model_rtx = operands[5];
22793 scratch = operands[7];
22794 mode = GET_MODE (mem);
22795 model = memmodel_from_int (INTVAL (model_rtx));
22797 /* When OLDVAL is zero and we want the strong version we can emit a tighter
22798 loop:
22799 .label1:
22800 LD[A]XR rval, [mem]
22801 CBNZ rval, .label2
22802 ST[L]XR scratch, newval, [mem]
22803 CBNZ scratch, .label1
22804 .label2:
22805 CMP rval, 0. */
22806 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
22807 oldval == const0_rtx && mode != TImode);
22809 label1 = NULL;
22810 if (!is_weak)
22812 label1 = gen_label_rtx ();
22813 emit_label (label1);
22815 label2 = gen_label_rtx ();
22817 /* The initial load can be relaxed for a __sync operation since a final
22818 barrier will be emitted to stop code hoisting. */
22819 if (is_mm_sync (model))
22820 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
22821 else
22822 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
22824 if (strong_zero_p)
22825 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
22826 else
22828 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22829 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
22831 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
22832 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
22833 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
22835 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
22837 if (!is_weak)
22839 if (aarch64_track_speculation)
22841 /* Emit an explicit compare instruction, so that we can correctly
22842 track the condition codes. */
22843 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
22844 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
22846 else
22847 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
22849 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
22850 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
22851 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
22853 else
22854 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
22856 emit_label (label2);
22858 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
22859 to set the condition flags. If this is not used it will be removed by
22860 later passes. */
22861 if (strong_zero_p)
22862 aarch64_gen_compare_reg (NE, rval, const0_rtx);
22864 /* Emit any final barrier needed for a __sync operation. */
22865 if (is_mm_sync (model))
22866 aarch64_emit_post_barrier (model);
22869 /* Split an atomic operation. */
22871 void
22872 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
22873 rtx value, rtx model_rtx, rtx cond)
22875 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
22876 gcc_assert (epilogue_completed);
22878 machine_mode mode = GET_MODE (mem);
22879 machine_mode wmode = (mode == DImode ? DImode : SImode);
22880 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
22881 const bool is_sync = is_mm_sync (model);
22882 rtx_code_label *label;
22883 rtx x;
22885 /* Split the atomic operation into a sequence. */
22886 label = gen_label_rtx ();
22887 emit_label (label);
22889 if (new_out)
22890 new_out = gen_lowpart (wmode, new_out);
22891 if (old_out)
22892 old_out = gen_lowpart (wmode, old_out);
22893 else
22894 old_out = new_out;
22895 value = simplify_gen_subreg (wmode, value, mode, 0);
22897 /* The initial load can be relaxed for a __sync operation since a final
22898 barrier will be emitted to stop code hoisting. */
22899 if (is_sync)
22900 aarch64_emit_load_exclusive (mode, old_out, mem,
22901 GEN_INT (MEMMODEL_RELAXED));
22902 else
22903 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
22905 switch (code)
22907 case SET:
22908 new_out = value;
22909 break;
22911 case NOT:
22912 x = gen_rtx_AND (wmode, old_out, value);
22913 emit_insn (gen_rtx_SET (new_out, x));
22914 x = gen_rtx_NOT (wmode, new_out);
22915 emit_insn (gen_rtx_SET (new_out, x));
22916 break;
22918 case MINUS:
22919 if (CONST_INT_P (value))
22921 value = GEN_INT (-UINTVAL (value));
22922 code = PLUS;
22924 /* Fall through. */
22926 default:
22927 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
22928 emit_insn (gen_rtx_SET (new_out, x));
22929 break;
22932 aarch64_emit_store_exclusive (mode, cond, mem,
22933 gen_lowpart (mode, new_out), model_rtx);
22935 if (aarch64_track_speculation)
22937 /* Emit an explicit compare instruction, so that we can correctly
22938 track the condition codes. */
22939 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
22940 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
22942 else
22943 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
22945 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
22946 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
22947 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
22949 /* Emit any final barrier needed for a __sync operation. */
22950 if (is_sync)
22951 aarch64_emit_post_barrier (model);
22954 static void
22955 aarch64_init_libfuncs (void)
22957 /* Half-precision float operations. The compiler handles all operations
22958 with NULL libfuncs by converting to SFmode. */
22960 /* Conversions. */
22961 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
22962 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
22964 /* Arithmetic. */
22965 set_optab_libfunc (add_optab, HFmode, NULL);
22966 set_optab_libfunc (sdiv_optab, HFmode, NULL);
22967 set_optab_libfunc (smul_optab, HFmode, NULL);
22968 set_optab_libfunc (neg_optab, HFmode, NULL);
22969 set_optab_libfunc (sub_optab, HFmode, NULL);
22971 /* Comparisons. */
22972 set_optab_libfunc (eq_optab, HFmode, NULL);
22973 set_optab_libfunc (ne_optab, HFmode, NULL);
22974 set_optab_libfunc (lt_optab, HFmode, NULL);
22975 set_optab_libfunc (le_optab, HFmode, NULL);
22976 set_optab_libfunc (ge_optab, HFmode, NULL);
22977 set_optab_libfunc (gt_optab, HFmode, NULL);
22978 set_optab_libfunc (unord_optab, HFmode, NULL);
22981 /* Target hook for c_mode_for_suffix. */
22982 static machine_mode
22983 aarch64_c_mode_for_suffix (char suffix)
22985 if (suffix == 'q')
22986 return TFmode;
22988 return VOIDmode;
22991 /* We can only represent floating point constants which will fit in
22992 "quarter-precision" values. These values are characterised by
22993 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
22996 (-1)^s * (n/16) * 2^r
22998 Where:
22999 's' is the sign bit.
23000 'n' is an integer in the range 16 <= n <= 31.
23001 'r' is an integer in the range -3 <= r <= 4. */
23003 /* Return true iff X can be represented by a quarter-precision
23004 floating point immediate operand X. Note, we cannot represent 0.0. */
23005 bool
23006 aarch64_float_const_representable_p (rtx x)
23008 /* This represents our current view of how many bits
23009 make up the mantissa. */
23010 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23011 int exponent;
23012 unsigned HOST_WIDE_INT mantissa, mask;
23013 REAL_VALUE_TYPE r, m;
23014 bool fail;
23016 x = unwrap_const_vec_duplicate (x);
23017 if (!CONST_DOUBLE_P (x))
23018 return false;
23020 if (GET_MODE (x) == VOIDmode
23021 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23022 return false;
23024 r = *CONST_DOUBLE_REAL_VALUE (x);
23026 /* We cannot represent infinities, NaNs or +/-zero. We won't
23027 know if we have +zero until we analyse the mantissa, but we
23028 can reject the other invalid values. */
23029 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23030 || REAL_VALUE_MINUS_ZERO (r))
23031 return false;
23033 /* Extract exponent. */
23034 r = real_value_abs (&r);
23035 exponent = REAL_EXP (&r);
23037 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23038 highest (sign) bit, with a fixed binary point at bit point_pos.
23039 m1 holds the low part of the mantissa, m2 the high part.
23040 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23041 bits for the mantissa, this can fail (low bits will be lost). */
23042 real_ldexp (&m, &r, point_pos - exponent);
23043 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23045 /* If the low part of the mantissa has bits set we cannot represent
23046 the value. */
23047 if (w.ulow () != 0)
23048 return false;
23049 /* We have rejected the lower HOST_WIDE_INT, so update our
23050 understanding of how many bits lie in the mantissa and
23051 look only at the high HOST_WIDE_INT. */
23052 mantissa = w.elt (1);
23053 point_pos -= HOST_BITS_PER_WIDE_INT;
23055 /* We can only represent values with a mantissa of the form 1.xxxx. */
23056 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23057 if ((mantissa & mask) != 0)
23058 return false;
23060 /* Having filtered unrepresentable values, we may now remove all
23061 but the highest 5 bits. */
23062 mantissa >>= point_pos - 5;
23064 /* We cannot represent the value 0.0, so reject it. This is handled
23065 elsewhere. */
23066 if (mantissa == 0)
23067 return false;
23069 /* Then, as bit 4 is always set, we can mask it off, leaving
23070 the mantissa in the range [0, 15]. */
23071 mantissa &= ~(1 << 4);
23072 gcc_assert (mantissa <= 15);
23074 /* GCC internally does not use IEEE754-like encoding (where normalized
23075 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23076 Our mantissa values are shifted 4 places to the left relative to
23077 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23078 by 5 places to correct for GCC's representation. */
23079 exponent = 5 - exponent;
23081 return (exponent >= 0 && exponent <= 7);
23084 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23085 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23086 output MOVI/MVNI, ORR or BIC immediate. */
23087 char*
23088 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23089 enum simd_immediate_check which)
23091 bool is_valid;
23092 static char templ[40];
23093 const char *mnemonic;
23094 const char *shift_op;
23095 unsigned int lane_count = 0;
23096 char element_char;
23098 struct simd_immediate_info info;
23100 /* This will return true to show const_vector is legal for use as either
23101 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23102 It will also update INFO to show how the immediate should be generated.
23103 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23104 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23105 gcc_assert (is_valid);
23107 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23108 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23110 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23112 gcc_assert (info.insn == simd_immediate_info::MOV
23113 && info.u.mov.shift == 0);
23114 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23115 move immediate path. */
23116 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23117 info.u.mov.value = GEN_INT (0);
23118 else
23120 const unsigned int buf_size = 20;
23121 char float_buf[buf_size] = {'\0'};
23122 real_to_decimal_for_mode (float_buf,
23123 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23124 buf_size, buf_size, 1, info.elt_mode);
23126 if (lane_count == 1)
23127 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23128 else
23129 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23130 lane_count, element_char, float_buf);
23131 return templ;
23135 gcc_assert (CONST_INT_P (info.u.mov.value));
23137 if (which == AARCH64_CHECK_MOV)
23139 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23140 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23141 ? "msl" : "lsl");
23142 if (lane_count == 1)
23143 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23144 mnemonic, UINTVAL (info.u.mov.value));
23145 else if (info.u.mov.shift)
23146 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23147 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23148 element_char, UINTVAL (info.u.mov.value), shift_op,
23149 info.u.mov.shift);
23150 else
23151 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23152 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23153 element_char, UINTVAL (info.u.mov.value));
23155 else
23157 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23158 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23159 if (info.u.mov.shift)
23160 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23161 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23162 element_char, UINTVAL (info.u.mov.value), "lsl",
23163 info.u.mov.shift);
23164 else
23165 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23166 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23167 element_char, UINTVAL (info.u.mov.value));
23169 return templ;
23172 char*
23173 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23176 /* If a floating point number was passed and we desire to use it in an
23177 integer mode do the conversion to integer. */
23178 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23180 unsigned HOST_WIDE_INT ival;
23181 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23182 gcc_unreachable ();
23183 immediate = gen_int_mode (ival, mode);
23186 machine_mode vmode;
23187 /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
23188 a 128 bit vector mode. */
23189 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23191 vmode = aarch64_simd_container_mode (mode, width);
23192 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23193 return aarch64_output_simd_mov_immediate (v_op, width);
23196 /* Return the output string to use for moving immediate CONST_VECTOR
23197 into an SVE register. */
23199 char *
23200 aarch64_output_sve_mov_immediate (rtx const_vector)
23202 static char templ[40];
23203 struct simd_immediate_info info;
23204 char element_char;
23206 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23207 gcc_assert (is_valid);
23209 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23211 machine_mode vec_mode = GET_MODE (const_vector);
23212 if (aarch64_sve_pred_mode_p (vec_mode))
23214 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23215 if (info.insn == simd_immediate_info::MOV)
23217 gcc_assert (info.u.mov.value == const0_rtx);
23218 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23220 else
23222 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23223 unsigned int total_bytes;
23224 if (info.u.pattern == AARCH64_SV_ALL
23225 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23226 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23227 total_bytes / GET_MODE_SIZE (info.elt_mode));
23228 else
23229 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23230 svpattern_token (info.u.pattern));
23232 return buf;
23235 if (info.insn == simd_immediate_info::INDEX)
23237 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23238 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23239 element_char, INTVAL (info.u.index.base),
23240 INTVAL (info.u.index.step));
23241 return templ;
23244 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23246 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23247 info.u.mov.value = GEN_INT (0);
23248 else
23250 const int buf_size = 20;
23251 char float_buf[buf_size] = {};
23252 real_to_decimal_for_mode (float_buf,
23253 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23254 buf_size, buf_size, 1, info.elt_mode);
23256 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23257 element_char, float_buf);
23258 return templ;
23262 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23263 element_char, INTVAL (info.u.mov.value));
23264 return templ;
23267 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
23268 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23269 pattern. */
23271 char *
23272 aarch64_output_sve_ptrues (rtx const_unspec)
23274 static char templ[40];
23276 struct simd_immediate_info info;
23277 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23278 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23280 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23281 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23282 svpattern_token (info.u.pattern));
23283 return templ;
23286 /* Split operands into moves from op[1] + op[2] into op[0]. */
23288 void
23289 aarch64_split_combinev16qi (rtx operands[3])
23291 unsigned int dest = REGNO (operands[0]);
23292 unsigned int src1 = REGNO (operands[1]);
23293 unsigned int src2 = REGNO (operands[2]);
23294 machine_mode halfmode = GET_MODE (operands[1]);
23295 unsigned int halfregs = REG_NREGS (operands[1]);
23296 rtx destlo, desthi;
23298 gcc_assert (halfmode == V16QImode);
23300 if (src1 == dest && src2 == dest + halfregs)
23302 /* No-op move. Can't split to nothing; emit something. */
23303 emit_note (NOTE_INSN_DELETED);
23304 return;
23307 /* Preserve register attributes for variable tracking. */
23308 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23309 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23310 GET_MODE_SIZE (halfmode));
23312 /* Special case of reversed high/low parts. */
23313 if (reg_overlap_mentioned_p (operands[2], destlo)
23314 && reg_overlap_mentioned_p (operands[1], desthi))
23316 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23317 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23318 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23320 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23322 /* Try to avoid unnecessary moves if part of the result
23323 is in the right place already. */
23324 if (src1 != dest)
23325 emit_move_insn (destlo, operands[1]);
23326 if (src2 != dest + halfregs)
23327 emit_move_insn (desthi, operands[2]);
23329 else
23331 if (src2 != dest + halfregs)
23332 emit_move_insn (desthi, operands[2]);
23333 if (src1 != dest)
23334 emit_move_insn (destlo, operands[1]);
23338 /* vec_perm support. */
23340 struct expand_vec_perm_d
23342 rtx target, op0, op1;
23343 vec_perm_indices perm;
23344 machine_mode vmode;
23345 machine_mode op_mode;
23346 unsigned int vec_flags;
23347 unsigned int op_vec_flags;
23348 bool one_vector_p;
23349 bool testing_p;
23352 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23354 /* Generate a variable permutation. */
23356 static void
23357 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23359 machine_mode vmode = GET_MODE (target);
23360 bool one_vector_p = rtx_equal_p (op0, op1);
23362 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23363 gcc_checking_assert (GET_MODE (op0) == vmode);
23364 gcc_checking_assert (GET_MODE (op1) == vmode);
23365 gcc_checking_assert (GET_MODE (sel) == vmode);
23366 gcc_checking_assert (TARGET_SIMD);
23368 if (one_vector_p)
23370 if (vmode == V8QImode)
23372 /* Expand the argument to a V16QI mode by duplicating it. */
23373 rtx pair = gen_reg_rtx (V16QImode);
23374 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23375 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23377 else
23379 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23382 else
23384 rtx pair;
23386 if (vmode == V8QImode)
23388 pair = gen_reg_rtx (V16QImode);
23389 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23390 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23392 else
23394 pair = gen_reg_rtx (V2x16QImode);
23395 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23396 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23401 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23402 NELT is the number of elements in the vector. */
23404 void
23405 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23406 unsigned int nelt)
23408 machine_mode vmode = GET_MODE (target);
23409 bool one_vector_p = rtx_equal_p (op0, op1);
23410 rtx mask;
23412 /* The TBL instruction does not use a modulo index, so we must take care
23413 of that ourselves. */
23414 mask = aarch64_simd_gen_const_vector_dup (vmode,
23415 one_vector_p ? nelt - 1 : 2 * nelt - 1);
23416 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23418 /* For big-endian, we also need to reverse the index within the vector
23419 (but not which vector). */
23420 if (BYTES_BIG_ENDIAN)
23422 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23423 if (!one_vector_p)
23424 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23425 sel = expand_simple_binop (vmode, XOR, sel, mask,
23426 NULL, 0, OPTAB_LIB_WIDEN);
23428 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
23431 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23433 static void
23434 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23436 emit_insn (gen_rtx_SET (target,
23437 gen_rtx_UNSPEC (GET_MODE (target),
23438 gen_rtvec (2, op0, op1), code)));
23441 /* Expand an SVE vec_perm with the given operands. */
23443 void
23444 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23446 machine_mode data_mode = GET_MODE (target);
23447 machine_mode sel_mode = GET_MODE (sel);
23448 /* Enforced by the pattern condition. */
23449 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23451 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23452 size of the two value vectors, i.e. the upper bits of the indices
23453 are effectively ignored. SVE TBL instead produces 0 for any
23454 out-of-range indices, so we need to modulo all the vec_perm indices
23455 to ensure they are all in range. */
23456 rtx sel_reg = force_reg (sel_mode, sel);
23458 /* Check if the sel only references the first values vector. */
23459 if (CONST_VECTOR_P (sel)
23460 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23462 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23463 return;
23466 /* Check if the two values vectors are the same. */
23467 if (rtx_equal_p (op0, op1))
23469 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23470 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23471 NULL, 0, OPTAB_DIRECT);
23472 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23473 return;
23476 /* Run TBL on for each value vector and combine the results. */
23478 rtx res0 = gen_reg_rtx (data_mode);
23479 rtx res1 = gen_reg_rtx (data_mode);
23480 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23481 if (!CONST_VECTOR_P (sel)
23482 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23484 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23485 2 * nunits - 1);
23486 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23487 NULL, 0, OPTAB_DIRECT);
23489 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23490 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23491 NULL, 0, OPTAB_DIRECT);
23492 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23493 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23494 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23495 else
23496 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
23499 /* Recognize patterns suitable for the TRN instructions. */
23500 static bool
23501 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23503 HOST_WIDE_INT odd;
23504 poly_uint64 nelt = d->perm.length ();
23505 rtx out, in0, in1, x;
23506 machine_mode vmode = d->vmode;
23508 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23509 return false;
23511 /* Note that these are little-endian tests.
23512 We correct for big-endian later. */
23513 if (!d->perm[0].is_constant (&odd)
23514 || (odd != 0 && odd != 1)
23515 || !d->perm.series_p (0, 2, odd, 2)
23516 || !d->perm.series_p (1, 2, nelt + odd, 2))
23517 return false;
23519 /* Success! */
23520 if (d->testing_p)
23521 return true;
23523 in0 = d->op0;
23524 in1 = d->op1;
23525 /* We don't need a big-endian lane correction for SVE; see the comment
23526 at the head of aarch64-sve.md for details. */
23527 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23529 x = in0, in0 = in1, in1 = x;
23530 odd = !odd;
23532 out = d->target;
23534 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23535 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23536 return true;
23539 /* Try to re-encode the PERM constant so it combines odd and even elements.
23540 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23541 We retry with this new constant with the full suite of patterns. */
23542 static bool
23543 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23545 expand_vec_perm_d newd;
23546 unsigned HOST_WIDE_INT nelt;
23548 if (d->vec_flags != VEC_ADVSIMD)
23549 return false;
23551 /* Get the new mode. Always twice the size of the inner
23552 and half the elements. */
23553 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23554 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23555 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23556 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23558 if (new_mode == word_mode)
23559 return false;
23561 /* to_constant is safe since this routine is specific to Advanced SIMD
23562 vectors. */
23563 nelt = d->perm.length ().to_constant ();
23565 vec_perm_builder newpermconst;
23566 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23568 /* Convert the perm constant if we can. Require even, odd as the pairs. */
23569 for (unsigned int i = 0; i < nelt; i += 2)
23571 poly_int64 elt0 = d->perm[i];
23572 poly_int64 elt1 = d->perm[i + 1];
23573 poly_int64 newelt;
23574 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23575 return false;
23576 newpermconst.quick_push (newelt.to_constant ());
23578 newpermconst.finalize ();
23580 newd.vmode = new_mode;
23581 newd.vec_flags = VEC_ADVSIMD;
23582 newd.op_mode = newd.vmode;
23583 newd.op_vec_flags = newd.vec_flags;
23584 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23585 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23586 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23587 newd.testing_p = d->testing_p;
23588 newd.one_vector_p = d->one_vector_p;
23590 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23591 return aarch64_expand_vec_perm_const_1 (&newd);
23594 /* Recognize patterns suitable for the UZP instructions. */
23595 static bool
23596 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23598 HOST_WIDE_INT odd;
23599 rtx out, in0, in1, x;
23600 machine_mode vmode = d->vmode;
23602 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23603 return false;
23605 /* Note that these are little-endian tests.
23606 We correct for big-endian later. */
23607 if (!d->perm[0].is_constant (&odd)
23608 || (odd != 0 && odd != 1)
23609 || !d->perm.series_p (0, 1, odd, 2))
23610 return false;
23612 /* Success! */
23613 if (d->testing_p)
23614 return true;
23616 in0 = d->op0;
23617 in1 = d->op1;
23618 /* We don't need a big-endian lane correction for SVE; see the comment
23619 at the head of aarch64-sve.md for details. */
23620 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23622 x = in0, in0 = in1, in1 = x;
23623 odd = !odd;
23625 out = d->target;
23627 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23628 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23629 return true;
23632 /* Recognize patterns suitable for the ZIP instructions. */
23633 static bool
23634 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23636 unsigned int high;
23637 poly_uint64 nelt = d->perm.length ();
23638 rtx out, in0, in1, x;
23639 machine_mode vmode = d->vmode;
23641 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23642 return false;
23644 /* Note that these are little-endian tests.
23645 We correct for big-endian later. */
23646 poly_uint64 first = d->perm[0];
23647 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
23648 || !d->perm.series_p (0, 2, first, 1)
23649 || !d->perm.series_p (1, 2, first + nelt, 1))
23650 return false;
23651 high = maybe_ne (first, 0U);
23653 /* Success! */
23654 if (d->testing_p)
23655 return true;
23657 in0 = d->op0;
23658 in1 = d->op1;
23659 /* We don't need a big-endian lane correction for SVE; see the comment
23660 at the head of aarch64-sve.md for details. */
23661 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23663 x = in0, in0 = in1, in1 = x;
23664 high = !high;
23666 out = d->target;
23668 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23669 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
23670 return true;
23673 /* Recognize patterns for the EXT insn. */
23675 static bool
23676 aarch64_evpc_ext (struct expand_vec_perm_d *d)
23678 HOST_WIDE_INT location;
23679 rtx offset;
23681 /* The first element always refers to the first vector.
23682 Check if the extracted indices are increasing by one. */
23683 if (d->vec_flags == VEC_SVE_PRED
23684 || !d->perm[0].is_constant (&location)
23685 || !d->perm.series_p (0, 1, location, 1))
23686 return false;
23688 /* Success! */
23689 if (d->testing_p)
23690 return true;
23692 /* The case where (location == 0) is a no-op for both big- and little-endian,
23693 and is removed by the mid-end at optimization levels -O1 and higher.
23695 We don't need a big-endian lane correction for SVE; see the comment
23696 at the head of aarch64-sve.md for details. */
23697 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
23699 /* After setup, we want the high elements of the first vector (stored
23700 at the LSB end of the register), and the low elements of the second
23701 vector (stored at the MSB end of the register). So swap. */
23702 std::swap (d->op0, d->op1);
23703 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
23704 to_constant () is safe since this is restricted to Advanced SIMD
23705 vectors. */
23706 location = d->perm.length ().to_constant () - location;
23709 offset = GEN_INT (location);
23710 emit_set_insn (d->target,
23711 gen_rtx_UNSPEC (d->vmode,
23712 gen_rtvec (3, d->op0, d->op1, offset),
23713 UNSPEC_EXT));
23714 return true;
23717 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
23718 within each 64-bit, 32-bit or 16-bit granule. */
23720 static bool
23721 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
23723 HOST_WIDE_INT diff;
23724 unsigned int i, size, unspec;
23725 machine_mode pred_mode;
23727 if (d->vec_flags == VEC_SVE_PRED
23728 || !d->one_vector_p
23729 || !d->perm[0].is_constant (&diff)
23730 || !diff)
23731 return false;
23733 if (d->vec_flags & VEC_SVE_DATA)
23734 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
23735 else
23736 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
23737 if (size == 64)
23739 unspec = UNSPEC_REV64;
23740 pred_mode = VNx2BImode;
23742 else if (size == 32)
23744 unspec = UNSPEC_REV32;
23745 pred_mode = VNx4BImode;
23747 else if (size == 16)
23749 unspec = UNSPEC_REV16;
23750 pred_mode = VNx8BImode;
23752 else
23753 return false;
23755 unsigned int step = diff + 1;
23756 for (i = 0; i < step; ++i)
23757 if (!d->perm.series_p (i, step, diff - i, step))
23758 return false;
23760 /* Success! */
23761 if (d->testing_p)
23762 return true;
23764 if (d->vec_flags & VEC_SVE_DATA)
23766 rtx pred = aarch64_ptrue_reg (pred_mode);
23767 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
23768 d->target, pred, d->op0));
23769 return true;
23771 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
23772 emit_set_insn (d->target, src);
23773 return true;
23776 /* Recognize patterns for the REV insn, which reverses elements within
23777 a full vector. */
23779 static bool
23780 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
23782 poly_uint64 nelt = d->perm.length ();
23784 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
23785 return false;
23787 if (!d->perm.series_p (0, 1, nelt - 1, -1))
23788 return false;
23790 /* Success! */
23791 if (d->testing_p)
23792 return true;
23794 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
23795 emit_set_insn (d->target, src);
23796 return true;
23799 static bool
23800 aarch64_evpc_dup (struct expand_vec_perm_d *d)
23802 rtx out = d->target;
23803 rtx in0;
23804 HOST_WIDE_INT elt;
23805 machine_mode vmode = d->vmode;
23806 rtx lane;
23808 if (d->vec_flags == VEC_SVE_PRED
23809 || d->perm.encoding ().encoded_nelts () != 1
23810 || !d->perm[0].is_constant (&elt))
23811 return false;
23813 if ((d->vec_flags & VEC_SVE_DATA)
23814 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
23815 return false;
23817 /* Success! */
23818 if (d->testing_p)
23819 return true;
23821 /* The generic preparation in aarch64_expand_vec_perm_const_1
23822 swaps the operand order and the permute indices if it finds
23823 d->perm[0] to be in the second operand. Thus, we can always
23824 use d->op0 and need not do any extra arithmetic to get the
23825 correct lane number. */
23826 in0 = d->op0;
23827 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
23829 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
23830 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
23831 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
23832 return true;
23835 static bool
23836 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
23838 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
23839 machine_mode vmode = d->vmode;
23841 /* Make sure that the indices are constant. */
23842 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
23843 for (unsigned int i = 0; i < encoded_nelts; ++i)
23844 if (!d->perm[i].is_constant ())
23845 return false;
23847 if (d->testing_p)
23848 return true;
23850 /* Generic code will try constant permutation twice. Once with the
23851 original mode and again with the elements lowered to QImode.
23852 So wait and don't do the selector expansion ourselves. */
23853 if (vmode != V8QImode && vmode != V16QImode)
23854 return false;
23856 /* to_constant is safe since this routine is specific to Advanced SIMD
23857 vectors. */
23858 unsigned int nelt = d->perm.length ().to_constant ();
23859 for (unsigned int i = 0; i < nelt; ++i)
23860 /* If big-endian and two vectors we end up with a weird mixed-endian
23861 mode on NEON. Reverse the index within each word but not the word
23862 itself. to_constant is safe because we checked is_constant above. */
23863 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
23864 ? d->perm[i].to_constant () ^ (nelt - 1)
23865 : d->perm[i].to_constant ());
23867 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
23868 sel = force_reg (vmode, sel);
23870 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
23871 return true;
23874 /* Try to implement D using an SVE TBL instruction. */
23876 static bool
23877 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
23879 unsigned HOST_WIDE_INT nelt;
23881 /* Permuting two variable-length vectors could overflow the
23882 index range. */
23883 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
23884 return false;
23886 if (d->testing_p)
23887 return true;
23889 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
23890 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
23891 if (d->one_vector_p)
23892 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
23893 else
23894 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
23895 return true;
23898 /* Try to implement D using SVE dup instruction. */
23900 static bool
23901 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
23903 if (BYTES_BIG_ENDIAN
23904 || !d->one_vector_p
23905 || d->vec_flags != VEC_SVE_DATA
23906 || d->op_vec_flags != VEC_ADVSIMD
23907 || d->perm.encoding ().nelts_per_pattern () != 1
23908 || !known_eq (d->perm.encoding ().npatterns (),
23909 GET_MODE_NUNITS (d->op_mode))
23910 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
23911 return false;
23913 int npatterns = d->perm.encoding ().npatterns ();
23914 for (int i = 0; i < npatterns; i++)
23915 if (!known_eq (d->perm[i], i))
23916 return false;
23918 if (d->testing_p)
23919 return true;
23921 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
23922 return true;
23925 /* Try to implement D using SVE SEL instruction. */
23927 static bool
23928 aarch64_evpc_sel (struct expand_vec_perm_d *d)
23930 machine_mode vmode = d->vmode;
23931 int unit_size = GET_MODE_UNIT_SIZE (vmode);
23933 if (d->vec_flags != VEC_SVE_DATA
23934 || unit_size > 8)
23935 return false;
23937 int n_patterns = d->perm.encoding ().npatterns ();
23938 poly_int64 vec_len = d->perm.length ();
23940 for (int i = 0; i < n_patterns; ++i)
23941 if (!known_eq (d->perm[i], i)
23942 && !known_eq (d->perm[i], vec_len + i))
23943 return false;
23945 for (int i = n_patterns; i < n_patterns * 2; i++)
23946 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
23947 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
23948 return false;
23950 if (d->testing_p)
23951 return true;
23953 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
23955 /* Build a predicate that is true when op0 elements should be used. */
23956 rtx_vector_builder builder (pred_mode, n_patterns, 2);
23957 for (int i = 0; i < n_patterns * 2; i++)
23959 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
23960 : CONST0_RTX (BImode);
23961 builder.quick_push (elem);
23964 rtx const_vec = builder.build ();
23965 rtx pred = force_reg (pred_mode, const_vec);
23966 /* TARGET = PRED ? OP0 : OP1. */
23967 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
23968 return true;
23971 /* Recognize patterns suitable for the INS instructions. */
23972 static bool
23973 aarch64_evpc_ins (struct expand_vec_perm_d *d)
23975 machine_mode mode = d->vmode;
23976 unsigned HOST_WIDE_INT nelt;
23978 if (d->vec_flags != VEC_ADVSIMD)
23979 return false;
23981 /* to_constant is safe since this routine is specific to Advanced SIMD
23982 vectors. */
23983 nelt = d->perm.length ().to_constant ();
23984 rtx insv = d->op0;
23986 HOST_WIDE_INT idx = -1;
23988 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
23990 HOST_WIDE_INT elt;
23991 if (!d->perm[i].is_constant (&elt))
23992 return false;
23993 if (elt == (HOST_WIDE_INT) i)
23994 continue;
23995 if (idx != -1)
23997 idx = -1;
23998 break;
24000 idx = i;
24003 if (idx == -1)
24005 insv = d->op1;
24006 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24008 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24009 continue;
24010 if (idx != -1)
24011 return false;
24012 idx = i;
24015 if (idx == -1)
24016 return false;
24019 if (d->testing_p)
24020 return true;
24022 gcc_assert (idx != -1);
24024 unsigned extractindex = d->perm[idx].to_constant ();
24025 rtx extractv = d->op0;
24026 if (extractindex >= nelt)
24028 extractv = d->op1;
24029 extractindex -= nelt;
24031 gcc_assert (extractindex < nelt);
24033 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24034 expand_operand ops[5];
24035 create_output_operand (&ops[0], d->target, mode);
24036 create_input_operand (&ops[1], insv, mode);
24037 create_integer_operand (&ops[2], 1 << idx);
24038 create_input_operand (&ops[3], extractv, mode);
24039 create_integer_operand (&ops[4], extractindex);
24040 expand_insn (icode, 5, ops);
24042 return true;
24045 static bool
24046 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24048 gcc_assert (d->op_mode != E_VOIDmode);
24050 /* The pattern matching functions above are written to look for a small
24051 number to begin the sequence (0, 1, N/2). If we begin with an index
24052 from the second operand, we can swap the operands. */
24053 poly_int64 nelt = d->perm.length ();
24054 if (known_ge (d->perm[0], nelt))
24056 d->perm.rotate_inputs (1);
24057 std::swap (d->op0, d->op1);
24060 if ((d->vec_flags == VEC_ADVSIMD
24061 || d->vec_flags == VEC_SVE_DATA
24062 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24063 || d->vec_flags == VEC_SVE_PRED)
24064 && known_gt (nelt, 1))
24066 if (d->vmode == d->op_mode)
24068 if (aarch64_evpc_rev_local (d))
24069 return true;
24070 else if (aarch64_evpc_rev_global (d))
24071 return true;
24072 else if (aarch64_evpc_ext (d))
24073 return true;
24074 else if (aarch64_evpc_dup (d))
24075 return true;
24076 else if (aarch64_evpc_zip (d))
24077 return true;
24078 else if (aarch64_evpc_uzp (d))
24079 return true;
24080 else if (aarch64_evpc_trn (d))
24081 return true;
24082 else if (aarch64_evpc_sel (d))
24083 return true;
24084 else if (aarch64_evpc_ins (d))
24085 return true;
24086 else if (aarch64_evpc_reencode (d))
24087 return true;
24089 if (d->vec_flags == VEC_SVE_DATA)
24090 return aarch64_evpc_sve_tbl (d);
24091 else if (d->vec_flags == VEC_ADVSIMD)
24092 return aarch64_evpc_tbl (d);
24094 else
24096 if (aarch64_evpc_sve_dup (d))
24097 return true;
24100 return false;
24103 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24105 static bool
24106 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24107 rtx target, rtx op0, rtx op1,
24108 const vec_perm_indices &sel)
24110 struct expand_vec_perm_d d;
24112 /* Check whether the mask can be applied to a single vector. */
24113 if (sel.ninputs () == 1
24114 || (op0 && rtx_equal_p (op0, op1)))
24115 d.one_vector_p = true;
24116 else if (sel.all_from_input_p (0))
24118 d.one_vector_p = true;
24119 op1 = op0;
24121 else if (sel.all_from_input_p (1))
24123 d.one_vector_p = true;
24124 op0 = op1;
24126 else
24127 d.one_vector_p = false;
24129 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24130 sel.nelts_per_input ());
24131 d.vmode = vmode;
24132 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24133 d.op_mode = op_mode;
24134 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
24135 d.target = target;
24136 d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
24137 if (op0 == op1)
24138 d.op1 = d.op0;
24139 else
24140 d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
24141 d.testing_p = !target;
24143 if (!d.testing_p)
24144 return aarch64_expand_vec_perm_const_1 (&d);
24146 rtx_insn *last = get_last_insn ();
24147 bool ret = aarch64_expand_vec_perm_const_1 (&d);
24148 gcc_assert (last == get_last_insn ());
24150 return ret;
24153 /* Generate a byte permute mask for a register of mode MODE,
24154 which has NUNITS units. */
24157 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24159 /* We have to reverse each vector because we dont have
24160 a permuted load that can reverse-load according to ABI rules. */
24161 rtx mask;
24162 rtvec v = rtvec_alloc (16);
24163 unsigned int i, j;
24164 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24166 gcc_assert (BYTES_BIG_ENDIAN);
24167 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24169 for (i = 0; i < nunits; i++)
24170 for (j = 0; j < usize; j++)
24171 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24172 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24173 return force_reg (V16QImode, mask);
24176 /* Expand an SVE integer comparison using the SVE equivalent of:
24178 (set TARGET (CODE OP0 OP1)). */
24180 void
24181 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24183 machine_mode pred_mode = GET_MODE (target);
24184 machine_mode data_mode = GET_MODE (op0);
24185 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24186 op0, op1);
24187 if (!rtx_equal_p (target, res))
24188 emit_move_insn (target, res);
24191 /* Return the UNSPEC_COND_* code for comparison CODE. */
24193 static unsigned int
24194 aarch64_unspec_cond_code (rtx_code code)
24196 switch (code)
24198 case NE:
24199 return UNSPEC_COND_FCMNE;
24200 case EQ:
24201 return UNSPEC_COND_FCMEQ;
24202 case LT:
24203 return UNSPEC_COND_FCMLT;
24204 case GT:
24205 return UNSPEC_COND_FCMGT;
24206 case LE:
24207 return UNSPEC_COND_FCMLE;
24208 case GE:
24209 return UNSPEC_COND_FCMGE;
24210 case UNORDERED:
24211 return UNSPEC_COND_FCMUO;
24212 default:
24213 gcc_unreachable ();
24217 /* Emit:
24219 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24221 where <X> is the operation associated with comparison CODE.
24222 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24224 static void
24225 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24226 bool known_ptrue_p, rtx op0, rtx op1)
24228 rtx flag = gen_int_mode (known_ptrue_p, SImode);
24229 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24230 gen_rtvec (4, pred, flag, op0, op1),
24231 aarch64_unspec_cond_code (code));
24232 emit_set_insn (target, unspec);
24235 /* Emit the SVE equivalent of:
24237 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24238 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24239 (set TARGET (ior:PRED_MODE TMP1 TMP2))
24241 where <Xi> is the operation associated with comparison CODEi.
24242 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24244 static void
24245 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24246 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24248 machine_mode pred_mode = GET_MODE (pred);
24249 rtx tmp1 = gen_reg_rtx (pred_mode);
24250 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24251 rtx tmp2 = gen_reg_rtx (pred_mode);
24252 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24253 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24256 /* Emit the SVE equivalent of:
24258 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24259 (set TARGET (not TMP))
24261 where <X> is the operation associated with comparison CODE.
24262 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24264 static void
24265 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24266 bool known_ptrue_p, rtx op0, rtx op1)
24268 machine_mode pred_mode = GET_MODE (pred);
24269 rtx tmp = gen_reg_rtx (pred_mode);
24270 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24271 aarch64_emit_unop (target, one_cmpl_optab, tmp);
24274 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24276 (set TARGET (CODE OP0 OP1))
24278 If CAN_INVERT_P is true, the caller can also handle inverted results;
24279 return true if the result is in fact inverted. */
24281 bool
24282 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24283 rtx op0, rtx op1, bool can_invert_p)
24285 machine_mode pred_mode = GET_MODE (target);
24286 machine_mode data_mode = GET_MODE (op0);
24288 rtx ptrue = aarch64_ptrue_reg (pred_mode);
24289 switch (code)
24291 case UNORDERED:
24292 /* UNORDERED has no immediate form. */
24293 op1 = force_reg (data_mode, op1);
24294 /* fall through */
24295 case LT:
24296 case LE:
24297 case GT:
24298 case GE:
24299 case EQ:
24300 case NE:
24302 /* There is native support for the comparison. */
24303 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24304 return false;
24307 case LTGT:
24308 /* This is a trapping operation (LT or GT). */
24309 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24310 return false;
24312 case UNEQ:
24313 if (!flag_trapping_math)
24315 /* This would trap for signaling NaNs. */
24316 op1 = force_reg (data_mode, op1);
24317 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24318 ptrue, true, op0, op1);
24319 return false;
24321 /* fall through */
24322 case UNLT:
24323 case UNLE:
24324 case UNGT:
24325 case UNGE:
24326 if (flag_trapping_math)
24328 /* Work out which elements are ordered. */
24329 rtx ordered = gen_reg_rtx (pred_mode);
24330 op1 = force_reg (data_mode, op1);
24331 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24332 ptrue, true, op0, op1);
24334 /* Test the opposite condition for the ordered elements,
24335 then invert the result. */
24336 if (code == UNEQ)
24337 code = NE;
24338 else
24339 code = reverse_condition_maybe_unordered (code);
24340 if (can_invert_p)
24342 aarch64_emit_sve_fp_cond (target, code,
24343 ordered, false, op0, op1);
24344 return true;
24346 aarch64_emit_sve_invert_fp_cond (target, code,
24347 ordered, false, op0, op1);
24348 return false;
24350 break;
24352 case ORDERED:
24353 /* ORDERED has no immediate form. */
24354 op1 = force_reg (data_mode, op1);
24355 break;
24357 default:
24358 gcc_unreachable ();
24361 /* There is native support for the inverse comparison. */
24362 code = reverse_condition_maybe_unordered (code);
24363 if (can_invert_p)
24365 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24366 return true;
24368 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24369 return false;
24372 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24373 of the data being selected and CMP_MODE is the mode of the values being
24374 compared. */
24376 void
24377 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24378 rtx *ops)
24380 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24381 rtx pred = gen_reg_rtx (pred_mode);
24382 if (FLOAT_MODE_P (cmp_mode))
24384 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24385 ops[4], ops[5], true))
24386 std::swap (ops[1], ops[2]);
24388 else
24389 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24391 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24392 ops[1] = force_reg (data_mode, ops[1]);
24393 /* The "false" value can only be zero if the "true" value is a constant. */
24394 if (register_operand (ops[1], data_mode)
24395 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24396 ops[2] = force_reg (data_mode, ops[2]);
24398 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24399 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24402 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24403 true. However due to issues with register allocation it is preferable
24404 to avoid tieing integer scalar and FP scalar modes. Executing integer
24405 operations in general registers is better than treating them as scalar
24406 vector operations. This reduces latency and avoids redundant int<->FP
24407 moves. So tie modes if they are either the same class, or vector modes
24408 with other vector modes, vector structs or any scalar mode. */
24410 static bool
24411 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24413 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24414 != aarch64_advsimd_partial_struct_mode_p (mode2))
24415 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24416 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24417 return false;
24419 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24420 return true;
24422 /* We specifically want to allow elements of "structure" modes to
24423 be tieable to the structure. This more general condition allows
24424 other rarer situations too. The reason we don't extend this to
24425 predicate modes is that there are no predicate structure modes
24426 nor any specific instructions for extracting part of a predicate
24427 register. */
24428 if (aarch64_vector_data_mode_p (mode1)
24429 && aarch64_vector_data_mode_p (mode2))
24430 return true;
24432 /* Also allow any scalar modes with vectors. */
24433 if (aarch64_vector_mode_supported_p (mode1)
24434 || aarch64_vector_mode_supported_p (mode2))
24435 return true;
24437 return false;
24440 /* Return a new RTX holding the result of moving POINTER forward by
24441 AMOUNT bytes. */
24443 static rtx
24444 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24446 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24448 return adjust_automodify_address (pointer, GET_MODE (pointer),
24449 next, amount);
24452 /* Return a new RTX holding the result of moving POINTER forward by the
24453 size of the mode it points to. */
24455 static rtx
24456 aarch64_progress_pointer (rtx pointer)
24458 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24461 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24462 MODE bytes. */
24464 static void
24465 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24466 machine_mode mode)
24468 /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
24469 address copies using V4SImode so that we can use Q registers. */
24470 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24472 mode = V4SImode;
24473 rtx reg1 = gen_reg_rtx (mode);
24474 rtx reg2 = gen_reg_rtx (mode);
24475 /* "Cast" the pointers to the correct mode. */
24476 *src = adjust_address (*src, mode, 0);
24477 *dst = adjust_address (*dst, mode, 0);
24478 /* Emit the memcpy. */
24479 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24480 aarch64_progress_pointer (*src)));
24481 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24482 aarch64_progress_pointer (*dst), reg2));
24483 /* Move the pointers forward. */
24484 *src = aarch64_move_pointer (*src, 32);
24485 *dst = aarch64_move_pointer (*dst, 32);
24486 return;
24489 rtx reg = gen_reg_rtx (mode);
24491 /* "Cast" the pointers to the correct mode. */
24492 *src = adjust_address (*src, mode, 0);
24493 *dst = adjust_address (*dst, mode, 0);
24494 /* Emit the memcpy. */
24495 emit_move_insn (reg, *src);
24496 emit_move_insn (*dst, reg);
24497 /* Move the pointers forward. */
24498 *src = aarch64_progress_pointer (*src);
24499 *dst = aarch64_progress_pointer (*dst);
24502 /* Expand a cpymem using the MOPS extension. OPERANDS are taken
24503 from the cpymem pattern. Return true iff we succeeded. */
24504 static bool
24505 aarch64_expand_cpymem_mops (rtx *operands)
24507 if (!TARGET_MOPS)
24508 return false;
24510 /* All three registers are changed by the instruction, so each one
24511 must be a fresh pseudo. */
24512 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24513 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24514 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24515 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24516 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24517 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24519 return true;
24522 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24523 we succeed, otherwise return false, indicating that a libcall to
24524 memcpy should be emitted. */
24526 bool
24527 aarch64_expand_cpymem (rtx *operands)
24529 int mode_bits;
24530 rtx dst = operands[0];
24531 rtx src = operands[1];
24532 rtx base;
24533 machine_mode cur_mode = BLKmode;
24535 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24536 if (!CONST_INT_P (operands[2]))
24537 return aarch64_expand_cpymem_mops (operands);
24539 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24541 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24542 unsigned HOST_WIDE_INT max_copy_size
24543 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24545 bool size_p = optimize_function_for_size_p (cfun);
24547 /* Large constant-sized cpymem should go through MOPS when possible.
24548 It should be a win even for size optimization in the general case.
24549 For speed optimization the choice between MOPS and the SIMD sequence
24550 depends on the size of the copy, rather than number of instructions,
24551 alignment etc. */
24552 if (size > max_copy_size)
24553 return aarch64_expand_cpymem_mops (operands);
24555 int copy_bits = 256;
24557 /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
24558 support or slow 256-bit LDP/STP fall back to 128-bit chunks. */
24559 if (size <= 24
24560 || !TARGET_SIMD
24561 || (aarch64_tune_params.extra_tuning_flags
24562 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24563 copy_bits = 128;
24565 /* Emit an inline load+store sequence and count the number of operations
24566 involved. We use a simple count of just the loads and stores emitted
24567 rather than rtx_insn count as all the pointer adjustments and reg copying
24568 in this function will get optimized away later in the pipeline. */
24569 start_sequence ();
24570 unsigned nops = 0;
24572 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24573 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24575 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24576 src = adjust_automodify_address (src, VOIDmode, base, 0);
24578 /* Convert size to bits to make the rest of the code simpler. */
24579 int n = size * BITS_PER_UNIT;
24581 while (n > 0)
24583 /* Find the largest mode in which to do the copy in without over reading
24584 or writing. */
24585 opt_scalar_int_mode mode_iter;
24586 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24587 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24588 cur_mode = mode_iter.require ();
24590 gcc_assert (cur_mode != BLKmode);
24592 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24594 /* Prefer Q-register accesses for the last bytes. */
24595 if (mode_bits == 128 && copy_bits == 256)
24596 cur_mode = V4SImode;
24598 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24599 /* A single block copy is 1 load + 1 store. */
24600 nops += 2;
24601 n -= mode_bits;
24603 /* Emit trailing copies using overlapping unaligned accesses
24604 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
24605 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24607 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
24608 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24609 gcc_assert (n_bits <= mode_bits);
24610 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
24611 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24612 n = n_bits;
24615 rtx_insn *seq = get_insns ();
24616 end_sequence ();
24617 /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
24618 the constant size into a register. */
24619 unsigned mops_cost = 3 + 1;
24621 /* If MOPS is available at this point we don't consider the libcall as it's
24622 not a win even on code size. At this point only consider MOPS if
24623 optimizing for size. For speed optimizations we will have chosen between
24624 the two based on copy size already. */
24625 if (TARGET_MOPS)
24627 if (size_p && mops_cost < nops)
24628 return aarch64_expand_cpymem_mops (operands);
24629 emit_insn (seq);
24630 return true;
24633 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
24634 arguments + 1 for the call. When MOPS is not available and we're
24635 optimizing for size a libcall may be preferable. */
24636 unsigned libcall_cost = 4;
24637 if (size_p && libcall_cost < nops)
24638 return false;
24640 emit_insn (seq);
24641 return true;
24644 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
24645 SRC is a register we have created with the duplicated value to be set. */
24646 static void
24647 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
24648 machine_mode mode)
24650 /* If we are copying 128bits or 256bits, we can do that straight from
24651 the SIMD register we prepared. */
24652 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24654 mode = GET_MODE (src);
24655 /* "Cast" the *dst to the correct mode. */
24656 *dst = adjust_address (*dst, mode, 0);
24657 /* Emit the memset. */
24658 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
24659 aarch64_progress_pointer (*dst), src));
24661 /* Move the pointers forward. */
24662 *dst = aarch64_move_pointer (*dst, 32);
24663 return;
24665 if (known_eq (GET_MODE_BITSIZE (mode), 128))
24667 /* "Cast" the *dst to the correct mode. */
24668 *dst = adjust_address (*dst, GET_MODE (src), 0);
24669 /* Emit the memset. */
24670 emit_move_insn (*dst, src);
24671 /* Move the pointers forward. */
24672 *dst = aarch64_move_pointer (*dst, 16);
24673 return;
24675 /* For copying less, we have to extract the right amount from src. */
24676 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
24678 /* "Cast" the *dst to the correct mode. */
24679 *dst = adjust_address (*dst, mode, 0);
24680 /* Emit the memset. */
24681 emit_move_insn (*dst, reg);
24682 /* Move the pointer forward. */
24683 *dst = aarch64_progress_pointer (*dst);
24686 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
24687 as for the setmem pattern. Return true iff we succeed. */
24688 static bool
24689 aarch64_expand_setmem_mops (rtx *operands)
24691 if (!TARGET_MOPS)
24692 return false;
24694 /* The first two registers are changed by the instruction, so both
24695 of them must be a fresh pseudo. */
24696 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24697 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24698 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
24699 rtx val = operands[2];
24700 if (val != CONST0_RTX (QImode))
24701 val = force_reg (QImode, val);
24702 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
24703 return true;
24706 /* Expand setmem, as if from a __builtin_memset. Return true if
24707 we succeed, otherwise return false. */
24709 bool
24710 aarch64_expand_setmem (rtx *operands)
24712 int n, mode_bits;
24713 unsigned HOST_WIDE_INT len;
24714 rtx dst = operands[0];
24715 rtx val = operands[2], src;
24716 rtx base;
24717 machine_mode cur_mode = BLKmode, next_mode;
24719 /* If we don't have SIMD registers or the size is variable use the MOPS
24720 inlined sequence if possible. */
24721 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
24722 return aarch64_expand_setmem_mops (operands);
24724 bool size_p = optimize_function_for_size_p (cfun);
24726 /* Default the maximum to 256-bytes when considering only libcall vs
24727 SIMD broadcast sequence. */
24728 unsigned max_set_size = 256;
24730 len = INTVAL (operands[1]);
24731 if (len > max_set_size && !TARGET_MOPS)
24732 return false;
24734 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
24735 /* The MOPS sequence takes:
24736 3 instructions for the memory storing
24737 + 1 to move the constant size into a reg
24738 + 1 if VAL is a non-zero constant to move into a reg
24739 (zero constants can use XZR directly). */
24740 unsigned mops_cost = 3 + 1 + cst_val;
24741 /* A libcall to memset in the worst case takes 3 instructions to prepare
24742 the arguments + 1 for the call. */
24743 unsigned libcall_cost = 4;
24745 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
24746 when available. */
24747 if (TARGET_MOPS
24748 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
24749 return aarch64_expand_setmem_mops (operands);
24751 /* Attempt a sequence with a vector broadcast followed by stores.
24752 Count the number of operations involved to see if it's worth it
24753 against the alternatives. A simple counter simd_ops on the
24754 algorithmically-relevant operations is used rather than an rtx_insn count
24755 as all the pointer adjusmtents and mode reinterprets will be optimized
24756 away later. */
24757 start_sequence ();
24758 unsigned simd_ops = 0;
24760 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24761 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24763 /* Prepare the val using a DUP/MOVI v0.16B, val. */
24764 src = expand_vector_broadcast (V16QImode, val);
24765 src = force_reg (V16QImode, src);
24766 simd_ops++;
24767 /* Convert len to bits to make the rest of the code simpler. */
24768 n = len * BITS_PER_UNIT;
24770 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
24771 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
24772 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
24773 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
24774 ? GET_MODE_BITSIZE (TImode) : 256;
24776 while (n > 0)
24778 /* Find the largest mode in which to do the copy without
24779 over writing. */
24780 opt_scalar_int_mode mode_iter;
24781 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24782 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
24783 cur_mode = mode_iter.require ();
24785 gcc_assert (cur_mode != BLKmode);
24787 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24788 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
24789 simd_ops++;
24790 n -= mode_bits;
24792 /* Do certain trailing copies as overlapping if it's going to be
24793 cheaper. i.e. less instructions to do so. For instance doing a 15
24794 byte copy it's more efficient to do two overlapping 8 byte copies than
24795 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
24796 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
24798 next_mode = smallest_mode_for_size (n, MODE_INT);
24799 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24800 gcc_assert (n_bits <= mode_bits);
24801 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24802 n = n_bits;
24805 rtx_insn *seq = get_insns ();
24806 end_sequence ();
24808 if (size_p)
24810 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
24811 call to memset or the MOPS expansion. */
24812 if (TARGET_MOPS
24813 && mops_cost <= libcall_cost
24814 && mops_cost <= simd_ops)
24815 return aarch64_expand_setmem_mops (operands);
24816 /* If MOPS is not available or not shorter pick a libcall if the SIMD
24817 sequence is too long. */
24818 else if (libcall_cost < simd_ops)
24819 return false;
24820 emit_insn (seq);
24821 return true;
24824 /* At this point the SIMD broadcast sequence is the best choice when
24825 optimizing for speed. */
24826 emit_insn (seq);
24827 return true;
24831 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
24832 SImode stores. Handle the case when the constant has identical
24833 bottom and top halves. This is beneficial when the two stores can be
24834 merged into an STP and we avoid synthesising potentially expensive
24835 immediates twice. Return true if such a split is possible. */
24837 bool
24838 aarch64_split_dimode_const_store (rtx dst, rtx src)
24840 rtx lo = gen_lowpart (SImode, src);
24841 rtx hi = gen_highpart_mode (SImode, DImode, src);
24843 bool size_p = optimize_function_for_size_p (cfun);
24845 if (!rtx_equal_p (lo, hi))
24846 return false;
24848 unsigned int orig_cost
24849 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
24850 unsigned int lo_cost
24851 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
24853 /* We want to transform:
24854 MOV x1, 49370
24855 MOVK x1, 0x140, lsl 16
24856 MOVK x1, 0xc0da, lsl 32
24857 MOVK x1, 0x140, lsl 48
24858 STR x1, [x0]
24859 into:
24860 MOV w1, 49370
24861 MOVK w1, 0x140, lsl 16
24862 STP w1, w1, [x0]
24863 So we want to perform this only when we save two instructions
24864 or more. When optimizing for size, however, accept any code size
24865 savings we can. */
24866 if (size_p && orig_cost <= lo_cost)
24867 return false;
24869 if (!size_p
24870 && (orig_cost <= lo_cost + 1))
24871 return false;
24873 rtx mem_lo = adjust_address (dst, SImode, 0);
24874 if (!aarch64_mem_pair_operand (mem_lo, SImode))
24875 return false;
24877 rtx tmp_reg = gen_reg_rtx (SImode);
24878 aarch64_expand_mov_immediate (tmp_reg, lo);
24879 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
24880 /* Don't emit an explicit store pair as this may not be always profitable.
24881 Let the sched-fusion logic decide whether to merge them. */
24882 emit_move_insn (mem_lo, tmp_reg);
24883 emit_move_insn (mem_hi, tmp_reg);
24885 return true;
24888 /* Generate RTL for a conditional branch with rtx comparison CODE in
24889 mode CC_MODE. The destination of the unlikely conditional branch
24890 is LABEL_REF. */
24892 void
24893 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
24894 rtx label_ref)
24896 rtx x;
24897 x = gen_rtx_fmt_ee (code, VOIDmode,
24898 gen_rtx_REG (cc_mode, CC_REGNUM),
24899 const0_rtx);
24901 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24902 gen_rtx_LABEL_REF (VOIDmode, label_ref),
24903 pc_rtx);
24904 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24907 /* Generate DImode scratch registers for 128-bit (TImode) addition.
24909 OP1 represents the TImode destination operand 1
24910 OP2 represents the TImode destination operand 2
24911 LOW_DEST represents the low half (DImode) of TImode operand 0
24912 LOW_IN1 represents the low half (DImode) of TImode operand 1
24913 LOW_IN2 represents the low half (DImode) of TImode operand 2
24914 HIGH_DEST represents the high half (DImode) of TImode operand 0
24915 HIGH_IN1 represents the high half (DImode) of TImode operand 1
24916 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
24918 void
24919 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
24920 rtx *low_in1, rtx *low_in2,
24921 rtx *high_dest, rtx *high_in1,
24922 rtx *high_in2)
24924 *low_dest = gen_reg_rtx (DImode);
24925 *low_in1 = gen_lowpart (DImode, op1);
24926 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
24927 subreg_lowpart_offset (DImode, TImode));
24928 *high_dest = gen_reg_rtx (DImode);
24929 *high_in1 = gen_highpart (DImode, op1);
24930 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
24931 subreg_highpart_offset (DImode, TImode));
24934 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
24936 This function differs from 'arch64_addti_scratch_regs' in that
24937 OP1 can be an immediate constant (zero). We must call
24938 subreg_highpart_offset with DImode and TImode arguments, otherwise
24939 VOIDmode will be used for the const_int which generates an internal
24940 error from subreg_size_highpart_offset which does not expect a size of zero.
24942 OP1 represents the TImode destination operand 1
24943 OP2 represents the TImode destination operand 2
24944 LOW_DEST represents the low half (DImode) of TImode operand 0
24945 LOW_IN1 represents the low half (DImode) of TImode operand 1
24946 LOW_IN2 represents the low half (DImode) of TImode operand 2
24947 HIGH_DEST represents the high half (DImode) of TImode operand 0
24948 HIGH_IN1 represents the high half (DImode) of TImode operand 1
24949 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
24952 void
24953 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
24954 rtx *low_in1, rtx *low_in2,
24955 rtx *high_dest, rtx *high_in1,
24956 rtx *high_in2)
24958 *low_dest = gen_reg_rtx (DImode);
24959 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
24960 subreg_lowpart_offset (DImode, TImode));
24962 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
24963 subreg_lowpart_offset (DImode, TImode));
24964 *high_dest = gen_reg_rtx (DImode);
24966 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
24967 subreg_highpart_offset (DImode, TImode));
24968 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
24969 subreg_highpart_offset (DImode, TImode));
24972 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
24974 OP0 represents the TImode destination operand 0
24975 LOW_DEST represents the low half (DImode) of TImode operand 0
24976 LOW_IN1 represents the low half (DImode) of TImode operand 1
24977 LOW_IN2 represents the low half (DImode) of TImode operand 2
24978 HIGH_DEST represents the high half (DImode) of TImode operand 0
24979 HIGH_IN1 represents the high half (DImode) of TImode operand 1
24980 HIGH_IN2 represents the high half (DImode) of TImode operand 2
24981 UNSIGNED_P is true if the operation is being performed on unsigned
24982 values. */
24983 void
24984 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
24985 rtx low_in2, rtx high_dest, rtx high_in1,
24986 rtx high_in2, bool unsigned_p)
24988 if (low_in2 == const0_rtx)
24990 low_dest = low_in1;
24991 high_in2 = force_reg (DImode, high_in2);
24992 if (unsigned_p)
24993 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
24994 else
24995 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
24997 else
24999 if (aarch64_plus_immediate (low_in2, DImode))
25000 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25001 GEN_INT (-UINTVAL (low_in2))));
25002 else
25004 low_in2 = force_reg (DImode, low_in2);
25005 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25007 high_in2 = force_reg (DImode, high_in2);
25009 if (unsigned_p)
25010 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25011 else
25012 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25015 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25016 emit_move_insn (gen_highpart (DImode, op0), high_dest);
25020 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
25022 static unsigned HOST_WIDE_INT
25023 aarch64_asan_shadow_offset (void)
25025 if (TARGET_ILP32)
25026 return (HOST_WIDE_INT_1 << 29);
25027 else
25028 return (HOST_WIDE_INT_1 << 36);
25031 static rtx
25032 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25033 int code, tree treeop0, tree treeop1)
25035 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25036 rtx op0, op1;
25037 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25038 insn_code icode;
25039 struct expand_operand ops[4];
25041 start_sequence ();
25042 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25044 op_mode = GET_MODE (op0);
25045 if (op_mode == VOIDmode)
25046 op_mode = GET_MODE (op1);
25048 switch (op_mode)
25050 case E_QImode:
25051 case E_HImode:
25052 case E_SImode:
25053 cmp_mode = SImode;
25054 icode = CODE_FOR_cmpsi;
25055 break;
25057 case E_DImode:
25058 cmp_mode = DImode;
25059 icode = CODE_FOR_cmpdi;
25060 break;
25062 case E_SFmode:
25063 cmp_mode = SFmode;
25064 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25065 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25066 break;
25068 case E_DFmode:
25069 cmp_mode = DFmode;
25070 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25071 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25072 break;
25074 default:
25075 end_sequence ();
25076 return NULL_RTX;
25079 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25080 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25081 if (!op0 || !op1)
25083 end_sequence ();
25084 return NULL_RTX;
25086 *prep_seq = get_insns ();
25087 end_sequence ();
25089 create_fixed_operand (&ops[0], op0);
25090 create_fixed_operand (&ops[1], op1);
25092 start_sequence ();
25093 if (!maybe_expand_insn (icode, 2, ops))
25095 end_sequence ();
25096 return NULL_RTX;
25098 *gen_seq = get_insns ();
25099 end_sequence ();
25101 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25102 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
25105 static rtx
25106 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25107 int cmp_code, tree treeop0, tree treeop1, int bit_code)
25109 rtx op0, op1, target;
25110 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25111 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25112 insn_code icode;
25113 struct expand_operand ops[6];
25114 int aarch64_cond;
25116 push_to_sequence (*prep_seq);
25117 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25119 op_mode = GET_MODE (op0);
25120 if (op_mode == VOIDmode)
25121 op_mode = GET_MODE (op1);
25123 switch (op_mode)
25125 case E_QImode:
25126 case E_HImode:
25127 case E_SImode:
25128 cmp_mode = SImode;
25129 break;
25131 case E_DImode:
25132 cmp_mode = DImode;
25133 break;
25135 case E_SFmode:
25136 cmp_mode = SFmode;
25137 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25138 break;
25140 case E_DFmode:
25141 cmp_mode = DFmode;
25142 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25143 break;
25145 default:
25146 end_sequence ();
25147 return NULL_RTX;
25150 icode = code_for_ccmp (cc_mode, cmp_mode);
25152 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25153 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25154 if (!op0 || !op1)
25156 end_sequence ();
25157 return NULL_RTX;
25159 *prep_seq = get_insns ();
25160 end_sequence ();
25162 target = gen_rtx_REG (cc_mode, CC_REGNUM);
25163 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25165 if (bit_code != AND)
25167 /* Treat the ccmp patterns as canonical and use them where possible,
25168 but fall back to ccmp_rev patterns if there's no other option. */
25169 rtx_code prev_code = GET_CODE (prev);
25170 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25171 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25172 && !(prev_code == EQ
25173 || prev_code == NE
25174 || prev_code == ORDERED
25175 || prev_code == UNORDERED))
25176 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25177 else
25179 rtx_code code = reverse_condition (prev_code);
25180 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25182 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25185 create_fixed_operand (&ops[0], XEXP (prev, 0));
25186 create_fixed_operand (&ops[1], target);
25187 create_fixed_operand (&ops[2], op0);
25188 create_fixed_operand (&ops[3], op1);
25189 create_fixed_operand (&ops[4], prev);
25190 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25192 push_to_sequence (*gen_seq);
25193 if (!maybe_expand_insn (icode, 6, ops))
25195 end_sequence ();
25196 return NULL_RTX;
25199 *gen_seq = get_insns ();
25200 end_sequence ();
25202 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
25205 #undef TARGET_GEN_CCMP_FIRST
25206 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25208 #undef TARGET_GEN_CCMP_NEXT
25209 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25211 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25212 instruction fusion of some sort. */
25214 static bool
25215 aarch64_macro_fusion_p (void)
25217 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25221 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25222 should be kept together during scheduling. */
25224 static bool
25225 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25227 rtx set_dest;
25228 rtx prev_set = single_set (prev);
25229 rtx curr_set = single_set (curr);
25230 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
25231 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25233 if (!aarch64_macro_fusion_p ())
25234 return false;
25236 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25238 /* We are trying to match:
25239 prev (mov) == (set (reg r0) (const_int imm16))
25240 curr (movk) == (set (zero_extract (reg r0)
25241 (const_int 16)
25242 (const_int 16))
25243 (const_int imm16_1)) */
25245 set_dest = SET_DEST (curr_set);
25247 if (GET_CODE (set_dest) == ZERO_EXTRACT
25248 && CONST_INT_P (SET_SRC (curr_set))
25249 && CONST_INT_P (SET_SRC (prev_set))
25250 && CONST_INT_P (XEXP (set_dest, 2))
25251 && INTVAL (XEXP (set_dest, 2)) == 16
25252 && REG_P (XEXP (set_dest, 0))
25253 && REG_P (SET_DEST (prev_set))
25254 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25256 return true;
25260 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25263 /* We're trying to match:
25264 prev (adrp) == (set (reg r1)
25265 (high (symbol_ref ("SYM"))))
25266 curr (add) == (set (reg r0)
25267 (lo_sum (reg r1)
25268 (symbol_ref ("SYM"))))
25269 Note that r0 need not necessarily be the same as r1, especially
25270 during pre-regalloc scheduling. */
25272 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25273 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25275 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25276 && REG_P (XEXP (SET_SRC (curr_set), 0))
25277 && REGNO (XEXP (SET_SRC (curr_set), 0))
25278 == REGNO (SET_DEST (prev_set))
25279 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25280 XEXP (SET_SRC (curr_set), 1)))
25281 return true;
25285 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25288 /* We're trying to match:
25289 prev (movk) == (set (zero_extract (reg r0)
25290 (const_int 16)
25291 (const_int 32))
25292 (const_int imm16_1))
25293 curr (movk) == (set (zero_extract (reg r0)
25294 (const_int 16)
25295 (const_int 48))
25296 (const_int imm16_2)) */
25298 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25299 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25300 && REG_P (XEXP (SET_DEST (prev_set), 0))
25301 && REG_P (XEXP (SET_DEST (curr_set), 0))
25302 && REGNO (XEXP (SET_DEST (prev_set), 0))
25303 == REGNO (XEXP (SET_DEST (curr_set), 0))
25304 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25305 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25306 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25307 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25308 && CONST_INT_P (SET_SRC (prev_set))
25309 && CONST_INT_P (SET_SRC (curr_set)))
25310 return true;
25313 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25315 /* We're trying to match:
25316 prev (adrp) == (set (reg r0)
25317 (high (symbol_ref ("SYM"))))
25318 curr (ldr) == (set (reg r1)
25319 (mem (lo_sum (reg r0)
25320 (symbol_ref ("SYM")))))
25322 curr (ldr) == (set (reg r1)
25323 (zero_extend (mem
25324 (lo_sum (reg r0)
25325 (symbol_ref ("SYM")))))) */
25326 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25327 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25329 rtx curr_src = SET_SRC (curr_set);
25331 if (GET_CODE (curr_src) == ZERO_EXTEND)
25332 curr_src = XEXP (curr_src, 0);
25334 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25335 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25336 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25337 == REGNO (SET_DEST (prev_set))
25338 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25339 XEXP (SET_SRC (prev_set), 0)))
25340 return true;
25344 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
25345 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25346 && prev_set && curr_set && any_condjump_p (curr)
25347 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25348 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25349 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25350 return true;
25352 /* Fuse flag-setting ALU instructions and conditional branch. */
25353 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25354 && any_condjump_p (curr))
25356 unsigned int condreg1, condreg2;
25357 rtx cc_reg_1;
25358 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25359 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25361 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25362 && prev
25363 && modified_in_p (cc_reg_1, prev))
25365 enum attr_type prev_type = get_attr_type (prev);
25367 /* FIXME: this misses some which is considered simple arthematic
25368 instructions for ThunderX. Simple shifts are missed here. */
25369 if (prev_type == TYPE_ALUS_SREG
25370 || prev_type == TYPE_ALUS_IMM
25371 || prev_type == TYPE_LOGICS_REG
25372 || prev_type == TYPE_LOGICS_IMM)
25373 return true;
25377 /* Fuse ALU instructions and CBZ/CBNZ. */
25378 if (prev_set
25379 && curr_set
25380 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25381 && any_condjump_p (curr))
25383 /* We're trying to match:
25384 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25385 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25386 (const_int 0))
25387 (label_ref ("SYM"))
25388 (pc)) */
25389 if (SET_DEST (curr_set) == (pc_rtx)
25390 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25391 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25392 && REG_P (SET_DEST (prev_set))
25393 && REGNO (SET_DEST (prev_set))
25394 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25396 /* Fuse ALU operations followed by conditional branch instruction. */
25397 switch (get_attr_type (prev))
25399 case TYPE_ALU_IMM:
25400 case TYPE_ALU_SREG:
25401 case TYPE_ADC_REG:
25402 case TYPE_ADC_IMM:
25403 case TYPE_ADCS_REG:
25404 case TYPE_ADCS_IMM:
25405 case TYPE_LOGIC_REG:
25406 case TYPE_LOGIC_IMM:
25407 case TYPE_CSEL:
25408 case TYPE_ADR:
25409 case TYPE_MOV_IMM:
25410 case TYPE_SHIFT_REG:
25411 case TYPE_SHIFT_IMM:
25412 case TYPE_BFM:
25413 case TYPE_RBIT:
25414 case TYPE_REV:
25415 case TYPE_EXTEND:
25416 return true;
25418 default:;
25423 return false;
25426 /* Return true iff the instruction fusion described by OP is enabled. */
25428 bool
25429 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25431 return (aarch64_tune_params.fusible_ops & op) != 0;
25434 /* If MEM is in the form of [base+offset], extract the two parts
25435 of address and set to BASE and OFFSET, otherwise return false
25436 after clearing BASE and OFFSET. */
25438 bool
25439 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25441 rtx addr;
25443 gcc_assert (MEM_P (mem));
25445 addr = XEXP (mem, 0);
25447 if (REG_P (addr))
25449 *base = addr;
25450 *offset = const0_rtx;
25451 return true;
25454 if (GET_CODE (addr) == PLUS
25455 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25457 *base = XEXP (addr, 0);
25458 *offset = XEXP (addr, 1);
25459 return true;
25462 *base = NULL_RTX;
25463 *offset = NULL_RTX;
25465 return false;
25468 /* Types for scheduling fusion. */
25469 enum sched_fusion_type
25471 SCHED_FUSION_NONE = 0,
25472 SCHED_FUSION_LD_SIGN_EXTEND,
25473 SCHED_FUSION_LD_ZERO_EXTEND,
25474 SCHED_FUSION_LD,
25475 SCHED_FUSION_ST,
25476 SCHED_FUSION_NUM
25479 /* If INSN is a load or store of address in the form of [base+offset],
25480 extract the two parts and set to BASE and OFFSET. Return scheduling
25481 fusion type this INSN is. */
25483 static enum sched_fusion_type
25484 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25486 rtx x, dest, src;
25487 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25489 gcc_assert (INSN_P (insn));
25490 x = PATTERN (insn);
25491 if (GET_CODE (x) != SET)
25492 return SCHED_FUSION_NONE;
25494 src = SET_SRC (x);
25495 dest = SET_DEST (x);
25497 machine_mode dest_mode = GET_MODE (dest);
25499 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25500 return SCHED_FUSION_NONE;
25502 if (GET_CODE (src) == SIGN_EXTEND)
25504 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25505 src = XEXP (src, 0);
25506 if (!MEM_P (src) || GET_MODE (src) != SImode)
25507 return SCHED_FUSION_NONE;
25509 else if (GET_CODE (src) == ZERO_EXTEND)
25511 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25512 src = XEXP (src, 0);
25513 if (!MEM_P (src) || GET_MODE (src) != SImode)
25514 return SCHED_FUSION_NONE;
25517 if (MEM_P (src) && REG_P (dest))
25518 extract_base_offset_in_addr (src, base, offset);
25519 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25521 fusion = SCHED_FUSION_ST;
25522 extract_base_offset_in_addr (dest, base, offset);
25524 else
25525 return SCHED_FUSION_NONE;
25527 if (*base == NULL_RTX || *offset == NULL_RTX)
25528 fusion = SCHED_FUSION_NONE;
25530 return fusion;
25533 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25535 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
25536 and PRI are only calculated for these instructions. For other instruction,
25537 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
25538 type instruction fusion can be added by returning different priorities.
25540 It's important that irrelevant instructions get the largest FUSION_PRI. */
25542 static void
25543 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25544 int *fusion_pri, int *pri)
25546 int tmp, off_val;
25547 rtx base, offset;
25548 enum sched_fusion_type fusion;
25550 gcc_assert (INSN_P (insn));
25552 tmp = max_pri - 1;
25553 fusion = fusion_load_store (insn, &base, &offset);
25554 if (fusion == SCHED_FUSION_NONE)
25556 *pri = tmp;
25557 *fusion_pri = tmp;
25558 return;
25561 /* Set FUSION_PRI according to fusion type and base register. */
25562 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25564 /* Calculate PRI. */
25565 tmp /= 2;
25567 /* INSN with smaller offset goes first. */
25568 off_val = (int)(INTVAL (offset));
25569 if (off_val >= 0)
25570 tmp -= (off_val & 0xfffff);
25571 else
25572 tmp += ((- off_val) & 0xfffff);
25574 *pri = tmp;
25575 return;
25578 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25579 Adjust priority of sha1h instructions so they are scheduled before
25580 other SHA1 instructions. */
25582 static int
25583 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
25585 rtx x = PATTERN (insn);
25587 if (GET_CODE (x) == SET)
25589 x = SET_SRC (x);
25591 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
25592 return priority + 10;
25595 return priority;
25598 /* If REVERSED is null, return true if memory reference *MEM2 comes
25599 immediately after memory reference *MEM1. Do not change the references
25600 in this case.
25602 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
25603 if they are, try to make them use constant offsets from the same base
25604 register. Return true on success. When returning true, set *REVERSED
25605 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
25606 static bool
25607 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
25609 if (reversed)
25610 *reversed = false;
25612 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
25613 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
25614 return false;
25616 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
25617 return false;
25619 auto size1 = MEM_SIZE (*mem1);
25620 auto size2 = MEM_SIZE (*mem2);
25622 rtx base1, base2, offset1, offset2;
25623 extract_base_offset_in_addr (*mem1, &base1, &offset1);
25624 extract_base_offset_in_addr (*mem2, &base2, &offset2);
25626 /* Make sure at least one memory is in base+offset form. */
25627 if (!(base1 && offset1) && !(base2 && offset2))
25628 return false;
25630 /* If both mems already use the same base register, just check the
25631 offsets. */
25632 if (base1 && base2 && rtx_equal_p (base1, base2))
25634 if (!offset1 || !offset2)
25635 return false;
25637 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
25638 return true;
25640 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
25642 *reversed = true;
25643 return true;
25646 return false;
25649 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
25650 guarantee that the values are consecutive. */
25651 if (MEM_EXPR (*mem1)
25652 && MEM_EXPR (*mem2)
25653 && MEM_OFFSET_KNOWN_P (*mem1)
25654 && MEM_OFFSET_KNOWN_P (*mem2))
25656 poly_int64 expr_offset1;
25657 poly_int64 expr_offset2;
25658 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
25659 &expr_offset1);
25660 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
25661 &expr_offset2);
25662 if (!expr_base1
25663 || !expr_base2
25664 || !DECL_P (expr_base1)
25665 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
25666 return false;
25668 expr_offset1 += MEM_OFFSET (*mem1);
25669 expr_offset2 += MEM_OFFSET (*mem2);
25671 if (known_eq (expr_offset1 + size1, expr_offset2))
25673 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
25674 *reversed = true;
25675 else
25676 return false;
25678 if (reversed)
25680 if (base2)
25682 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
25683 expr_offset1 - expr_offset2);
25684 *mem1 = replace_equiv_address_nv (*mem1, addr1);
25686 else
25688 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
25689 expr_offset2 - expr_offset1);
25690 *mem2 = replace_equiv_address_nv (*mem2, addr2);
25693 return true;
25696 return false;
25699 /* Return true if MEM1 and MEM2 can be combined into a single access
25700 of mode MODE, with the combined access having the same address as MEM1. */
25702 bool
25703 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
25705 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
25706 return false;
25707 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
25710 /* Given OPERANDS of consecutive load/store, check if we can merge
25711 them into ldp/stp. LOAD is true if they are load instructions.
25712 MODE is the mode of memory operands. */
25714 bool
25715 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
25716 machine_mode mode)
25718 enum reg_class rclass_1, rclass_2;
25719 rtx mem_1, mem_2, reg_1, reg_2;
25721 if (load)
25723 mem_1 = operands[1];
25724 mem_2 = operands[3];
25725 reg_1 = operands[0];
25726 reg_2 = operands[2];
25727 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
25728 if (REGNO (reg_1) == REGNO (reg_2))
25729 return false;
25730 if (reg_overlap_mentioned_p (reg_1, mem_2))
25731 return false;
25733 else
25735 mem_1 = operands[0];
25736 mem_2 = operands[2];
25737 reg_1 = operands[1];
25738 reg_2 = operands[3];
25741 /* The mems cannot be volatile. */
25742 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
25743 return false;
25745 /* If we have SImode and slow unaligned ldp,
25746 check the alignment to be at least 8 byte. */
25747 if (mode == SImode
25748 && (aarch64_tune_params.extra_tuning_flags
25749 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
25750 && !optimize_size
25751 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
25752 return false;
25754 /* Check if the addresses are in the form of [base+offset]. */
25755 bool reversed = false;
25756 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
25757 return false;
25759 /* The operands must be of the same size. */
25760 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
25761 GET_MODE_SIZE (GET_MODE (mem_2))));
25763 /* One of the memory accesses must be a mempair operand.
25764 If it is not the first one, they need to be swapped by the
25765 peephole. */
25766 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
25767 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
25768 return false;
25770 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
25771 rclass_1 = FP_REGS;
25772 else
25773 rclass_1 = GENERAL_REGS;
25775 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
25776 rclass_2 = FP_REGS;
25777 else
25778 rclass_2 = GENERAL_REGS;
25780 /* Check if the registers are of same class. */
25781 if (rclass_1 != rclass_2)
25782 return false;
25784 return true;
25787 /* Given OPERANDS of consecutive load/store that can be merged,
25788 swap them if they are not in ascending order. */
25789 void
25790 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
25792 int mem_op = load ? 1 : 0;
25793 bool reversed = false;
25794 if (!aarch64_check_consecutive_mems (operands + mem_op,
25795 operands + mem_op + 2, &reversed))
25796 gcc_unreachable ();
25798 if (reversed)
25800 /* Irrespective of whether this is a load or a store,
25801 we do the same swap. */
25802 std::swap (operands[0], operands[2]);
25803 std::swap (operands[1], operands[3]);
25807 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
25808 comparison between the two. */
25810 aarch64_host_wide_int_compare (const void *x, const void *y)
25812 return wi::cmps (* ((const HOST_WIDE_INT *) x),
25813 * ((const HOST_WIDE_INT *) y));
25816 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
25817 other pointing to a REG rtx containing an offset, compare the offsets
25818 of the two pairs.
25820 Return:
25822 1 iff offset (X) > offset (Y)
25823 0 iff offset (X) == offset (Y)
25824 -1 iff offset (X) < offset (Y) */
25826 aarch64_ldrstr_offset_compare (const void *x, const void *y)
25828 const rtx * operands_1 = (const rtx *) x;
25829 const rtx * operands_2 = (const rtx *) y;
25830 rtx mem_1, mem_2, base, offset_1, offset_2;
25832 if (MEM_P (operands_1[0]))
25833 mem_1 = operands_1[0];
25834 else
25835 mem_1 = operands_1[1];
25837 if (MEM_P (operands_2[0]))
25838 mem_2 = operands_2[0];
25839 else
25840 mem_2 = operands_2[1];
25842 /* Extract the offsets. */
25843 extract_base_offset_in_addr (mem_1, &base, &offset_1);
25844 extract_base_offset_in_addr (mem_2, &base, &offset_2);
25846 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
25848 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
25851 /* Given OPERANDS of consecutive load/store, check if we can merge
25852 them into ldp/stp by adjusting the offset. LOAD is true if they
25853 are load instructions. MODE is the mode of memory operands.
25855 Given below consecutive stores:
25857 str w1, [xb, 0x100]
25858 str w1, [xb, 0x104]
25859 str w1, [xb, 0x108]
25860 str w1, [xb, 0x10c]
25862 Though the offsets are out of the range supported by stp, we can
25863 still pair them after adjusting the offset, like:
25865 add scratch, xb, 0x100
25866 stp w1, w1, [scratch]
25867 stp w1, w1, [scratch, 0x8]
25869 The peephole patterns detecting this opportunity should guarantee
25870 the scratch register is avaliable. */
25872 bool
25873 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
25874 machine_mode mode)
25876 const int num_insns = 4;
25877 enum reg_class rclass;
25878 HOST_WIDE_INT offvals[num_insns], msize;
25879 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
25881 if (load)
25883 for (int i = 0; i < num_insns; i++)
25885 reg[i] = operands[2 * i];
25886 mem[i] = operands[2 * i + 1];
25888 gcc_assert (REG_P (reg[i]));
25891 /* Do not attempt to merge the loads if the loads clobber each other. */
25892 for (int i = 0; i < 8; i += 2)
25893 for (int j = i + 2; j < 8; j += 2)
25894 if (reg_overlap_mentioned_p (operands[i], operands[j]))
25895 return false;
25897 else
25898 for (int i = 0; i < num_insns; i++)
25900 mem[i] = operands[2 * i];
25901 reg[i] = operands[2 * i + 1];
25904 /* Skip if memory operand is by itself valid for ldp/stp. */
25905 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
25906 return false;
25908 for (int i = 0; i < num_insns; i++)
25910 /* The mems cannot be volatile. */
25911 if (MEM_VOLATILE_P (mem[i]))
25912 return false;
25914 /* Check if the addresses are in the form of [base+offset]. */
25915 extract_base_offset_in_addr (mem[i], base + i, offset + i);
25916 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
25917 return false;
25920 /* Check if the registers are of same class. */
25921 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
25922 ? FP_REGS : GENERAL_REGS;
25924 for (int i = 1; i < num_insns; i++)
25925 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
25927 if (rclass != FP_REGS)
25928 return false;
25930 else
25932 if (rclass != GENERAL_REGS)
25933 return false;
25936 /* Only the last register in the order in which they occur
25937 may be clobbered by the load. */
25938 if (rclass == GENERAL_REGS && load)
25939 for (int i = 0; i < num_insns - 1; i++)
25940 if (reg_mentioned_p (reg[i], mem[i]))
25941 return false;
25943 /* Check if the bases are same. */
25944 for (int i = 0; i < num_insns - 1; i++)
25945 if (!rtx_equal_p (base[i], base[i + 1]))
25946 return false;
25948 for (int i = 0; i < num_insns; i++)
25949 offvals[i] = INTVAL (offset[i]);
25951 msize = GET_MODE_SIZE (mode).to_constant ();
25953 /* Check if the offsets can be put in the right order to do a ldp/stp. */
25954 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
25955 aarch64_host_wide_int_compare);
25957 if (!(offvals[1] == offvals[0] + msize
25958 && offvals[3] == offvals[2] + msize))
25959 return false;
25961 /* Check that offsets are within range of each other. The ldp/stp
25962 instructions have 7 bit immediate offsets, so use 0x80. */
25963 if (offvals[2] - offvals[0] >= msize * 0x80)
25964 return false;
25966 /* The offsets must be aligned with respect to each other. */
25967 if (offvals[0] % msize != offvals[2] % msize)
25968 return false;
25970 /* If we have SImode and slow unaligned ldp,
25971 check the alignment to be at least 8 byte. */
25972 if (mode == SImode
25973 && (aarch64_tune_params.extra_tuning_flags
25974 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
25975 && !optimize_size
25976 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
25977 return false;
25979 return true;
25982 /* Given OPERANDS of consecutive load/store, this function pairs them
25983 into LDP/STP after adjusting the offset. It depends on the fact
25984 that the operands can be sorted so the offsets are correct for STP.
25985 MODE is the mode of memory operands. CODE is the rtl operator
25986 which should be applied to all memory operands, it's SIGN_EXTEND,
25987 ZERO_EXTEND or UNKNOWN. */
25989 bool
25990 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
25991 machine_mode mode, RTX_CODE code)
25993 rtx base, offset_1, offset_3, t1, t2;
25994 rtx mem_1, mem_2, mem_3, mem_4;
25995 rtx temp_operands[8];
25996 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
25997 stp_off_upper_limit, stp_off_lower_limit, msize;
25999 /* We make changes on a copy as we may still bail out. */
26000 for (int i = 0; i < 8; i ++)
26001 temp_operands[i] = operands[i];
26003 /* Sort the operands. Note for cases as below:
26004 [base + 0x310] = A
26005 [base + 0x320] = B
26006 [base + 0x330] = C
26007 [base + 0x320] = D
26008 We need stable sorting otherwise wrong data may be store to offset 0x320.
26009 Also note the dead store in above case should be optimized away, but no
26010 guarantees here. */
26011 gcc_stablesort(temp_operands, 4, 2 * sizeof (rtx *),
26012 aarch64_ldrstr_offset_compare);
26014 /* Copy the memory operands so that if we have to bail for some
26015 reason the original addresses are unchanged. */
26016 if (load)
26018 mem_1 = copy_rtx (temp_operands[1]);
26019 mem_2 = copy_rtx (temp_operands[3]);
26020 mem_3 = copy_rtx (temp_operands[5]);
26021 mem_4 = copy_rtx (temp_operands[7]);
26023 else
26025 mem_1 = copy_rtx (temp_operands[0]);
26026 mem_2 = copy_rtx (temp_operands[2]);
26027 mem_3 = copy_rtx (temp_operands[4]);
26028 mem_4 = copy_rtx (temp_operands[6]);
26029 gcc_assert (code == UNKNOWN);
26032 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26033 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26034 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26035 && offset_3 != NULL_RTX);
26037 /* Adjust offset so it can fit in LDP/STP instruction. */
26038 msize = GET_MODE_SIZE (mode).to_constant();
26039 stp_off_upper_limit = msize * (0x40 - 1);
26040 stp_off_lower_limit = - msize * 0x40;
26042 off_val_1 = INTVAL (offset_1);
26043 off_val_3 = INTVAL (offset_3);
26045 /* The base offset is optimally half way between the two STP/LDP offsets. */
26046 if (msize <= 4)
26047 base_off = (off_val_1 + off_val_3) / 2;
26048 else
26049 /* However, due to issues with negative LDP/STP offset generation for
26050 larger modes, for DF, DD, DI and vector modes. we must not use negative
26051 addresses smaller than 9 signed unadjusted bits can store. This
26052 provides the most range in this case. */
26053 base_off = off_val_1;
26055 /* Adjust the base so that it is aligned with the addresses but still
26056 optimal. */
26057 if (base_off % msize != off_val_1 % msize)
26058 /* Fix the offset, bearing in mind we want to make it bigger not
26059 smaller. */
26060 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26061 else if (msize <= 4)
26062 /* The negative range of LDP/STP is one larger than the positive range. */
26063 base_off += msize;
26065 /* Check if base offset is too big or too small. We can attempt to resolve
26066 this issue by setting it to the maximum value and seeing if the offsets
26067 still fit. */
26068 if (base_off >= 0x1000)
26070 base_off = 0x1000 - 1;
26071 /* We must still make sure that the base offset is aligned with respect
26072 to the address. But it may not be made any bigger. */
26073 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26076 /* Likewise for the case where the base is too small. */
26077 if (base_off <= -0x1000)
26079 base_off = -0x1000 + 1;
26080 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26083 /* Offset of the first STP/LDP. */
26084 new_off_1 = off_val_1 - base_off;
26086 /* Offset of the second STP/LDP. */
26087 new_off_3 = off_val_3 - base_off;
26089 /* The offsets must be within the range of the LDP/STP instructions. */
26090 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26091 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26092 return false;
26094 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26095 new_off_1), true);
26096 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26097 new_off_1 + msize), true);
26098 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26099 new_off_3), true);
26100 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26101 new_off_3 + msize), true);
26103 if (!aarch64_mem_pair_operand (mem_1, mode)
26104 || !aarch64_mem_pair_operand (mem_3, mode))
26105 return false;
26107 if (code == ZERO_EXTEND)
26109 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26110 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26111 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26112 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26114 else if (code == SIGN_EXTEND)
26116 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26117 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26118 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26119 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26122 if (load)
26124 operands[0] = temp_operands[0];
26125 operands[1] = mem_1;
26126 operands[2] = temp_operands[2];
26127 operands[3] = mem_2;
26128 operands[4] = temp_operands[4];
26129 operands[5] = mem_3;
26130 operands[6] = temp_operands[6];
26131 operands[7] = mem_4;
26133 else
26135 operands[0] = mem_1;
26136 operands[1] = temp_operands[1];
26137 operands[2] = mem_2;
26138 operands[3] = temp_operands[3];
26139 operands[4] = mem_3;
26140 operands[5] = temp_operands[5];
26141 operands[6] = mem_4;
26142 operands[7] = temp_operands[7];
26145 /* Emit adjusting instruction. */
26146 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26147 /* Emit ldp/stp instructions. */
26148 t1 = gen_rtx_SET (operands[0], operands[1]);
26149 t2 = gen_rtx_SET (operands[2], operands[3]);
26150 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26151 t1 = gen_rtx_SET (operands[4], operands[5]);
26152 t2 = gen_rtx_SET (operands[6], operands[7]);
26153 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26154 return true;
26157 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26158 it isn't worth branching around empty masked ops (including masked
26159 stores). */
26161 static bool
26162 aarch64_empty_mask_is_expensive (unsigned)
26164 return false;
26167 /* Return 1 if pseudo register should be created and used to hold
26168 GOT address for PIC code. */
26170 bool
26171 aarch64_use_pseudo_pic_reg (void)
26173 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26176 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26178 static int
26179 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26181 switch (XINT (x, 1))
26183 case UNSPEC_GOTSMALLPIC:
26184 case UNSPEC_GOTSMALLPIC28K:
26185 case UNSPEC_GOTTINYPIC:
26186 return 0;
26187 default:
26188 break;
26191 return default_unspec_may_trap_p (x, flags);
26195 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
26196 return the log2 of that value. Otherwise return -1. */
26199 aarch64_fpconst_pow_of_2 (rtx x)
26201 const REAL_VALUE_TYPE *r;
26203 if (!CONST_DOUBLE_P (x))
26204 return -1;
26206 r = CONST_DOUBLE_REAL_VALUE (x);
26208 if (REAL_VALUE_NEGATIVE (*r)
26209 || REAL_VALUE_ISNAN (*r)
26210 || REAL_VALUE_ISINF (*r)
26211 || !real_isinteger (r, DFmode))
26212 return -1;
26214 return exact_log2 (real_to_integer (r));
26217 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26218 power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
26219 return n. Otherwise return -1. */
26222 aarch64_fpconst_pow2_recip (rtx x)
26224 REAL_VALUE_TYPE r0;
26226 if (!CONST_DOUBLE_P (x))
26227 return -1;
26229 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26230 if (exact_real_inverse (DFmode, &r0)
26231 && !REAL_VALUE_NEGATIVE (r0))
26233 int ret = exact_log2 (real_to_integer (&r0));
26234 if (ret >= 1 && ret <= 32)
26235 return ret;
26237 return -1;
26240 /* If X is a vector of equal CONST_DOUBLE values and that value is
26241 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26244 aarch64_vec_fpconst_pow_of_2 (rtx x)
26246 int nelts;
26247 if (!CONST_VECTOR_P (x)
26248 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26249 return -1;
26251 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26252 return -1;
26254 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26255 if (firstval <= 0)
26256 return -1;
26258 for (int i = 1; i < nelts; i++)
26259 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26260 return -1;
26262 return firstval;
26265 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26266 to float.
26268 __fp16 always promotes through this hook.
26269 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26270 through the generic excess precision logic rather than here. */
26272 static tree
26273 aarch64_promoted_type (const_tree t)
26275 if (SCALAR_FLOAT_TYPE_P (t)
26276 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26277 return float_type_node;
26279 return NULL_TREE;
26282 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26284 static bool
26285 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26286 optimization_type opt_type)
26288 switch (op)
26290 case rsqrt_optab:
26291 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26293 default:
26294 return true;
26298 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26300 static unsigned int
26301 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26302 int *offset)
26304 /* Polynomial invariant 1 == (VG / 2) - 1. */
26305 gcc_assert (i == 1);
26306 *factor = 2;
26307 *offset = 1;
26308 return AARCH64_DWARF_VG;
26311 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
26312 if MODE is HFmode, and punt to the generic implementation otherwise. */
26314 static bool
26315 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26317 return (mode == HFmode
26318 ? true
26319 : default_libgcc_floating_mode_supported_p (mode));
26322 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26323 if MODE is HFmode, and punt to the generic implementation otherwise. */
26325 static bool
26326 aarch64_scalar_mode_supported_p (scalar_mode mode)
26328 if (DECIMAL_FLOAT_MODE_P (mode))
26329 return default_decimal_float_supported_p ();
26331 return (mode == HFmode
26332 ? true
26333 : default_scalar_mode_supported_p (mode));
26336 /* Set the value of FLT_EVAL_METHOD.
26337 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26339 0: evaluate all operations and constants, whose semantic type has at
26340 most the range and precision of type float, to the range and
26341 precision of float; evaluate all other operations and constants to
26342 the range and precision of the semantic type;
26344 N, where _FloatN is a supported interchange floating type
26345 evaluate all operations and constants, whose semantic type has at
26346 most the range and precision of _FloatN type, to the range and
26347 precision of the _FloatN type; evaluate all other operations and
26348 constants to the range and precision of the semantic type;
26350 If we have the ARMv8.2-A extensions then we support _Float16 in native
26351 precision, so we should set this to 16. Otherwise, we support the type,
26352 but want to evaluate expressions in float precision, so set this to
26353 0. */
26355 static enum flt_eval_method
26356 aarch64_excess_precision (enum excess_precision_type type)
26358 switch (type)
26360 case EXCESS_PRECISION_TYPE_FAST:
26361 case EXCESS_PRECISION_TYPE_STANDARD:
26362 /* We can calculate either in 16-bit range and precision or
26363 32-bit range and precision. Make that decision based on whether
26364 we have native support for the ARMv8.2-A 16-bit floating-point
26365 instructions or not. */
26366 return (TARGET_FP_F16INST
26367 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26368 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26369 case EXCESS_PRECISION_TYPE_IMPLICIT:
26370 case EXCESS_PRECISION_TYPE_FLOAT16:
26371 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26372 default:
26373 gcc_unreachable ();
26375 return FLT_EVAL_METHOD_UNPREDICTABLE;
26378 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26379 scheduled for speculative execution. Reject the long-running division
26380 and square-root instructions. */
26382 static bool
26383 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26385 switch (get_attr_type (insn))
26387 case TYPE_SDIV:
26388 case TYPE_UDIV:
26389 case TYPE_FDIVS:
26390 case TYPE_FDIVD:
26391 case TYPE_FSQRTS:
26392 case TYPE_FSQRTD:
26393 case TYPE_NEON_FP_SQRT_S:
26394 case TYPE_NEON_FP_SQRT_D:
26395 case TYPE_NEON_FP_SQRT_S_Q:
26396 case TYPE_NEON_FP_SQRT_D_Q:
26397 case TYPE_NEON_FP_DIV_S:
26398 case TYPE_NEON_FP_DIV_D:
26399 case TYPE_NEON_FP_DIV_S_Q:
26400 case TYPE_NEON_FP_DIV_D_Q:
26401 return false;
26402 default:
26403 return true;
26407 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26409 static int
26410 aarch64_compute_pressure_classes (reg_class *classes)
26412 int i = 0;
26413 classes[i++] = GENERAL_REGS;
26414 classes[i++] = FP_REGS;
26415 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26416 registers need to go in PR_LO_REGS at some point during their
26417 lifetime. Splitting it into two halves has the effect of making
26418 all predicates count against PR_LO_REGS, so that we try whenever
26419 possible to restrict the number of live predicates to 8. This
26420 greatly reduces the amount of spilling in certain loops. */
26421 classes[i++] = PR_LO_REGS;
26422 classes[i++] = PR_HI_REGS;
26423 return i;
26426 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26428 static bool
26429 aarch64_can_change_mode_class (machine_mode from,
26430 machine_mode to, reg_class_t)
26432 unsigned int from_flags = aarch64_classify_vector_mode (from);
26433 unsigned int to_flags = aarch64_classify_vector_mode (to);
26435 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26436 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26438 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26439 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26441 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26442 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26444 bool from_full_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT));
26445 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26446 | VEC_PARTIAL));
26448 /* Don't allow changes between predicate modes and other modes.
26449 Only predicate registers can hold predicate modes and only
26450 non-predicate registers can hold non-predicate modes, so any
26451 attempt to mix them would require a round trip through memory. */
26452 if (from_pred_p != to_pred_p)
26453 return false;
26455 /* Don't allow changes between partial SVE modes and other modes.
26456 The contents of partial SVE modes are distributed evenly across
26457 the register, whereas GCC expects them to be clustered together. */
26458 if (from_partial_sve_p != to_partial_sve_p)
26459 return false;
26461 /* Similarly reject changes between partial SVE modes that have
26462 different patterns of significant and insignificant bits. */
26463 if (from_partial_sve_p
26464 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26465 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26466 return false;
26468 /* Don't allow changes between partial and full Advanced SIMD structure
26469 modes. */
26470 if (from_full_advsimd_struct_p && to_partial_advsimd_struct_p)
26471 return false;
26473 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26475 /* Don't allow changes between SVE modes and other modes that might
26476 be bigger than 128 bits. In particular, OImode, CImode and XImode
26477 divide into 128-bit quantities while SVE modes divide into
26478 BITS_PER_SVE_VECTOR quantities. */
26479 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26480 return false;
26481 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26482 return false;
26485 if (BYTES_BIG_ENDIAN)
26487 /* Don't allow changes between SVE data modes and non-SVE modes.
26488 See the comment at the head of aarch64-sve.md for details. */
26489 if (from_sve_p != to_sve_p)
26490 return false;
26492 /* Don't allow changes in element size: lane 0 of the new vector
26493 would not then be lane 0 of the old vector. See the comment
26494 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26495 description.
26497 In the worst case, this forces a register to be spilled in
26498 one mode and reloaded in the other, which handles the
26499 endianness correctly. */
26500 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26501 return false;
26503 return true;
26506 /* Implement TARGET_EARLY_REMAT_MODES. */
26508 static void
26509 aarch64_select_early_remat_modes (sbitmap modes)
26511 /* SVE values are not normally live across a call, so it should be
26512 worth doing early rematerialization even in VL-specific mode. */
26513 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26514 if (aarch64_sve_mode_p ((machine_mode) i))
26515 bitmap_set_bit (modes, i);
26518 /* Override the default target speculation_safe_value. */
26519 static rtx
26520 aarch64_speculation_safe_value (machine_mode mode,
26521 rtx result, rtx val, rtx failval)
26523 /* Maybe we should warn if falling back to hard barriers. They are
26524 likely to be noticably more expensive than the alternative below. */
26525 if (!aarch64_track_speculation)
26526 return default_speculation_safe_value (mode, result, val, failval);
26528 if (!REG_P (val))
26529 val = copy_to_mode_reg (mode, val);
26531 if (!aarch64_reg_or_zero (failval, mode))
26532 failval = copy_to_mode_reg (mode, failval);
26534 emit_insn (gen_despeculate_copy (mode, result, val, failval));
26535 return result;
26538 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26539 Look into the tuning structure for an estimate.
26540 KIND specifies the type of requested estimate: min, max or likely.
26541 For cores with a known SVE width all three estimates are the same.
26542 For generic SVE tuning we want to distinguish the maximum estimate from
26543 the minimum and likely ones.
26544 The likely estimate is the same as the minimum in that case to give a
26545 conservative behavior of auto-vectorizing with SVE when it is a win
26546 even for 128-bit SVE.
26547 When SVE width information is available VAL.coeffs[1] is multiplied by
26548 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
26550 static HOST_WIDE_INT
26551 aarch64_estimated_poly_value (poly_int64 val,
26552 poly_value_estimate_kind kind
26553 = POLY_VALUE_LIKELY)
26555 unsigned int width_source = aarch64_tune_params.sve_width;
26557 /* If there is no core-specific information then the minimum and likely
26558 values are based on 128-bit vectors and the maximum is based on
26559 the architectural maximum of 2048 bits. */
26560 if (width_source == SVE_SCALABLE)
26561 switch (kind)
26563 case POLY_VALUE_MIN:
26564 case POLY_VALUE_LIKELY:
26565 return val.coeffs[0];
26566 case POLY_VALUE_MAX:
26567 return val.coeffs[0] + val.coeffs[1] * 15;
26570 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26571 as likely. This could be made more general if future -mtune options
26572 need it to be. */
26573 if (kind == POLY_VALUE_MAX)
26574 width_source = 1 << floor_log2 (width_source);
26575 else
26576 width_source = least_bit_hwi (width_source);
26578 /* If the core provides width information, use that. */
26579 HOST_WIDE_INT over_128 = width_source - 128;
26580 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
26584 /* Return true for types that could be supported as SIMD return or
26585 argument types. */
26587 static bool
26588 supported_simd_type (tree t)
26590 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
26592 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
26593 return s == 1 || s == 2 || s == 4 || s == 8;
26595 return false;
26598 /* Return true for types that currently are supported as SIMD return
26599 or argument types. */
26601 static bool
26602 currently_supported_simd_type (tree t, tree b)
26604 if (COMPLEX_FLOAT_TYPE_P (t))
26605 return false;
26607 if (TYPE_SIZE (t) != TYPE_SIZE (b))
26608 return false;
26610 return supported_simd_type (t);
26613 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
26615 static int
26616 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
26617 struct cgraph_simd_clone *clonei,
26618 tree base_type, int num)
26620 tree t, ret_type;
26621 unsigned int elt_bits, count;
26622 unsigned HOST_WIDE_INT const_simdlen;
26623 poly_uint64 vec_bits;
26625 if (!TARGET_SIMD)
26626 return 0;
26628 /* For now, SVE simdclones won't produce illegal simdlen, So only check
26629 const simdlens here. */
26630 if (maybe_ne (clonei->simdlen, 0U)
26631 && clonei->simdlen.is_constant (&const_simdlen)
26632 && (const_simdlen < 2
26633 || const_simdlen > 1024
26634 || (const_simdlen & (const_simdlen - 1)) != 0))
26636 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26637 "unsupported simdlen %wd", const_simdlen);
26638 return 0;
26641 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
26642 if (TREE_CODE (ret_type) != VOID_TYPE
26643 && !currently_supported_simd_type (ret_type, base_type))
26645 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
26646 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26647 "GCC does not currently support mixed size types "
26648 "for %<simd%> functions");
26649 else if (supported_simd_type (ret_type))
26650 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26651 "GCC does not currently support return type %qT "
26652 "for %<simd%> functions", ret_type);
26653 else
26654 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26655 "unsupported return type %qT for %<simd%> functions",
26656 ret_type);
26657 return 0;
26660 int i;
26661 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
26662 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
26664 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
26665 t && t != void_list_node; t = TREE_CHAIN (t), i++)
26667 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
26669 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
26670 && !currently_supported_simd_type (arg_type, base_type))
26672 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
26673 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26674 "GCC does not currently support mixed size types "
26675 "for %<simd%> functions");
26676 else
26677 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26678 "GCC does not currently support argument type %qT "
26679 "for %<simd%> functions", arg_type);
26680 return 0;
26684 clonei->vecsize_mangle = 'n';
26685 clonei->mask_mode = VOIDmode;
26686 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
26687 if (known_eq (clonei->simdlen, 0U))
26689 count = 2;
26690 vec_bits = (num == 0 ? 64 : 128);
26691 clonei->simdlen = exact_div (vec_bits, elt_bits);
26693 else
26695 count = 1;
26696 vec_bits = clonei->simdlen * elt_bits;
26697 /* For now, SVE simdclones won't produce illegal simdlen, So only check
26698 const simdlens here. */
26699 if (clonei->simdlen.is_constant (&const_simdlen)
26700 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
26702 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26703 "GCC does not currently support simdlen %wd for type %qT",
26704 const_simdlen, base_type);
26705 return 0;
26708 clonei->vecsize_int = vec_bits;
26709 clonei->vecsize_float = vec_bits;
26710 return count;
26713 /* Implement TARGET_SIMD_CLONE_ADJUST. */
26715 static void
26716 aarch64_simd_clone_adjust (struct cgraph_node *node)
26718 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
26719 use the correct ABI. */
26721 tree t = TREE_TYPE (node->decl);
26722 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
26723 TYPE_ATTRIBUTES (t));
26726 /* Implement TARGET_SIMD_CLONE_USABLE. */
26728 static int
26729 aarch64_simd_clone_usable (struct cgraph_node *node)
26731 switch (node->simdclone->vecsize_mangle)
26733 case 'n':
26734 if (!TARGET_SIMD)
26735 return -1;
26736 return 0;
26737 default:
26738 gcc_unreachable ();
26742 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
26744 static int
26745 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
26747 auto check_attr = [&](const char *name) {
26748 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
26749 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
26750 if (!attr1 && !attr2)
26751 return true;
26753 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
26756 if (!check_attr ("aarch64_vector_pcs"))
26757 return 0;
26758 if (!check_attr ("Advanced SIMD type"))
26759 return 0;
26760 if (!check_attr ("SVE type"))
26761 return 0;
26762 if (!check_attr ("SVE sizeless type"))
26763 return 0;
26764 return 1;
26767 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
26769 static const char *
26770 aarch64_get_multilib_abi_name (void)
26772 if (TARGET_BIG_END)
26773 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
26774 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
26777 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
26778 global variable based guard use the default else
26779 return a null tree. */
26780 static tree
26781 aarch64_stack_protect_guard (void)
26783 if (aarch64_stack_protector_guard == SSP_GLOBAL)
26784 return default_stack_protect_guard ();
26786 return NULL_TREE;
26789 /* Return the diagnostic message string if conversion from FROMTYPE to
26790 TOTYPE is not allowed, NULL otherwise. */
26792 static const char *
26793 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
26795 if (element_mode (fromtype) != element_mode (totype))
26797 /* Do no allow conversions to/from BFmode scalar types. */
26798 if (TYPE_MODE (fromtype) == BFmode)
26799 return N_("invalid conversion from type %<bfloat16_t%>");
26800 if (TYPE_MODE (totype) == BFmode)
26801 return N_("invalid conversion to type %<bfloat16_t%>");
26804 /* Conversion allowed. */
26805 return NULL;
26808 /* Return the diagnostic message string if the unary operation OP is
26809 not permitted on TYPE, NULL otherwise. */
26811 static const char *
26812 aarch64_invalid_unary_op (int op, const_tree type)
26814 /* Reject all single-operand operations on BFmode except for &. */
26815 if (element_mode (type) == BFmode && op != ADDR_EXPR)
26816 return N_("operation not permitted on type %<bfloat16_t%>");
26818 /* Operation allowed. */
26819 return NULL;
26822 /* Return the diagnostic message string if the binary operation OP is
26823 not permitted on TYPE1 and TYPE2, NULL otherwise. */
26825 static const char *
26826 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
26827 const_tree type2)
26829 /* Reject all 2-operand operations on BFmode. */
26830 if (element_mode (type1) == BFmode
26831 || element_mode (type2) == BFmode)
26832 return N_("operation not permitted on type %<bfloat16_t%>");
26834 if (VECTOR_TYPE_P (type1)
26835 && VECTOR_TYPE_P (type2)
26836 && !TYPE_INDIVISIBLE_P (type1)
26837 && !TYPE_INDIVISIBLE_P (type2)
26838 && (aarch64_sve::builtin_type_p (type1)
26839 != aarch64_sve::builtin_type_p (type2)))
26840 return N_("cannot combine GNU and SVE vectors in a binary operation");
26842 /* Operation allowed. */
26843 return NULL;
26846 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
26847 compiler that we automatically ignore the top byte of our pointers, which
26848 allows using -fsanitize=hwaddress. */
26849 bool
26850 aarch64_can_tag_addresses ()
26852 return !TARGET_ILP32;
26855 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
26856 section at the end if needed. */
26857 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
26858 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
26859 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
26860 void
26861 aarch64_file_end_indicate_exec_stack ()
26863 file_end_indicate_exec_stack ();
26865 unsigned feature_1_and = 0;
26866 if (aarch64_bti_enabled ())
26867 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
26869 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
26870 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
26872 if (feature_1_and)
26874 /* Generate .note.gnu.property section. */
26875 switch_to_section (get_section (".note.gnu.property",
26876 SECTION_NOTYPE, NULL));
26878 /* PT_NOTE header: namesz, descsz, type.
26879 namesz = 4 ("GNU\0")
26880 descsz = 16 (Size of the program property array)
26881 [(12 + padding) * Number of array elements]
26882 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
26883 assemble_align (POINTER_SIZE);
26884 assemble_integer (GEN_INT (4), 4, 32, 1);
26885 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
26886 assemble_integer (GEN_INT (5), 4, 32, 1);
26888 /* PT_NOTE name. */
26889 assemble_string ("GNU", 4);
26891 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
26892 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
26893 datasz = 4
26894 data = feature_1_and. */
26895 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
26896 assemble_integer (GEN_INT (4), 4, 32, 1);
26897 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
26899 /* Pad the size of the note to the required alignment. */
26900 assemble_align (POINTER_SIZE);
26903 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
26904 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
26905 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
26907 /* Helper function for straight line speculation.
26908 Return what barrier should be emitted for straight line speculation
26909 mitigation.
26910 When not mitigating against straight line speculation this function returns
26911 an empty string.
26912 When mitigating against straight line speculation, use:
26913 * SB when the v8.5-A SB extension is enabled.
26914 * DSB+ISB otherwise. */
26915 const char *
26916 aarch64_sls_barrier (int mitigation_required)
26918 return mitigation_required
26919 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
26920 : "";
26923 static GTY (()) tree aarch64_sls_shared_thunks[30];
26924 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
26925 const char *indirect_symbol_names[30] = {
26926 "__call_indirect_x0",
26927 "__call_indirect_x1",
26928 "__call_indirect_x2",
26929 "__call_indirect_x3",
26930 "__call_indirect_x4",
26931 "__call_indirect_x5",
26932 "__call_indirect_x6",
26933 "__call_indirect_x7",
26934 "__call_indirect_x8",
26935 "__call_indirect_x9",
26936 "__call_indirect_x10",
26937 "__call_indirect_x11",
26938 "__call_indirect_x12",
26939 "__call_indirect_x13",
26940 "__call_indirect_x14",
26941 "__call_indirect_x15",
26942 "", /* "__call_indirect_x16", */
26943 "", /* "__call_indirect_x17", */
26944 "__call_indirect_x18",
26945 "__call_indirect_x19",
26946 "__call_indirect_x20",
26947 "__call_indirect_x21",
26948 "__call_indirect_x22",
26949 "__call_indirect_x23",
26950 "__call_indirect_x24",
26951 "__call_indirect_x25",
26952 "__call_indirect_x26",
26953 "__call_indirect_x27",
26954 "__call_indirect_x28",
26955 "__call_indirect_x29",
26958 /* Function to create a BLR thunk. This thunk is used to mitigate straight
26959 line speculation. Instead of a simple BLR that can be speculated past,
26960 we emit a BL to this thunk, and this thunk contains a BR to the relevant
26961 register. These thunks have the relevant speculation barries put after
26962 their indirect branch so that speculation is blocked.
26964 We use such a thunk so the speculation barriers are kept off the
26965 architecturally executed path in order to reduce the performance overhead.
26967 When optimizing for size we use stubs shared by the linked object.
26968 When optimizing for performance we emit stubs for each function in the hope
26969 that the branch predictor can better train on jumps specific for a given
26970 function. */
26972 aarch64_sls_create_blr_label (int regnum)
26974 gcc_assert (STUB_REGNUM_P (regnum));
26975 if (optimize_function_for_size_p (cfun))
26977 /* For the thunks shared between different functions in this compilation
26978 unit we use a named symbol -- this is just for users to more easily
26979 understand the generated assembly. */
26980 aarch64_sls_shared_thunks_needed = true;
26981 const char *thunk_name = indirect_symbol_names[regnum];
26982 if (aarch64_sls_shared_thunks[regnum] == NULL)
26984 /* Build a decl representing this function stub and record it for
26985 later. We build a decl here so we can use the GCC machinery for
26986 handling sections automatically (through `get_named_section` and
26987 `make_decl_one_only`). That saves us a lot of trouble handling
26988 the specifics of different output file formats. */
26989 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
26990 get_identifier (thunk_name),
26991 build_function_type_list (void_type_node,
26992 NULL_TREE));
26993 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
26994 NULL_TREE, void_type_node);
26995 TREE_PUBLIC (decl) = 1;
26996 TREE_STATIC (decl) = 1;
26997 DECL_IGNORED_P (decl) = 1;
26998 DECL_ARTIFICIAL (decl) = 1;
26999 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27000 resolve_unique_section (decl, 0, false);
27001 aarch64_sls_shared_thunks[regnum] = decl;
27004 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27007 if (cfun->machine->call_via[regnum] == NULL)
27008 cfun->machine->call_via[regnum]
27009 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27010 return cfun->machine->call_via[regnum];
27013 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27014 aarch64_sls_emit_shared_blr_thunks below. */
27015 static void
27016 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27018 /* Save in x16 and branch to that function so this transformation does
27019 not prevent jumping to `BTI c` instructions. */
27020 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27021 asm_fprintf (out_file, "\tbr\tx16\n");
27024 /* Emit all BLR stubs for this particular function.
27025 Here we emit all the BLR stubs needed for the current function. Since we
27026 emit these stubs in a consecutive block we know there will be no speculation
27027 gadgets between each stub, and hence we only emit a speculation barrier at
27028 the end of the stub sequences.
27030 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27031 void
27032 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27034 if (! aarch64_harden_sls_blr_p ())
27035 return;
27037 bool any_functions_emitted = false;
27038 /* We must save and restore the current function section since this assembly
27039 is emitted at the end of the function. This means it can be emitted *just
27040 after* the cold section of a function. That cold part would be emitted in
27041 a different section. That switch would trigger a `.cfi_endproc` directive
27042 to be emitted in the original section and a `.cfi_startproc` directive to
27043 be emitted in the new section. Switching to the original section without
27044 restoring would mean that the `.cfi_endproc` emitted as a function ends
27045 would happen in a different section -- leaving an unmatched
27046 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27047 in the standard text section. */
27048 section *save_text_section = in_section;
27049 switch_to_section (function_section (current_function_decl));
27050 for (int regnum = 0; regnum < 30; ++regnum)
27052 rtx specu_label = cfun->machine->call_via[regnum];
27053 if (specu_label == NULL)
27054 continue;
27056 targetm.asm_out.print_operand (out_file, specu_label, 0);
27057 asm_fprintf (out_file, ":\n");
27058 aarch64_sls_emit_function_stub (out_file, regnum);
27059 any_functions_emitted = true;
27061 if (any_functions_emitted)
27062 /* Can use the SB if needs be here, since this stub will only be used
27063 by the current function, and hence for the current target. */
27064 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27065 switch_to_section (save_text_section);
27068 /* Emit shared BLR stubs for the current compilation unit.
27069 Over the course of compiling this unit we may have converted some BLR
27070 instructions to a BL to a shared stub function. This is where we emit those
27071 stub functions.
27072 This function is for the stubs shared between different functions in this
27073 compilation unit. We share when optimizing for size instead of speed.
27075 This function is called through the TARGET_ASM_FILE_END hook. */
27076 void
27077 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27079 if (! aarch64_sls_shared_thunks_needed)
27080 return;
27082 for (int regnum = 0; regnum < 30; ++regnum)
27084 tree decl = aarch64_sls_shared_thunks[regnum];
27085 if (!decl)
27086 continue;
27088 const char *name = indirect_symbol_names[regnum];
27089 switch_to_section (get_named_section (decl, NULL, 0));
27090 ASM_OUTPUT_ALIGN (out_file, 2);
27091 targetm.asm_out.globalize_label (out_file, name);
27092 /* Only emits if the compiler is configured for an assembler that can
27093 handle visibility directives. */
27094 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27095 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27096 ASM_OUTPUT_LABEL (out_file, name);
27097 aarch64_sls_emit_function_stub (out_file, regnum);
27098 /* Use the most conservative target to ensure it can always be used by any
27099 function in the translation unit. */
27100 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27101 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27105 /* Implement TARGET_ASM_FILE_END. */
27106 void
27107 aarch64_asm_file_end ()
27109 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27110 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27111 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27112 for FreeBSD) still gets called. */
27113 #ifdef TARGET_ASM_FILE_END
27114 TARGET_ASM_FILE_END ();
27115 #endif
27118 const char *
27119 aarch64_indirect_call_asm (rtx addr)
27121 gcc_assert (REG_P (addr));
27122 if (aarch64_harden_sls_blr_p ())
27124 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27125 output_asm_insn ("bl\t%0", &stub_label);
27127 else
27128 output_asm_insn ("blr\t%0", &addr);
27129 return "";
27132 /* Target-specific selftests. */
27134 #if CHECKING_P
27136 namespace selftest {
27138 /* Selftest for the RTL loader.
27139 Verify that the RTL loader copes with a dump from
27140 print_rtx_function. This is essentially just a test that class
27141 function_reader can handle a real dump, but it also verifies
27142 that lookup_reg_by_dump_name correctly handles hard regs.
27143 The presence of hard reg names in the dump means that the test is
27144 target-specific, hence it is in this file. */
27146 static void
27147 aarch64_test_loading_full_dump ()
27149 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27151 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27153 rtx_insn *insn_1 = get_insn_by_uid (1);
27154 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27156 rtx_insn *insn_15 = get_insn_by_uid (15);
27157 ASSERT_EQ (INSN, GET_CODE (insn_15));
27158 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27160 /* Verify crtl->return_rtx. */
27161 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27162 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27163 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27166 /* Test the fractional_cost class. */
27168 static void
27169 aarch64_test_fractional_cost ()
27171 using cf = fractional_cost;
27173 ASSERT_EQ (cf (0, 20), 0);
27175 ASSERT_EQ (cf (4, 2), 2);
27176 ASSERT_EQ (3, cf (9, 3));
27178 ASSERT_NE (cf (5, 2), 2);
27179 ASSERT_NE (3, cf (8, 3));
27181 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27182 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27183 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27185 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27186 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27187 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27188 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27189 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27190 ASSERT_EQ (3 - cf (10, 3), 0);
27192 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27193 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27195 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27196 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27197 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27198 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27199 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27200 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27201 ASSERT_TRUE (cf (239, 240) < 1);
27202 ASSERT_FALSE (cf (240, 240) < 1);
27203 ASSERT_FALSE (cf (241, 240) < 1);
27204 ASSERT_FALSE (2 < cf (207, 104));
27205 ASSERT_FALSE (2 < cf (208, 104));
27206 ASSERT_TRUE (2 < cf (209, 104));
27208 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27209 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27210 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27211 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27212 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27213 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27214 ASSERT_TRUE (cf (239, 240) < 1);
27215 ASSERT_FALSE (cf (240, 240) < 1);
27216 ASSERT_FALSE (cf (241, 240) < 1);
27217 ASSERT_FALSE (2 < cf (207, 104));
27218 ASSERT_FALSE (2 < cf (208, 104));
27219 ASSERT_TRUE (2 < cf (209, 104));
27221 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27222 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27223 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27224 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27225 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27226 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27227 ASSERT_FALSE (cf (239, 240) >= 1);
27228 ASSERT_TRUE (cf (240, 240) >= 1);
27229 ASSERT_TRUE (cf (241, 240) >= 1);
27230 ASSERT_TRUE (2 >= cf (207, 104));
27231 ASSERT_TRUE (2 >= cf (208, 104));
27232 ASSERT_FALSE (2 >= cf (209, 104));
27234 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27235 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27236 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27237 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27238 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27239 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27240 ASSERT_FALSE (cf (239, 240) > 1);
27241 ASSERT_FALSE (cf (240, 240) > 1);
27242 ASSERT_TRUE (cf (241, 240) > 1);
27243 ASSERT_TRUE (2 > cf (207, 104));
27244 ASSERT_FALSE (2 > cf (208, 104));
27245 ASSERT_FALSE (2 > cf (209, 104));
27247 ASSERT_EQ (cf (1, 2).ceil (), 1);
27248 ASSERT_EQ (cf (11, 7).ceil (), 2);
27249 ASSERT_EQ (cf (20, 1).ceil (), 20);
27250 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27251 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27252 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27253 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27254 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27256 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27259 /* Run all target-specific selftests. */
27261 static void
27262 aarch64_run_selftests (void)
27264 aarch64_test_loading_full_dump ();
27265 aarch64_test_fractional_cost ();
27268 } // namespace selftest
27270 #endif /* #if CHECKING_P */
27272 #undef TARGET_STACK_PROTECT_GUARD
27273 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27275 #undef TARGET_ADDRESS_COST
27276 #define TARGET_ADDRESS_COST aarch64_address_cost
27278 /* This hook will determines whether unnamed bitfields affect the alignment
27279 of the containing structure. The hook returns true if the structure
27280 should inherit the alignment requirements of an unnamed bitfield's
27281 type. */
27282 #undef TARGET_ALIGN_ANON_BITFIELD
27283 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27285 #undef TARGET_ASM_ALIGNED_DI_OP
27286 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27288 #undef TARGET_ASM_ALIGNED_HI_OP
27289 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27291 #undef TARGET_ASM_ALIGNED_SI_OP
27292 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27294 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27295 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27296 hook_bool_const_tree_hwi_hwi_const_tree_true
27298 #undef TARGET_ASM_FILE_START
27299 #define TARGET_ASM_FILE_START aarch64_start_file
27301 #undef TARGET_ASM_OUTPUT_MI_THUNK
27302 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27304 #undef TARGET_ASM_SELECT_RTX_SECTION
27305 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27307 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27308 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27310 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27311 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27313 #undef TARGET_BUILD_BUILTIN_VA_LIST
27314 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27316 #undef TARGET_CALLEE_COPIES
27317 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27319 #undef TARGET_CAN_ELIMINATE
27320 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27322 #undef TARGET_CAN_INLINE_P
27323 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27325 #undef TARGET_CANNOT_FORCE_CONST_MEM
27326 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27328 #undef TARGET_CASE_VALUES_THRESHOLD
27329 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27331 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27332 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27334 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27335 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27337 /* Only the least significant bit is used for initialization guard
27338 variables. */
27339 #undef TARGET_CXX_GUARD_MASK_BIT
27340 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27342 #undef TARGET_C_MODE_FOR_SUFFIX
27343 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27345 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27346 #undef TARGET_DEFAULT_TARGET_FLAGS
27347 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27348 #endif
27350 #undef TARGET_CLASS_MAX_NREGS
27351 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27353 #undef TARGET_BUILTIN_DECL
27354 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27356 #undef TARGET_BUILTIN_RECIPROCAL
27357 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27359 #undef TARGET_C_EXCESS_PRECISION
27360 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27362 #undef TARGET_EXPAND_BUILTIN
27363 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27365 #undef TARGET_EXPAND_BUILTIN_VA_START
27366 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27368 #undef TARGET_FOLD_BUILTIN
27369 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27371 #undef TARGET_FUNCTION_ARG
27372 #define TARGET_FUNCTION_ARG aarch64_function_arg
27374 #undef TARGET_FUNCTION_ARG_ADVANCE
27375 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27377 #undef TARGET_FUNCTION_ARG_BOUNDARY
27378 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27380 #undef TARGET_FUNCTION_ARG_PADDING
27381 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27383 #undef TARGET_GET_RAW_RESULT_MODE
27384 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27385 #undef TARGET_GET_RAW_ARG_MODE
27386 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27388 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27389 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27391 #undef TARGET_FUNCTION_VALUE
27392 #define TARGET_FUNCTION_VALUE aarch64_function_value
27394 #undef TARGET_FUNCTION_VALUE_REGNO_P
27395 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27397 #undef TARGET_GIMPLE_FOLD_BUILTIN
27398 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27400 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27401 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27403 #undef TARGET_INIT_BUILTINS
27404 #define TARGET_INIT_BUILTINS aarch64_init_builtins
27406 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27407 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27408 aarch64_ira_change_pseudo_allocno_class
27410 #undef TARGET_LEGITIMATE_ADDRESS_P
27411 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27413 #undef TARGET_LEGITIMATE_CONSTANT_P
27414 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27416 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27417 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27418 aarch64_legitimize_address_displacement
27420 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27421 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27423 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27424 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27425 aarch64_libgcc_floating_mode_supported_p
27427 #undef TARGET_MANGLE_TYPE
27428 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27430 #undef TARGET_INVALID_CONVERSION
27431 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
27433 #undef TARGET_INVALID_UNARY_OP
27434 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
27436 #undef TARGET_INVALID_BINARY_OP
27437 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27439 #undef TARGET_VERIFY_TYPE_CONTEXT
27440 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27442 #undef TARGET_MEMORY_MOVE_COST
27443 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27445 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27446 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27448 #undef TARGET_MUST_PASS_IN_STACK
27449 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27451 /* This target hook should return true if accesses to volatile bitfields
27452 should use the narrowest mode possible. It should return false if these
27453 accesses should use the bitfield container type. */
27454 #undef TARGET_NARROW_VOLATILE_BITFIELD
27455 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27457 #undef TARGET_OPTION_OVERRIDE
27458 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27460 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27461 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27462 aarch64_override_options_after_change
27464 #undef TARGET_OFFLOAD_OPTIONS
27465 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27467 #undef TARGET_OPTION_RESTORE
27468 #define TARGET_OPTION_RESTORE aarch64_option_restore
27470 #undef TARGET_OPTION_PRINT
27471 #define TARGET_OPTION_PRINT aarch64_option_print
27473 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27474 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27476 #undef TARGET_SET_CURRENT_FUNCTION
27477 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27479 #undef TARGET_PASS_BY_REFERENCE
27480 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27482 #undef TARGET_PREFERRED_RELOAD_CLASS
27483 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27485 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27486 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27488 #undef TARGET_PROMOTED_TYPE
27489 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27491 #undef TARGET_SECONDARY_RELOAD
27492 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27494 #undef TARGET_SHIFT_TRUNCATION_MASK
27495 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27497 #undef TARGET_SETUP_INCOMING_VARARGS
27498 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27500 #undef TARGET_STRUCT_VALUE_RTX
27501 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27503 #undef TARGET_REGISTER_MOVE_COST
27504 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27506 #undef TARGET_RETURN_IN_MEMORY
27507 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27509 #undef TARGET_RETURN_IN_MSB
27510 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27512 #undef TARGET_RTX_COSTS
27513 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27515 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27516 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27518 #undef TARGET_SCHED_ISSUE_RATE
27519 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27521 #undef TARGET_SCHED_VARIABLE_ISSUE
27522 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27524 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27525 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27526 aarch64_sched_first_cycle_multipass_dfa_lookahead
27528 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27529 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27530 aarch64_first_cycle_multipass_dfa_lookahead_guard
27532 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27533 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27534 aarch64_get_separate_components
27536 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27537 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27538 aarch64_components_for_bb
27540 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27541 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27542 aarch64_disqualify_components
27544 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27545 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27546 aarch64_emit_prologue_components
27548 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27549 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27550 aarch64_emit_epilogue_components
27552 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27553 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27554 aarch64_set_handled_components
27556 #undef TARGET_TRAMPOLINE_INIT
27557 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27559 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27560 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27562 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27563 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27565 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
27566 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
27568 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
27569 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
27570 aarch64_builtin_support_vector_misalignment
27572 #undef TARGET_ARRAY_MODE
27573 #define TARGET_ARRAY_MODE aarch64_array_mode
27575 #undef TARGET_ARRAY_MODE_SUPPORTED_P
27576 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
27578 #undef TARGET_VECTORIZE_CREATE_COSTS
27579 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
27581 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
27582 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
27583 aarch64_builtin_vectorization_cost
27585 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
27586 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
27588 #undef TARGET_VECTORIZE_BUILTINS
27589 #define TARGET_VECTORIZE_BUILTINS
27591 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
27592 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
27593 aarch64_builtin_vectorized_function
27595 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
27596 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
27597 aarch64_autovectorize_vector_modes
27599 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
27600 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
27601 aarch64_atomic_assign_expand_fenv
27603 /* Section anchor support. */
27605 #undef TARGET_MIN_ANCHOR_OFFSET
27606 #define TARGET_MIN_ANCHOR_OFFSET -256
27608 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
27609 byte offset; we can do much more for larger data types, but have no way
27610 to determine the size of the access. We assume accesses are aligned. */
27611 #undef TARGET_MAX_ANCHOR_OFFSET
27612 #define TARGET_MAX_ANCHOR_OFFSET 4095
27614 #undef TARGET_VECTOR_ALIGNMENT
27615 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
27617 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
27618 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
27619 aarch64_vectorize_preferred_vector_alignment
27620 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
27621 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
27622 aarch64_simd_vector_alignment_reachable
27624 /* vec_perm support. */
27626 #undef TARGET_VECTORIZE_VEC_PERM_CONST
27627 #define TARGET_VECTORIZE_VEC_PERM_CONST \
27628 aarch64_vectorize_vec_perm_const
27630 #undef TARGET_VECTORIZE_RELATED_MODE
27631 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
27632 #undef TARGET_VECTORIZE_GET_MASK_MODE
27633 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
27634 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
27635 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
27636 aarch64_empty_mask_is_expensive
27637 #undef TARGET_PREFERRED_ELSE_VALUE
27638 #define TARGET_PREFERRED_ELSE_VALUE \
27639 aarch64_preferred_else_value
27641 #undef TARGET_INIT_LIBFUNCS
27642 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
27644 #undef TARGET_FIXED_CONDITION_CODE_REGS
27645 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
27647 #undef TARGET_FLAGS_REGNUM
27648 #define TARGET_FLAGS_REGNUM CC_REGNUM
27650 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
27651 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
27653 #undef TARGET_ASAN_SHADOW_OFFSET
27654 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
27656 #undef TARGET_LEGITIMIZE_ADDRESS
27657 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
27659 #undef TARGET_SCHED_CAN_SPECULATE_INSN
27660 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
27662 #undef TARGET_CAN_USE_DOLOOP_P
27663 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
27665 #undef TARGET_SCHED_ADJUST_PRIORITY
27666 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
27668 #undef TARGET_SCHED_MACRO_FUSION_P
27669 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
27671 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
27672 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
27674 #undef TARGET_SCHED_FUSION_PRIORITY
27675 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
27677 #undef TARGET_UNSPEC_MAY_TRAP_P
27678 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
27680 #undef TARGET_USE_PSEUDO_PIC_REG
27681 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
27683 #undef TARGET_PRINT_OPERAND
27684 #define TARGET_PRINT_OPERAND aarch64_print_operand
27686 #undef TARGET_PRINT_OPERAND_ADDRESS
27687 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
27689 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
27690 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
27692 #undef TARGET_OPTAB_SUPPORTED_P
27693 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
27695 #undef TARGET_OMIT_STRUCT_RETURN_REG
27696 #define TARGET_OMIT_STRUCT_RETURN_REG true
27698 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
27699 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
27700 aarch64_dwarf_poly_indeterminate_value
27702 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
27703 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
27704 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
27706 #undef TARGET_HARD_REGNO_NREGS
27707 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
27708 #undef TARGET_HARD_REGNO_MODE_OK
27709 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
27711 #undef TARGET_MODES_TIEABLE_P
27712 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
27714 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
27715 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
27716 aarch64_hard_regno_call_part_clobbered
27718 #undef TARGET_INSN_CALLEE_ABI
27719 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
27721 #undef TARGET_CONSTANT_ALIGNMENT
27722 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
27724 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
27725 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
27726 aarch64_stack_clash_protection_alloca_probe_range
27728 #undef TARGET_COMPUTE_PRESSURE_CLASSES
27729 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
27731 #undef TARGET_CAN_CHANGE_MODE_CLASS
27732 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
27734 #undef TARGET_SELECT_EARLY_REMAT_MODES
27735 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
27737 #undef TARGET_SPECULATION_SAFE_VALUE
27738 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
27740 #undef TARGET_ESTIMATED_POLY_VALUE
27741 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
27743 #undef TARGET_ATTRIBUTE_TABLE
27744 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
27746 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
27747 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
27748 aarch64_simd_clone_compute_vecsize_and_simdlen
27750 #undef TARGET_SIMD_CLONE_ADJUST
27751 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
27753 #undef TARGET_SIMD_CLONE_USABLE
27754 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
27756 #undef TARGET_COMP_TYPE_ATTRIBUTES
27757 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
27759 #undef TARGET_GET_MULTILIB_ABI_NAME
27760 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
27762 #undef TARGET_FNTYPE_ABI
27763 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
27765 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
27766 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
27768 #if CHECKING_P
27769 #undef TARGET_RUN_TARGET_SELFTESTS
27770 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
27771 #endif /* #if CHECKING_P */
27773 #undef TARGET_ASM_POST_CFI_STARTPROC
27774 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
27776 #undef TARGET_STRICT_ARGUMENT_NAMING
27777 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
27779 #undef TARGET_MD_ASM_ADJUST
27780 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
27782 #undef TARGET_ASM_FILE_END
27783 #define TARGET_ASM_FILE_END aarch64_asm_file_end
27785 #undef TARGET_ASM_FUNCTION_EPILOGUE
27786 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
27788 #undef TARGET_HAVE_SHADOW_CALL_STACK
27789 #define TARGET_HAVE_SHADOW_CALL_STACK true
27791 struct gcc_target targetm = TARGET_INITIALIZER;
27793 #include "gt-aarch64.h"